mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-04-28 11:30:15 +00:00
483 lines
13 KiB
Go
483 lines
13 KiB
Go
package monitoring
|
|
|
|
import (
|
|
"fmt"
|
|
"sort"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/models"
|
|
"github.com/rs/zerolog/log"
|
|
)
|
|
|
|
// Thresholds used when deciding whether identical VM memory readings form a
// suspicious pattern.
const (
	// minRunningVMsForRepeatedMemoryCheck is the number of counted running VMs
	// below which the detectors never report a pattern.
	minRunningVMsForRepeatedMemoryCheck = 4

	// minRepeatedVMsForSuspicion is the minimum number of VMs that must share
	// the repeated reading.
	minRepeatedVMsForSuspicion = 3

	// minRepeatedMemoryShare is the minimum fraction of counted running VMs
	// that must share the repeated reading.
	minRepeatedMemoryShare = 0.60

	// maxRepeatedMemorySampleNames caps how many VM names are collected as
	// samples for a repeated reading.
	maxRepeatedMemorySampleNames = 5
)
|
|
|
|
// repeatedVMMemoryUsage is the outcome of one repeated-memory detection pass
// over a set of VMs.
type repeatedVMMemoryUsage struct {
	// suspicious is true only when a detector's thresholds were met; the
	// detectors leave every field except runningCount at its zero value
	// otherwise.
	suspicious bool

	// signature is a compact string built from the repeated value and/or count
	// plus the source breakdown; it is compared between polls to suppress
	// duplicate warnings. pattern names the kind of repetition that was found.
	signature, pattern string

	// runningCount is the number of running QEMU VMs the detector counted;
	// repeatedCount is how many of them showed the repeated reading.
	runningCount, repeatedCount int

	// repeatedMemUsed is the shared memory-used value for exact-value patterns.
	repeatedMemUsed int64

	// sourceBreakdown lists "source:count" entries for the repeated VMs.
	sourceBreakdown []string

	// sampleVMNames holds a sorted, capped sample of affected VM names.
	sampleVMNames []string
}
|
|
|
|
// repeatedMemoryGroup accumulates the VMs that share one memory-used value.
type repeatedMemoryGroup struct {
	// count is the number of VMs placed in this group.
	count int

	// sources tallies VMs in the group per memory-source label.
	sources map[string]int

	// names collects a capped sample of VM names (or IDs when the name is blank).
	names []string
}
|
|
|
|
func filterVMsByInstance(vms []models.VM, instanceName string) []models.VM {
|
|
filtered := make([]models.VM, 0, len(vms))
|
|
for _, vm := range vms {
|
|
if vm.Instance == instanceName {
|
|
filtered = append(filtered, vm)
|
|
}
|
|
}
|
|
return filtered
|
|
}
|
|
|
|
func detectRepeatedVMMemoryUsage(vms []models.VM) repeatedVMMemoryUsage {
|
|
groups := make(map[int64]*repeatedMemoryGroup)
|
|
runningCount := 0
|
|
|
|
for _, vm := range vms {
|
|
if vm.Type != "qemu" || vm.Status != "running" || vm.Memory.Total <= 0 || vm.Memory.Used <= 0 {
|
|
continue
|
|
}
|
|
runningCount++
|
|
|
|
group, ok := groups[vm.Memory.Used]
|
|
if !ok {
|
|
group = &repeatedMemoryGroup{
|
|
sources: make(map[string]int),
|
|
}
|
|
groups[vm.Memory.Used] = group
|
|
}
|
|
group.count++
|
|
source := strings.TrimSpace(vm.MemorySource)
|
|
if source == "" {
|
|
source = "unknown"
|
|
}
|
|
group.sources[source]++
|
|
if len(group.names) < maxRepeatedMemorySampleNames {
|
|
name := strings.TrimSpace(vm.Name)
|
|
if name == "" {
|
|
name = vm.ID
|
|
}
|
|
group.names = append(group.names, name)
|
|
}
|
|
}
|
|
|
|
if runningCount < minRunningVMsForRepeatedMemoryCheck {
|
|
return repeatedVMMemoryUsage{runningCount: runningCount}
|
|
}
|
|
|
|
var topUsed int64
|
|
var topGroup *repeatedMemoryGroup
|
|
for used, group := range groups {
|
|
if topGroup == nil || group.count > topGroup.count {
|
|
topUsed = used
|
|
topGroup = group
|
|
}
|
|
}
|
|
if topGroup == nil {
|
|
return repeatedVMMemoryUsage{runningCount: runningCount}
|
|
}
|
|
|
|
share := float64(topGroup.count) / float64(runningCount)
|
|
if topGroup.count < minRepeatedVMsForSuspicion || share < minRepeatedMemoryShare {
|
|
return repeatedVMMemoryUsage{runningCount: runningCount}
|
|
}
|
|
|
|
breakdown := formatMemorySourceBreakdown(topGroup.sources)
|
|
sampleNames := append([]string(nil), topGroup.names...)
|
|
sort.Strings(sampleNames)
|
|
|
|
return repeatedVMMemoryUsage{
|
|
suspicious: true,
|
|
signature: fmt.Sprintf("%d:%d:%s", topUsed, topGroup.count, strings.Join(breakdown, ",")),
|
|
pattern: "exact-used",
|
|
runningCount: runningCount,
|
|
repeatedCount: topGroup.count,
|
|
repeatedMemUsed: topUsed,
|
|
sourceBreakdown: breakdown,
|
|
sampleVMNames: sampleNames,
|
|
}
|
|
}
|
|
|
|
// formatMemorySourceBreakdown renders counts as "source:count" strings ordered
// by descending count, with equal counts ordered by ascending source name so
// the output does not depend on map iteration order. The result is a non-nil
// slice even when counts is empty.
func formatMemorySourceBreakdown(counts map[string]int) []string {
	type tally struct {
		source string
		count  int
	}

	tallies := make([]tally, 0, len(counts))
	for name, n := range counts {
		tallies = append(tallies, tally{source: name, count: n})
	}

	sort.Slice(tallies, func(a, b int) bool {
		left, right := tallies[a], tallies[b]
		if left.count != right.count {
			return left.count > right.count
		}
		return left.source < right.source
	})

	formatted := make([]string, 0, len(tallies))
	for i := range tallies {
		formatted = append(formatted, fmt.Sprintf("%s:%d", tallies[i].source, tallies[i].count))
	}
	return formatted
}
|
|
|
|
func detectRepeatedLowTrustFullUsage(vms []models.VM, snapshots []GuestMemorySnapshot) repeatedVMMemoryUsage {
|
|
if len(vms) == 0 {
|
|
return repeatedVMMemoryUsage{}
|
|
}
|
|
hasSnapshots := len(vms) == len(snapshots)
|
|
|
|
runningCount := 0
|
|
repeatedCount := 0
|
|
sourceCounts := make(map[string]int)
|
|
sampleNames := make([]string, 0, maxRepeatedMemorySampleNames)
|
|
|
|
for i := range vms {
|
|
vm := vms[i]
|
|
if vm.Type != "qemu" || vm.Status != "running" || vm.Memory.Total <= 0 {
|
|
continue
|
|
}
|
|
runningCount++
|
|
|
|
source := strings.TrimSpace(vm.MemorySource)
|
|
if hasSnapshots {
|
|
used, candidateSource, ok := lowTrustMemoryCandidate(vm, snapshots[i])
|
|
if !ok || used <= 0 {
|
|
continue
|
|
}
|
|
source = candidateSource
|
|
} else if vmMemorySourceReliability(source) > vmMemorySourceReliabilityFallback {
|
|
continue
|
|
}
|
|
|
|
if vm.Memory.Usage < 99 || vmMemorySourceReliability(source) > vmMemorySourceReliabilityFallback {
|
|
continue
|
|
}
|
|
|
|
repeatedCount++
|
|
sourceCounts[source]++
|
|
if len(sampleNames) < maxRepeatedMemorySampleNames {
|
|
name := strings.TrimSpace(vm.Name)
|
|
if name == "" {
|
|
name = vm.ID
|
|
}
|
|
sampleNames = append(sampleNames, name)
|
|
}
|
|
}
|
|
|
|
if runningCount < minRunningVMsForRepeatedMemoryCheck {
|
|
return repeatedVMMemoryUsage{runningCount: runningCount}
|
|
}
|
|
|
|
share := float64(repeatedCount) / float64(runningCount)
|
|
if repeatedCount < minRepeatedVMsForSuspicion || share < minRepeatedMemoryShare {
|
|
return repeatedVMMemoryUsage{runningCount: runningCount}
|
|
}
|
|
|
|
breakdown := formatMemorySourceBreakdown(sourceCounts)
|
|
sort.Strings(sampleNames)
|
|
|
|
return repeatedVMMemoryUsage{
|
|
suspicious: true,
|
|
signature: fmt.Sprintf("full-usage:%d:%s", repeatedCount, strings.Join(breakdown, ",")),
|
|
pattern: "low-trust-full-usage",
|
|
runningCount: runningCount,
|
|
repeatedCount: repeatedCount,
|
|
sourceBreakdown: breakdown,
|
|
sampleVMNames: sampleNames,
|
|
}
|
|
}
|
|
|
|
func (m *Monitor) logSuspiciousRepeatedVMMemoryUsage(instanceName string, currentVMs []models.VM, previousVMs []models.VM) {
|
|
current := detectRepeatedVMMemoryUsage(currentVMs)
|
|
if !current.suspicious {
|
|
current = detectRepeatedLowTrustFullUsage(currentVMs, nil)
|
|
}
|
|
if !current.suspicious {
|
|
return
|
|
}
|
|
|
|
previous := detectRepeatedVMMemoryUsage(previousVMs)
|
|
if previous.suspicious && previous.signature == current.signature {
|
|
return
|
|
}
|
|
|
|
log.Warn().
|
|
Str("instance", instanceName).
|
|
Str("pattern", current.pattern).
|
|
Int("runningVMs", current.runningCount).
|
|
Int("repeatedVMs", current.repeatedCount).
|
|
Int64("repeatedMemoryUsedBytes", current.repeatedMemUsed).
|
|
Strs("memorySources", current.sourceBreakdown).
|
|
Strs("sampleVMs", current.sampleVMNames).
|
|
Msg("Suspicious repeated VM memory-used pattern detected")
|
|
}
|
|
|
|
func stabilizeSuspiciousRepeatedVMMemory(vms []models.VM, alertVMs []models.VM, snapshots []GuestMemorySnapshot, previousVMs []models.VM, now time.Time) int {
|
|
current := detectRepeatedLowTrustVMMemoryUsage(vms, snapshots)
|
|
if !current.suspicious {
|
|
current = detectRepeatedVMMemoryUsage(vms)
|
|
}
|
|
if !current.suspicious {
|
|
current = detectRepeatedLowTrustFullUsage(vms, snapshots)
|
|
}
|
|
if !current.suspicious {
|
|
return 0
|
|
}
|
|
|
|
prevByID := make(map[string]models.VM, len(previousVMs))
|
|
for _, prev := range previousVMs {
|
|
prevByID[prev.ID] = prev
|
|
}
|
|
|
|
applied := 0
|
|
for i := range vms {
|
|
vm := &vms[i]
|
|
if vm.Type != "qemu" || vm.Status != "running" || vm.Memory.Total <= 0 {
|
|
continue
|
|
}
|
|
if vmMemorySourceReliability(vm.MemorySource) > vmMemorySourceReliabilityFallback {
|
|
continue
|
|
}
|
|
switch current.pattern {
|
|
case "low-trust-full-usage":
|
|
if vm.Memory.Usage < 99 {
|
|
continue
|
|
}
|
|
default:
|
|
if vm.Memory.Used != current.repeatedMemUsed {
|
|
continue
|
|
}
|
|
}
|
|
|
|
prev, ok := prevByID[vm.ID]
|
|
if !ok || !shouldStabilizeSuspiciousRepeatedVMMemory(prev, *vm, now) {
|
|
continue
|
|
}
|
|
|
|
currentBalloon := vm.Memory.Balloon
|
|
vm.Memory = prev.Memory
|
|
if currentBalloon > 0 {
|
|
vm.Memory.Balloon = currentBalloon
|
|
}
|
|
vm.MemorySource = "previous-snapshot"
|
|
|
|
if len(alertVMs) == len(vms) {
|
|
alertVMs[i].Memory = vm.Memory
|
|
alertVMs[i].MemorySource = vm.MemorySource
|
|
}
|
|
|
|
if len(snapshots) == len(vms) {
|
|
snapshots[i].Memory = vm.Memory
|
|
snapshots[i].MemorySource = vm.MemorySource
|
|
snapshots[i].Notes = appendSnapshotNote(snapshots[i].Notes, "preserved-previous-memory-after-repeated-low-trust-pattern")
|
|
}
|
|
|
|
applied++
|
|
}
|
|
|
|
if applied == 0 {
|
|
return 0
|
|
}
|
|
|
|
log.Warn().
|
|
Str("pattern", current.pattern).
|
|
Int("runningVMs", current.runningCount).
|
|
Int("repeatedVMs", current.repeatedCount).
|
|
Int("stabilizedVMs", applied).
|
|
Int64("repeatedMemoryUsedBytes", current.repeatedMemUsed).
|
|
Strs("memorySources", current.sourceBreakdown).
|
|
Strs("sampleVMs", current.sampleVMNames).
|
|
Msg("Stabilized suspicious repeated VM memory-used pattern with previous readings")
|
|
|
|
return applied
|
|
}
|
|
|
|
func detectRepeatedLowTrustVMMemoryUsage(vms []models.VM, snapshots []GuestMemorySnapshot) repeatedVMMemoryUsage {
|
|
if len(vms) == 0 || len(vms) != len(snapshots) {
|
|
return repeatedVMMemoryUsage{}
|
|
}
|
|
|
|
groups := make(map[int64]*repeatedMemoryGroup)
|
|
runningCount := 0
|
|
|
|
for i := range vms {
|
|
vm := vms[i]
|
|
if vm.Type != "qemu" || vm.Status != "running" || vm.Memory.Total <= 0 {
|
|
continue
|
|
}
|
|
runningCount++
|
|
|
|
used, source, ok := lowTrustMemoryCandidate(vm, snapshots[i])
|
|
if !ok || used <= 0 {
|
|
continue
|
|
}
|
|
|
|
group, exists := groups[used]
|
|
if !exists {
|
|
group = &repeatedMemoryGroup{sources: make(map[string]int)}
|
|
groups[used] = group
|
|
}
|
|
group.count++
|
|
group.sources[source]++
|
|
if len(group.names) < maxRepeatedMemorySampleNames {
|
|
name := strings.TrimSpace(vm.Name)
|
|
if name == "" {
|
|
name = vm.ID
|
|
}
|
|
group.names = append(group.names, name)
|
|
}
|
|
}
|
|
|
|
if runningCount < minRunningVMsForRepeatedMemoryCheck {
|
|
return repeatedVMMemoryUsage{runningCount: runningCount}
|
|
}
|
|
|
|
var topUsed int64
|
|
var topGroup *repeatedMemoryGroup
|
|
for used, group := range groups {
|
|
if topGroup == nil || group.count > topGroup.count {
|
|
topUsed = used
|
|
topGroup = group
|
|
}
|
|
}
|
|
if topGroup == nil {
|
|
return repeatedVMMemoryUsage{runningCount: runningCount}
|
|
}
|
|
|
|
share := float64(topGroup.count) / float64(runningCount)
|
|
if topGroup.count < minRepeatedVMsForSuspicion || share < minRepeatedMemoryShare {
|
|
return repeatedVMMemoryUsage{runningCount: runningCount}
|
|
}
|
|
|
|
breakdown := formatMemorySourceBreakdown(topGroup.sources)
|
|
sampleNames := append([]string(nil), topGroup.names...)
|
|
sort.Strings(sampleNames)
|
|
|
|
return repeatedVMMemoryUsage{
|
|
suspicious: true,
|
|
signature: fmt.Sprintf("%d:%d:%s", topUsed, topGroup.count, strings.Join(breakdown, ",")),
|
|
runningCount: runningCount,
|
|
repeatedCount: topGroup.count,
|
|
repeatedMemUsed: topUsed,
|
|
sourceBreakdown: breakdown,
|
|
sampleVMNames: sampleNames,
|
|
}
|
|
}
|
|
|
|
func lowTrustMemoryCandidate(vm models.VM, snapshot GuestMemorySnapshot) (int64, string, bool) {
|
|
source := strings.TrimSpace(snapshot.MemorySource)
|
|
if source == "" {
|
|
source = strings.TrimSpace(vm.MemorySource)
|
|
}
|
|
|
|
switch {
|
|
case source == "status-mem":
|
|
return clampLowTrustMemoryCandidate(vm.Memory.Total, snapshot.Raw.StatusMem, "status-mem")
|
|
case source == "status-freemem":
|
|
if snapshot.Raw.StatusFreeMem == 0 || uint64(vm.Memory.Total) < snapshot.Raw.StatusFreeMem {
|
|
return 0, "", false
|
|
}
|
|
return int64(uint64(vm.Memory.Total) - snapshot.Raw.StatusFreeMem), "status-freemem", true
|
|
case source == "listing-mem" || source == "cluster-resources":
|
|
return clampLowTrustMemoryCandidate(vm.Memory.Total, snapshot.Raw.ListingMem, source)
|
|
case source == "meminfo-total-minus-used":
|
|
if snapshot.Raw.MemInfoTotalMinusUsed == 0 || uint64(vm.Memory.Total) < snapshot.Raw.MemInfoTotalMinusUsed {
|
|
return 0, "", false
|
|
}
|
|
return int64(uint64(vm.Memory.Total) - snapshot.Raw.MemInfoTotalMinusUsed), "meminfo-total-minus-used", true
|
|
case source == "previous-snapshot":
|
|
if !snapshotHasNote(snapshot.Notes, "preserved-previous-memory-after-low-trust-fallback") &&
|
|
!snapshotHasNote(snapshot.Notes, "preserved-previous-memory-after-repeated-low-trust-pattern") {
|
|
return 0, "", false
|
|
}
|
|
switch {
|
|
case snapshot.Raw.StatusMem > 0:
|
|
return clampLowTrustMemoryCandidate(vm.Memory.Total, snapshot.Raw.StatusMem, "status-mem")
|
|
case snapshot.Raw.MemInfoTotalMinusUsed > 0 && uint64(vm.Memory.Total) >= snapshot.Raw.MemInfoTotalMinusUsed:
|
|
return int64(uint64(vm.Memory.Total) - snapshot.Raw.MemInfoTotalMinusUsed), "meminfo-total-minus-used", true
|
|
case snapshot.Raw.ListingMem > 0:
|
|
return clampLowTrustMemoryCandidate(vm.Memory.Total, snapshot.Raw.ListingMem, "listing-mem")
|
|
}
|
|
}
|
|
|
|
return 0, "", false
|
|
}
|
|
|
|
// clampLowTrustMemoryCandidate validates a raw used-memory reading against
// total. It returns (0, "", false) when total is not positive or used is zero;
// otherwise it returns used capped at total, together with source and true.
func clampLowTrustMemoryCandidate(total int64, used uint64, source string) (int64, string, bool) {
	if used == 0 || total <= 0 {
		return 0, "", false
	}
	if limit := uint64(total); used <= limit {
		return int64(used), source, true
	}
	// The reading exceeds the VM's total memory: report the total instead.
	return total, source, true
}
|
|
|
|
// snapshotHasNote reports whether notes contains an entry exactly equal to
// target.
func snapshotHasNote(notes []string, target string) bool {
	for i := 0; i < len(notes); i++ {
		if notes[i] == target {
			return true
		}
	}
	return false
}
|
|
|
|
func shouldStabilizeSuspiciousRepeatedVMMemory(prev, current models.VM, now time.Time) bool {
|
|
if current.Type != "qemu" || current.Status != "running" || prev.Type != "qemu" || prev.Status != "running" {
|
|
return false
|
|
}
|
|
if prev.Memory.Total <= 0 || prev.Memory.Used < 0 {
|
|
return false
|
|
}
|
|
if prev.LastSeen.IsZero() || now.Sub(prev.LastSeen) > vmMemoryCarryForwardMaxAge {
|
|
return false
|
|
}
|
|
if current.Memory.Total > 0 && prev.Memory.Total > 0 && prev.Memory.Total != current.Memory.Total {
|
|
return false
|
|
}
|
|
|
|
prevReliability := vmMemorySourceReliability(prev.MemorySource)
|
|
if prev.MemorySource != "previous-snapshot" && prevReliability < vmMemorySourceReliabilityTrusted {
|
|
return false
|
|
}
|
|
if vmMemorySourceReliability(current.MemorySource) > vmMemorySourceReliabilityFallback {
|
|
return false
|
|
}
|
|
if prev.Memory.Usage > 0 && absFloat64(prev.Memory.Usage-current.Memory.Usage) < vmMemoryCarryForwardMinUsageDelta {
|
|
return false
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
// appendSnapshotNote returns notes with note appended, unless an identical
// entry is already present, in which case notes is returned unchanged.
func appendSnapshotNote(notes []string, note string) []string {
	for i := range notes {
		if notes[i] == note {
			return notes
		}
	}
	return append(notes, note)
}
|
|
|
|
// absFloat64 returns v with its sign flipped when v is negative, and v itself
// otherwise.
func absFloat64(v float64) float64 {
	switch {
	case v < 0:
		return -v
	default:
		return v
	}
}
|