// Pulse/internal/monitoring/proxmox_vm_cluster_resource.go

package monitoring

import (
	"context"
	"fmt"
	"strings"
	"time"

	"github.com/rcourtman/pulse-go-rewrite/internal/logging"
	"github.com/rcourtman/pulse-go-rewrite/internal/models"
	"github.com/rcourtman/pulse-go-rewrite/pkg/fsfilters"
	"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
	"github.com/rs/zerolog"
	"github.com/rs/zerolog/log"
)
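
// indexedClusterResource pairs a cluster resource with its position in the
// original listing so results produced out of order can be re-sorted.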
type indexedClusterResource struct {
	order    int
	resource proxmox.ClusterResource
}
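
// efficientQEMUPollResult is a single VM's polling result: the VM model, its
// alert-ready copy, the memory snapshot, and the original listing index.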
type efficientQEMUPollResult struct {
	order    int
	vm       models.VM
	alertVM  models.VM
	snapshot GuestMemorySnapshot
	ok       bool
}
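
// nextGuestAgentPollOffset returns the current rotation offset for an
// instance and advances the stored cursor by one (mod count), so successive
// polling cycles start guest-agent work at a different VM. A nil monitor or
// a list of length <= 1 always yields offset 0.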
func (m *Monitor) nextGuestAgentPollOffset(instanceName string, count int) int {
	if m == nil || count <= 1 {
		return 0
	}
	m.guestAgentPollOrderMu.Lock()
	defer m.guestAgentPollOrderMu.Unlock()
	if m.guestAgentPollCursor == nil {
		m.guestAgentPollCursor = make(map[string]int)
	}
	offset := m.guestAgentPollCursor[instanceName] % count
	m.guestAgentPollCursor[instanceName] = (offset + 1) % count
	return offset
}
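
// rotateIndexedClusterResources returns a copy of resources rotated left by
// offset. Negative offsets wrap around, and the input slice is never mutated.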
func rotateIndexedClusterResources(resources []indexedClusterResource, offset int) []indexedClusterResource {
	count := len(resources)
	if count <= 1 {
		return append([]indexedClusterResource(nil), resources...)
	}
	offset %= count
	if offset < 0 {
		offset += count
	}
	if offset == 0 {
		return append([]indexedClusterResource(nil), resources...)
	}
	rotated := make([]indexedClusterResource, 0, count)
	rotated = append(rotated, resources[offset:]...)
	rotated = append(rotated, resources[:offset]...)
	return rotated
}
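
// efficientQEMUWorkerCount bounds the poll worker count for a batch: at most
// one worker per VM, capped by the capacity of guestAgentWorkSlots when set.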
func (m *Monitor) efficientQEMUWorkerCount(total int) int {
	if total <= 0 {
		return 0
	}
	if m == nil || m.guestAgentWorkSlots == nil {
		return total
	}
	workers := cap(m.guestAgentWorkSlots)
	if workers <= 0 {
		return total
	}
	if workers > total {
		return total
	}
	return workers
}
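
// pollEfficientQEMUResource builds the VM model for one cluster resource,
// layering better data sources over the cheap /cluster/resources listing:
// detailed status, guest-agent meminfo and filesystem info, RRD metrics, a
// linked Pulse host agent, and finally values carried forward from the
// previous poll. It returns the VM, an alert-ready copy, and a memory
// snapshot describing how the figures were derived.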
func (m *Monitor) pollEfficientQEMUResource(
	ctx context.Context,
	instanceName string,
	res proxmox.ClusterResource,
	client PVEClientInterface,
	vmIDToHostAgent map[string]models.Host,
	prevDiskByGuestID map[string]models.Disk,
	prevVMByGuestID map[string]models.VM,
) (models.VM, models.VM, GuestMemorySnapshot, bool) {
	guestID := makeGuestID(instanceName, res.Node, res.VMID)
	diskReadBytes := int64(res.DiskRead)
	diskWriteBytes := int64(res.DiskWrite)
	networkInBytes := int64(res.NetIn)
	networkOutBytes := int64(res.NetOut)
	var individualDisks []models.Disk
	diskFromAgent := false
	diskStatusReason := ""
	var ipAddresses []string
	var networkInterfaces []models.GuestNetworkInterface
	var osName, osVersion, agentVersion string
	var prevVM *models.VM
	if prev, ok := prevVMByGuestID[guestID]; ok {
		prevVM = &prev
	}
	guestAgentAvailable := false
	memTotal := res.MaxMem
	memUsed := res.Mem
	memorySource := "cluster-resources"
	guestRaw := VMMemoryRaw{
		ListingMem:    res.Mem,
		ListingMaxMem: res.MaxMem,
	}
	var detailedStatus *proxmox.VMStatus
	memAvailable := uint64(0)
	memRawFree := uint64(0)
	memInfoTotalMinusUsed := uint64(0)
	rrdUsed := uint64(0)
	diskUsed := res.Disk
	diskTotal := res.MaxDisk
	diskFree := diskTotal - diskUsed
	diskUsage := safePercentage(float64(diskUsed), float64(diskTotal))
	if diskUsed == 0 && diskTotal > 0 && res.Status == "running" {
		diskUsage = -1
		diskStatusReason = "no-data"
	}
	if res.Status != "running" && diskTotal > 0 {
		diskUsage = -1
		diskStatusReason = "vm-stopped"
	}
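
	// For running VMs, fetch the detailed status endpoint (2s budget): it
	// carries balloon and meminfo fields, fresher I/O counters, and the
	// agent-enabled flag used below to decide whether guest-agent queries
	// are worth attempting.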
	if res.Status == "running" {
		statusCtx, cancel := context.WithTimeout(ctx, 2*time.Second)
		status, err := client.GetVMStatus(statusCtx, res.Node, res.VMID)
		cancel()
		if err != nil {
			log.Debug().
				Err(err).
				Str("instance", instanceName).
				Str("vm", res.Name).
				Int("vmid", res.VMID).
				Msg("Could not get VM status to check guest agent availability")
		} else if status != nil {
			detailedStatus = status
			guestRaw.StatusMaxMem = detailedStatus.MaxMem
			guestRaw.StatusMem = detailedStatus.Mem
			guestRaw.StatusFreeMem = detailedStatus.FreeMem
			guestRaw.Balloon = detailedStatus.Balloon
			guestRaw.BalloonMin = detailedStatus.BalloonMin
			guestRaw.Agent = detailedStatus.Agent.Value
			if detailedStatus.MemInfo != nil {
				guestRaw.MemInfoUsed = detailedStatus.MemInfo.Used
				guestRaw.MemInfoFree = detailedStatus.MemInfo.Free
				guestRaw.MemInfoTotal = detailedStatus.MemInfo.Total
				guestRaw.MemInfoAvailable = detailedStatus.MemInfo.Available
				guestRaw.MemInfoBuffers = detailedStatus.MemInfo.Buffers
				guestRaw.MemInfoCached = detailedStatus.MemInfo.Cached
				guestRaw.MemInfoShared = detailedStatus.MemInfo.Shared
				if detailedStatus.MemInfo.Free > 0 {
					memRawFree = detailedStatus.MemInfo.Free
				}
				selection := selectVMAvailableFromMemInfo(detailedStatus.MemInfo)
				memInfoTotalMinusUsed = selection.TotalMinusUsed
				guestRaw.MemInfoTotalMinusUsed = memInfoTotalMinusUsed
				if selection.Available > 0 {
					memAvailable = selection.Available
					memorySource = selection.Source
				}
			}
			diskReadBytes = int64(detailedStatus.DiskRead)
			diskWriteBytes = int64(detailedStatus.DiskWrite)
			networkInBytes = int64(detailedStatus.NetIn)
			networkOutBytes = int64(detailedStatus.NetOut)
			if detailedStatus.MaxMem > 0 {
				memTotal = detailedStatus.MaxMem
			}
		} else {
			log.Debug().
				Str("instance", instanceName).
				Str("vm", res.Name).
				Int("vmid", res.VMID).
				Msg("Could not get VM status, using cluster/resources disk data")
			if diskTotal > 0 {
				diskUsage = -1
				diskStatusReason = "no-status"
			}
		}
	}
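
	// Memory fallback chain: guest-agent /proc/meminfo first (it excludes
	// reclaimable cache), then RRD metrics, then a linked host agent. Each
	// stage runs only while memAvailable is still zero.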
	guestAgentAvailable = res.Status == "running" && shouldQueryGuestAgent(detailedStatus, prevVM, time.Now())
	if guestAgentAvailable && memAvailable == 0 {
		m.runGuestAgentVMWork(ctx, instanceName, res.Node, res.Name, res.VMID, func(agentCtx context.Context) {
			if agentAvail, agentErr := m.getVMAgentMemAvailable(agentCtx, client, instanceName, res.Node, res.VMID); agentErr == nil && agentAvail > 0 {
				memAvailable = agentAvail
				memorySource = "guest-agent-meminfo"
				guestRaw.MemInfoAvailable = memAvailable
				log.Debug().
					Str("vm", res.Name).
					Str("node", res.Node).
					Int("vmid", res.VMID).
					Uint64("total", memTotal).
					Uint64("available", memAvailable).
					Msg("QEMU memory: using guest agent /proc/meminfo (excludes reclaimable cache)")
			}
		})
	}
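
	// RRD fallback: prefer an explicit available-memory series; otherwise
	// remember RRD's used-memory value for the reconciliation switch below.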
	if res.Status == "running" && memAvailable == 0 {
		if rrdEntry, rrdErr := m.getVMRRDMetrics(ctx, client, instanceName, res.Node, res.VMID); rrdErr == nil {
			if rrdEntry.total > 0 {
				memTotal = rrdEntry.total
			}
			if rrdEntry.available > 0 {
				memAvailable = rrdEntry.available
				memorySource = "rrd-memavailable"
				guestRaw.RRDAvailable = rrdEntry.available
				guestRaw.MemInfoAvailable = rrdEntry.available
			} else if rrdEntry.used > 0 {
				rrdUsed = rrdEntry.used
				memorySource = "rrd-memused"
				guestRaw.RRDUsed = rrdEntry.used
			}
		} else {
			log.Debug().
				Err(rrdErr).
				Str("instance", instanceName).
				Str("vm", res.Name).
				Int("vmid", res.VMID).
				Msg("RRD memory data unavailable for VM, using status/cluster resources values")
		}
	}
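
	// Host-agent fallback: a Pulse host agent mapped to this guest ID can
	// supply memory totals when neither the guest agent nor RRD did.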
	if res.Status == "running" && memAvailable == 0 {
		if agentHost, ok := vmIDToHostAgent[guestID]; ok && agentHost.Memory.Total > 0 {
			agentAvailable := agentHost.Memory.Total - agentHost.Memory.Used
			if agentAvailable > 0 {
				memAvailable = uint64(agentAvailable)
				memorySource = "host-agent"
				guestRaw.HostAgentTotal = uint64(agentHost.Memory.Total)
				guestRaw.HostAgentUsed = uint64(agentHost.Memory.Used)
			}
		}
	}
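
	// Disk usage via guest-agent filesystem info: skip pseudo/read-only
	// filesystems and deduplicate by device (or by size for btrfs/ZFS, whose
	// shared pools appear under multiple mountpoints).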
	if guestAgentAvailable {
		m.runGuestAgentFSInfoWork(ctx, instanceName, res.Node, res.Name, res.VMID, func(agentCtx context.Context) {
			fsInfoRaw, err := m.retryGuestAgentCall(agentCtx, m.guestAgentFSInfoTimeout, m.guestAgentRetries, func(ctx context.Context) (interface{}, error) {
				return client.GetVMFSInfo(ctx, res.Node, res.VMID)
			})
			var fsInfo []proxmox.VMFileSystem
			if err == nil {
				if fs, ok := fsInfoRaw.([]proxmox.VMFileSystem); ok {
					fsInfo = fs
				}
			}
			if err != nil {
				diskStatusReason = classifyGuestAgentDiskStatusError(err)
				if diskTotal > 0 {
					diskUsage = -1
				}
				errMsg := err.Error()
				switch {
				case strings.Contains(errMsg, "500") || strings.Contains(errMsg, "QEMU guest agent is not running"):
					log.Info().Str("instance", instanceName).Str("vm", res.Name).Int("vmid", res.VMID).Msg("Guest agent enabled in VM config but not running inside guest OS. Install and start qemu-guest-agent in the VM")
				case strings.Contains(errMsg, "timeout") || strings.Contains(errMsg, "deadline exceeded"):
					log.Info().Str("instance", instanceName).Str("vm", res.Name).Int("vmid", res.VMID).Msg("Guest agent timeout - agent may be installed but not responding")
				case strings.Contains(errMsg, "403") || strings.Contains(errMsg, "401") || strings.Contains(errMsg, "authentication error"):
					log.Info().Str("instance", instanceName).Str("vm", res.Name).Int("vmid", res.VMID).Msg("VM disk monitoring permission denied. Check guest-agent permissions for this node")
				default:
					log.Debug().Err(err).Str("instance", instanceName).Str("vm", res.Name).Int("vmid", res.VMID).Msg("Failed to get filesystem info from guest agent")
				}
				return
			}
			if len(fsInfo) == 0 {
				diskStatusReason = "no-filesystems"
				if diskTotal > 0 {
					diskUsage = -1
				}
				log.Info().Str("instance", instanceName).Str("vm", res.Name).Int("vmid", res.VMID).Msg("Guest agent returned no filesystem info - agent may need restart or VM may have no mounted filesystems")
				return
			}
			var totalBytes, usedBytes uint64
			var skippedFS []string
			var includedFS []string
			seenFilesystems := make(map[string]bool)
			for _, fs := range fsInfo {
				shouldSkip, reasons := fsfilters.ShouldSkipFilesystem(fs.Type, fs.Mountpoint, fs.TotalBytes, fs.UsedBytes)
				if shouldSkip {
					skippedFS = append(skippedFS, fmt.Sprintf("%s(%s,%s)", fs.Mountpoint, fs.Type, strings.Join(reasons, ",")))
					continue
				}
				if reason, skip := readOnlyFilesystemReason(fs.Type, fs.TotalBytes, fs.UsedBytes); skip {
					skippedFS = append(skippedFS, fmt.Sprintf("%s(%s,%s)", fs.Mountpoint, fs.Type, reason))
					continue
				}
				if fs.TotalBytes <= 0 {
					if len(fs.Mountpoint) > 0 {
						skippedFS = append(skippedFS, fmt.Sprintf("%s(%s,0GB)", fs.Mountpoint, fs.Type))
					}
					continue
				}
				fsTypeLower := strings.ToLower(fs.Type)
				countThisFS := true
				if fs.Disk != "" {
					dedupeKey := fmt.Sprintf("%s:%d", fs.Disk, fs.TotalBytes)
					if seenFilesystems[dedupeKey] {
						countThisFS = false
					} else {
						seenFilesystems[dedupeKey] = true
					}
				} else if fsTypeLower == "btrfs" || fsTypeLower == "zfs" || strings.HasPrefix(fsTypeLower, "zfs") {
					dedupeKey := fmt.Sprintf("%s::%d", fsTypeLower, fs.TotalBytes)
					if seenFilesystems[dedupeKey] {
						countThisFS = false
					} else {
						seenFilesystems[dedupeKey] = true
					}
				}
				if countThisFS {
					totalBytes += fs.TotalBytes
					usedBytes += fs.UsedBytes
				}
				includedFS = append(includedFS, fmt.Sprintf("%s(%s,%.1fGB)", fs.Mountpoint, fs.Type, float64(fs.TotalBytes)/1073741824))
				individualDisks = append(individualDisks, models.Disk{
					Total:      int64(fs.TotalBytes),
					Used:       int64(fs.UsedBytes),
					Free:       int64(fs.TotalBytes - fs.UsedBytes),
					Usage:      safePercentage(float64(fs.UsedBytes), float64(fs.TotalBytes)),
					Mountpoint: fs.Mountpoint,
					Type:       fs.Type,
					Device:     fs.Disk,
				})
			}
			if len(skippedFS) > 0 {
				log.Debug().Str("instance", instanceName).Str("vm", res.Name).Strs("skipped", skippedFS).Msg("Skipped special filesystems")
			}
			if len(includedFS) > 0 {
				log.Debug().Str("instance", instanceName).Str("vm", res.Name).Int("vmid", res.VMID).Strs("included", includedFS).Msg("Filesystems included in disk calculation")
			}
			if totalBytes > 0 {
				diskTotal = totalBytes
				diskUsed = usedBytes
				diskFree = totalBytes - usedBytes
				diskUsage = safePercentage(float64(usedBytes), float64(totalBytes))
				diskFromAgent = true
				diskStatusReason = ""
			} else if diskTotal > 0 {
				diskUsage = -1
				diskStatusReason = "special-filesystems-only"
			}
		})
	} else if res.Status == "running" && diskTotal > 0 {
		diskUsage = -1
		if detailedStatus == nil {
			diskStatusReason = "no-status"
		} else {
			diskStatusReason = "agent-disabled"
		}
	}
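
	// Guest metadata (IP addresses, network interfaces, OS and agent
	// versions) is fetched for every running VM.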
	if res.Status == "running" {
		m.runGuestAgentVMWork(ctx, instanceName, res.Node, res.Name, res.VMID, func(agentCtx context.Context) {
			guestIPs, guestIfaces, guestOSName, guestOSVersion, guestAgentVersion := m.fetchGuestAgentMetadata(agentCtx, client, instanceName, res.Node, res.Name, res.VMID, detailedStatus, guestAgentAvailable)
			if len(guestIPs) > 0 {
				ipAddresses = guestIPs
			}
			if len(guestIfaces) > 0 {
				networkInterfaces = guestIfaces
			}
			if guestOSName != "" {
				osName = guestOSName
			}
			if guestOSVersion != "" {
				osVersion = guestOSVersion
			}
			if guestAgentVersion != "" {
				agentVersion = guestAgentVersion
			}
		})
	}
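
	// Disk fallback #1: a linked Pulse host agent's disk summary, used when
	// the guest agent produced no filesystem data.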
	if res.Status == "running" && !diskFromAgent {
		if hostDisk, hostDisks, ok := resolveGuestDiskFromLinkedHostAgent(guestID, vmIDToHostAgent); ok && hostDisk.Total > 0 {
			diskTotal = uint64(hostDisk.Total)
			diskUsed = uint64(hostDisk.Used)
			diskFree = uint64(hostDisk.Free)
			diskUsage = hostDisk.Usage
			individualDisks = hostDisks
			diskFromAgent = true
			diskStatusReason = ""
			log.Debug().
				Str("instance", instanceName).
				Str("vm", res.Name).
				Int("vmid", res.VMID).
				Float64("usage", hostDisk.Usage).
				Msg("QEMU disk: using linked Pulse host agent disk summary")
		}
	}
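
	// Disk fallback #2: carry forward the previous poll's disk figures for
	// transient failures, prefixing the status reason with "prev-".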
	if res.Status == "running" && !diskFromAgent && shouldCarryForwardQEMUDisk(diskStatusReason) {
		if prev, ok := prevDiskByGuestID[guestID]; ok && prev.Usage > 0 && prev.Total > 0 && prev.Used >= 0 && prev.Used <= prev.Total {
			diskTotal = uint64(prev.Total)
			diskUsed = uint64(prev.Used)
			diskFree = diskTotal - diskUsed
			diskUsage = prev.Usage
			if prevVM, ok := prevVMByGuestID[guestID]; ok {
				individualDisks = cloneGuestDisks(prevVM.Disks)
			}
			diskStatusReason = "prev-" + diskStatusReason
			log.Debug().
				Str("instance", instanceName).
				Str("vm", res.Name).
				Int("vmid", res.VMID).
				Float64("prevUsage", prev.Usage).
				Msg("Guest agent disk query failed; carrying forward previous disk data")
		}
	}
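
	// Reconcile the collected memory figures into memUsed and memorySource,
	// clamping available memory to the reported total.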
	if res.Status == "running" && memAvailable == 0 && memInfoTotalMinusUsed > 0 {
		memAvailable = memInfoTotalMinusUsed
		memorySource = "meminfo-total-minus-used"
	}
	switch {
	case res.Status != "running":
		memorySource = "powered-off"
		memUsed = 0
	case memAvailable > 0:
		if memAvailable > memTotal {
			memAvailable = memTotal
		}
		memUsed = memTotal - memAvailable
	case rrdUsed > 0:
		memUsed = rrdUsed
		memorySource = "rrd-memused"
	case detailedStatus != nil:
		if selection := selectVMLowTrustUsedMemory(memTotal, detailedStatus); selection.Source != "" {
			memUsed = selection.Used
			memorySource = selection.Source
		} else {
			memorySource = "status-unavailable"
		}
	}
	if memUsed > memTotal {
		memUsed = memTotal
	}
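
	// If this cycle only produced a low-trust fallback, keep the previous
	// snapshot's memory metrics instead of reporting a suspect usage spike.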
	sampleTime := time.Now()
	var memory models.Memory
	snapshotNotes := []string(nil)
	guestAgentHealthy := guestAgentSignalsHealthy(detailedStatus, diskFromAgent, ipAddresses, networkInterfaces, osName, osVersion, agentVersion)
	if prev, ok := prevVMByGuestID[guestID]; ok {
		switch {
		case shouldCarryForwardHealthyGuestLowTrustVMMemory(prev, res.Status, memorySource, memTotal, memUsed, sampleTime, guestAgentHealthy):
			fallbackSource := memorySource
			memory = prev.Memory
			if detailedStatus != nil && detailedStatus.Balloon > 0 {
				memory.Balloon = int64(detailedStatus.Balloon)
			}
			memorySource = "previous-snapshot"
			snapshotNotes = append(snapshotNotes, "preserved-previous-memory-for-healthy-guest-low-trust-full-usage")
			log.Debug().
				Str("instance", instanceName).
				Str("vm", res.Name).
				Int("vmid", res.VMID).
				Str("previousSource", prev.MemorySource).
				Str("fallbackSource", fallbackSource).
				Float64("previousUsage", prev.Memory.Usage).
				Float64("fallbackUsage", safePercentage(float64(memUsed), float64(memTotal))).
				Msg("Preserving previous VM memory metrics for guest with healthy agent signals after low-trust full-usage fallback")
		case shouldCarryForwardPreviousVMMemory(prev, res.Status, memorySource, memTotal, memUsed, sampleTime):
			fallbackSource := memorySource
			memory = prev.Memory
			if detailedStatus != nil && detailedStatus.Balloon > 0 {
				memory.Balloon = int64(detailedStatus.Balloon)
			}
			memorySource = "previous-snapshot"
			snapshotNotes = append(snapshotNotes, "preserved-previous-memory-after-low-trust-fallback")
			log.Debug().
				Str("instance", instanceName).
				Str("vm", res.Name).
				Int("vmid", res.VMID).
				Str("previousSource", prev.MemorySource).
				Str("fallbackSource", fallbackSource).
				Float64("previousUsage", prev.Memory.Usage).
				Float64("fallbackUsage", safePercentage(float64(memUsed), float64(memTotal))).
				Msg("Preserving previous VM memory metrics after low-trust fallback")
		}
	}
	if memory.Total == 0 {
		memFree := uint64(0)
		if memTotal >= memUsed {
			memFree = memTotal - memUsed
		}
		memory = models.Memory{
			Total: int64(memTotal),
			Used:  int64(memUsed),
			Free:  int64(memFree),
			Usage: safePercentage(float64(memUsed), float64(memTotal)),
		}
		if memory.Free < 0 {
			memory.Free = 0
		}
		if memory.Used > memory.Total {
			memory.Used = memory.Total
		}
		if memRawFree > 0 && memFree > memRawFree {
			memory.Cache = int64(memFree - memRawFree)
			memory.Free = int64(memRawFree)
		}
		if detailedStatus != nil && detailedStatus.Balloon > 0 {
			memory.Balloon = int64(detailedStatus.Balloon)
		}
	}
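
	// Turn cumulative I/O counters into per-second rates via the shared rate
	// tracker, keyed by guest ID.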
	currentMetrics := IOMetrics{
		DiskRead:   diskReadBytes,
		DiskWrite:  diskWriteBytes,
		NetworkIn:  networkInBytes,
		NetworkOut: networkOutBytes,
		Timestamp:  sampleTime,
	}
	diskReadRate, diskWriteRate, netInRate, netOutRate := m.rateTracker.CalculateRates(guestID, currentMetrics)
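
	// Assemble the VM model. Rates are clamped at zero so counter resets
	// cannot surface as negative throughput.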
	vm := models.VM{
		ID:           guestID,
		VMID:         res.VMID,
		Name:         res.Name,
		Node:         res.Node,
		Instance:     instanceName,
		Status:       res.Status,
		Type:         "qemu",
		CPU:          safeFloat(res.CPU),
		CPUs:         res.MaxCPU,
		Memory:       memory,
		MemorySource: memorySource,
		Disk: models.Disk{
			Total: int64(diskTotal),
			Used:  int64(diskUsed),
			Free:  int64(diskFree),
			Usage: diskUsage,
		},
		Disks:             individualDisks,
		DiskStatusReason:  diskStatusReason,
		IPAddresses:       ipAddresses,
		OSName:            osName,
		OSVersion:         osVersion,
		AgentVersion:      agentVersion,
		NetworkInterfaces: networkInterfaces,
		NetworkIn:         max(0, int64(netInRate)),
		NetworkOut:        max(0, int64(netOutRate)),
		DiskRead:          max(0, int64(diskReadRate)),
		DiskWrite:         max(0, int64(diskWriteRate)),
		Uptime:            int64(res.Uptime),
		Template:          res.Template == 1,
		LastSeen:          sampleTime,
	}
	if res.Tags != "" {
		vm.Tags = strings.Split(res.Tags, ";")
		for _, tag := range vm.Tags {
			switch tag {
			case "pulse-no-alerts", "pulse-monitor-only", "pulse-relaxed":
				log.Info().Str("vm", vm.Name).Str("node", vm.Node).Str("tag", tag).Msg("Pulse control tag detected on VM")
			}
		}
	}
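
	// Snapshot how this cycle's memory figures were derived, including the
	// raw values from every source consulted.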
	snapshot := GuestMemorySnapshot{
		Name:         vm.Name,
		Status:       vm.Status,
		RetrievedAt:  sampleTime,
		MemorySource: memorySource,
		Memory:       vm.Memory,
		Raw:          guestRaw,
		Notes:        snapshotNotes,
	}
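
	// The alert copy zeroes activity metrics for non-running VMs so values
	// from the last running sample cannot trip thresholds.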
	alertVM := vm
	if alertVM.Status != "running" {
		if logging.IsLevelEnabled(zerolog.DebugLevel) {
			log.Debug().
				Str("vm", alertVM.Name).
				Str("status", alertVM.Status).
				Float64("originalCpu", alertVM.CPU).
				Float64("originalMemUsage", alertVM.Memory.Usage).
				Msg("Non-running VM detected - zeroing metrics")
		}
		alertVM.CPU = 0
		alertVM.Memory.Usage = 0
		alertVM.Disk.Usage = 0
		alertVM.NetworkIn = 0
		alertVM.NetworkOut = 0
		alertVM.DiskRead = 0
		alertVM.DiskWrite = 0
	}
	return vm, alertVM, snapshot, true
}