fix(memory): add guest agent /proc/meminfo fallback to avoid VM memory inflation (#1270)

Proxmox status.Mem includes page cache as "used" memory, inflating
reported VM usage. The existing fallbacks (balloon meminfo, RRD, linked
host agent) were frequently unavailable, causing most VMs to fall
through to the inflated status-mem source.
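
For illustration only (the numbers below are made up), the inflation looks
roughly like this:

    // Hypothetical 8 GiB Linux VM with ~3 GiB of reclaimable page cache.
    package main

    import "fmt"

    func main() {
        total := uint64(8 << 30)        // 8 GiB
        statusMem := uint64(5 << 30)    // status.Mem: page cache counted as "used"
        memAvailable := uint64(6 << 30) // MemAvailable from /proc/meminfo
        fmt.Printf("status-mem usage:   %.0f%%\n", 100*float64(statusMem)/float64(total))          // ~62%
        fmt.Printf("memavailable usage: %.0f%%\n", 100*float64(total-memAvailable)/float64(total)) // 25%
    }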

Adds a new last-resort fallback that reads /proc/meminfo via the QEMU
guest agent file-read endpoint to get accurate MemAvailable. Results
are cached (60s positive, 5min negative backoff for unsupported VMs).
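
For reference, the resulting per-VM fallback order, as a minimal sketch in the
monitoring package (resolveMemAvailable, balloonAvail, hostAgentAvail and
agentEnabled are illustrative names, not part of this commit; only
getVMRRDMetrics and getVMAgentMemAvailable are real):

    // Sketch of the memAvailable fallback chain; error handling simplified.
    func (m *Monitor) resolveMemAvailable(ctx context.Context, client PVEClientInterface,
        instance, node string, vmid int,
        balloonAvail, hostAgentAvail uint64, agentEnabled bool) uint64 {
        memAvailable := balloonAvail // existing: guest-agent/balloon meminfo fields
        if memAvailable == 0 {
            if v, err := m.getVMRRDMetrics(ctx, client, instance, node, vmid); err == nil && v > 0 {
                memAvailable = v // existing: Proxmox RRD memavailable (cached)
            }
        }
        if memAvailable == 0 {
            memAvailable = hostAgentAvail // existing: linked Pulse host agent
        }
        if memAvailable == 0 && agentEnabled {
            if v, err := m.getVMAgentMemAvailable(ctx, client, instance, node, vmid); err == nil && v > 0 {
                memAvailable = v // new: guest-agent file-read of /proc/meminfo
            }
        }
        return memAvailable // 0 means the caller falls back to status.Mem / FreeMem
    }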

Also fixes: the RRD memavailable fallback missing from the traditional
polling path, cache-key collisions in multi-PVE setups, an inconsistent
FreeMem underflow guard, and potential integer overflow in the
kB-to-bytes conversion.
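
The cache-key change, for example (the instance name and IDs below are
illustrative): two PVE instances can each have a node named "pve" with a
VM 101, so the key must include the instance.

    oldKey := fmt.Sprintf("%s/%d", node, vmid)               // "pve/101" (collides across instances)
    newKey := fmt.Sprintf("%s/%s/%d", instance, node, vmid)  // "cluster-a/pve/101"
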
rcourtman 2026-02-20 10:07:17 +00:00
parent 8c7d507ea4
commit 0ae2806f18
12 changed files with 237 additions and 9 deletions

View file

@ -33,6 +33,10 @@ func (m *mockCephPVEClient) GetCephDF(ctx context.Context) (*proxmox.CephDF, err
return args.Get(0).(*proxmox.CephDF), args.Error(1)
}
func (m *mockCephPVEClient) GetVMMemAvailableFromAgent(ctx context.Context, node string, vmid int) (uint64, error) {
return 0, fmt.Errorf("not implemented")
}
func TestPollCephCluster(t *testing.T) {
t.Run("clears state when ceph not detected", func(t *testing.T) {
m := &Monitor{state: models.NewState()}

View file

@ -79,6 +79,7 @@ type PVEClientInterface interface {
IsClusterMember(ctx context.Context) (bool, error)
GetVMFSInfo(ctx context.Context, node string, vmid int) ([]proxmox.VMFileSystem, error)
GetVMNetworkInterfaces(ctx context.Context, node string, vmid int) ([]proxmox.VMNetworkInterface, error)
GetVMMemAvailableFromAgent(ctx context.Context, node string, vmid int) (uint64, error)
GetVMAgentInfo(ctx context.Context, node string, vmid int) (map[string]interface{}, error)
GetVMAgentVersion(ctx context.Context, node string, vmid int) (string, error)
GetZFSPoolStatus(ctx context.Context, node string) ([]proxmox.ZFSPoolStatus, error)
@ -803,11 +804,12 @@ type Monitor struct {
rrdCacheMu sync.RWMutex // Protects RRD memavailable cache
nodeRRDMemCache map[string]rrdMemCacheEntry
vmRRDMemCache map[string]rrdMemCacheEntry
removedDockerHosts map[string]time.Time // Track deliberately removed Docker hosts (ID -> removal time)
dockerTokenBindings map[string]string // Track token ID -> agent ID bindings to enforce uniqueness
removedKubernetesClusters map[string]time.Time // Track deliberately removed Kubernetes clusters (ID -> removal time)
kubernetesTokenBindings map[string]string // Track token ID -> agent ID bindings to enforce uniqueness
hostTokenBindings map[string]string // Track tokenID:hostname -> host identity bindings
vmAgentMemCache map[string]agentMemCacheEntry // Guest agent /proc/meminfo cache
removedDockerHosts map[string]time.Time // Track deliberately removed Docker hosts (ID -> removal time)
dockerTokenBindings map[string]string // Track token ID -> agent ID bindings to enforce uniqueness
removedKubernetesClusters map[string]time.Time // Track deliberately removed Kubernetes clusters (ID -> removal time)
kubernetesTokenBindings map[string]string // Track token ID -> agent ID bindings to enforce uniqueness
hostTokenBindings map[string]string // Track tokenID:hostname -> host identity bindings
dockerCommands map[string]*dockerHostCommand
dockerCommandIndex map[string]string
guestMetadataMu sync.RWMutex
@ -850,6 +852,14 @@ type rrdMemCacheEntry struct {
fetchedAt time.Time
}
// agentMemCacheEntry caches MemAvailable read via guest agent file-read of /proc/meminfo.
// A zero available with negative=true means the VM doesn't support this (e.g. Windows, agent off).
type agentMemCacheEntry struct {
available uint64
negative bool // true = read failed, don't retry until TTL expires
fetchedAt time.Time
}
// pendingUpdatesCache caches apt pending updates count per node
type pendingUpdatesCache struct {
count int
@ -1154,6 +1164,9 @@ const (
nodeOfflineGracePeriod = 60 * time.Second // Grace period before marking Proxmox nodes offline
nodeRRDCacheTTL = 30 * time.Second
nodeRRDRequestTimeout = 2 * time.Second
vmAgentMemCacheTTL = 60 * time.Second // Cache guest-agent /proc/meminfo reads
vmAgentMemRequestTimeout = 3 * time.Second // Timeout for guest-agent file-read calls
vmAgentMemNegativeTTL = 5 * time.Minute // Backoff for VMs where guest-agent read fails
)
type taskOutcome struct {
@ -1239,12 +1252,12 @@ func (m *Monitor) getNodeRRDMetrics(ctx context.Context, client PVEClientInterfa
// getVMRRDMetrics fetches Proxmox RRD memavailable for a single VM with a
// short-lived cache to avoid a live API call on every poll for VMs that
// consistently lack guest-agent memory data (e.g. Windows VMs).
func (m *Monitor) getVMRRDMetrics(ctx context.Context, client PVEClientInterface, node string, vmid int) (uint64, error) {
func (m *Monitor) getVMRRDMetrics(ctx context.Context, client PVEClientInterface, instance, node string, vmid int) (uint64, error) {
if client == nil || node == "" || vmid <= 0 {
return 0, fmt.Errorf("invalid arguments for VM RRD lookup")
}
cacheKey := fmt.Sprintf("%s/%d", node, vmid)
cacheKey := fmt.Sprintf("%s/%s/%d", instance, node, vmid)
now := time.Now()
m.rrdCacheMu.RLock()
@ -1285,6 +1298,52 @@ func (m *Monitor) getVMRRDMetrics(ctx context.Context, client PVEClientInterface
return memAvailable, nil
}
// getVMAgentMemAvailable reads MemAvailable via the QEMU guest agent file-read
// endpoint (/proc/meminfo). Results are cached; failed reads use a longer
// negative-cache TTL to avoid hammering VMs that don't support it.
func (m *Monitor) getVMAgentMemAvailable(ctx context.Context, client PVEClientInterface, instance, node string, vmid int) (uint64, error) {
if client == nil || node == "" || vmid <= 0 {
return 0, fmt.Errorf("invalid arguments for guest agent mem lookup")
}
cacheKey := fmt.Sprintf("%s/%s/%d", instance, node, vmid)
now := time.Now()
m.rrdCacheMu.RLock()
if entry, ok := m.vmAgentMemCache[cacheKey]; ok {
ttl := vmAgentMemCacheTTL
if entry.negative {
ttl = vmAgentMemNegativeTTL
}
if now.Sub(entry.fetchedAt) < ttl {
m.rrdCacheMu.RUnlock()
if entry.negative {
return 0, fmt.Errorf("guest agent mem read previously failed (negative cache)")
}
return entry.available, nil
}
}
m.rrdCacheMu.RUnlock()
requestCtx, cancel := context.WithTimeout(ctx, vmAgentMemRequestTimeout)
defer cancel()
available, err := client.GetVMMemAvailableFromAgent(requestCtx, node, vmid)
if err != nil {
// Negative cache: don't retry for a while
m.rrdCacheMu.Lock()
m.vmAgentMemCache[cacheKey] = agentMemCacheEntry{negative: true, fetchedAt: now}
m.rrdCacheMu.Unlock()
return 0, err
}
m.rrdCacheMu.Lock()
m.vmAgentMemCache[cacheKey] = agentMemCacheEntry{available: available, fetchedAt: now}
m.rrdCacheMu.Unlock()
return available, nil
}
// RemoveDockerHost removes a docker host from the shared state and clears related alerts.
func (m *Monitor) RemoveDockerHost(hostID string) (models.DockerHost, error) {
hostID = strings.TrimSpace(hostID)
@ -3149,6 +3208,14 @@ func (m *Monitor) cleanupRRDCache(now time.Time) {
delete(m.vmRRDMemCache, key)
}
}
// Clean up guest-agent memory cache (use longer TTL for negative entries)
agentMaxAge := 2 * vmAgentMemNegativeTTL
for key, entry := range m.vmAgentMemCache {
if now.Sub(entry.fetchedAt) > agentMaxAge {
delete(m.vmAgentMemCache, key)
}
}
}
// cleanupMetricsHistory removes stale entries from the metrics history.
@ -3803,6 +3870,7 @@ func New(cfg *config.Config) (*Monitor, error) {
guestSnapshots: make(map[string]GuestMemorySnapshot),
nodeRRDMemCache: make(map[string]rrdMemCacheEntry),
vmRRDMemCache: make(map[string]rrdMemCacheEntry),
vmAgentMemCache: make(map[string]agentMemCacheEntry),
removedDockerHosts: make(map[string]time.Time),
dockerTokenBindings: make(map[string]string),
removedKubernetesClusters: make(map[string]time.Time),
@ -6872,7 +6940,7 @@ func (m *Monitor) pollVMsAndContainersEfficient(ctx context.Context, instanceNam
// try Proxmox RRD's memavailable (cache-aware) before falling back to status.Mem
// which can include reclaimable page cache (inflating usage). Refs: #1270
if memAvailable == 0 {
if rrdAvailable, rrdErr := m.getVMRRDMetrics(ctx, client, res.Node, res.VMID); rrdErr == nil && rrdAvailable > 0 {
if rrdAvailable, rrdErr := m.getVMRRDMetrics(ctx, client, instanceName, res.Node, res.VMID); rrdErr == nil && rrdAvailable > 0 {
memAvailable = rrdAvailable
memorySource = "rrd-memavailable"
guestRaw.MemInfoAvailable = memAvailable
@ -6917,6 +6985,25 @@ func (m *Monitor) pollVMsAndContainersEfficient(ctx context.Context, instanceNam
}
}
// Last-resort fallback before status-mem: read /proc/meminfo via the
// QEMU guest agent's file-read endpoint. This works for Linux VMs with
// the guest agent running even when the balloon driver doesn't populate
// the meminfo fields. Results are cached with negative backoff. Refs: #1270
if memAvailable == 0 && detailedStatus.Agent.Value > 0 {
if agentAvail, agentErr := m.getVMAgentMemAvailable(ctx, client, instanceName, res.Node, res.VMID); agentErr == nil && agentAvail > 0 {
memAvailable = agentAvail
memorySource = "guest-agent-meminfo"
guestRaw.MemInfoAvailable = memAvailable
log.Debug().
Str("vm", res.Name).
Str("node", res.Node).
Int("vmid", res.VMID).
Uint64("total", memTotal).
Uint64("available", memAvailable).
Msg("QEMU memory: using guest agent /proc/meminfo fallback (excludes reclaimable cache)")
}
}
switch {
case memAvailable > 0:
if memAvailable > memTotal {

View file

@ -295,6 +295,10 @@ func (m *mockPVEClientExtra) GetVMAgentVersion(ctx context.Context, node string,
return "1.0", nil
}
func (m *mockPVEClientExtra) GetVMMemAvailableFromAgent(ctx context.Context, node string, vmid int) (uint64, error) {
return 0, fmt.Errorf("not implemented")
}
func (m *mockPVEClientExtra) GetLXCRRDData(ctx context.Context, node string, vmid int, timeframe string, cf string, ds []string) ([]proxmox.GuestRRDPoint, error) {
return nil, nil
}
@ -345,6 +349,9 @@ func TestMonitor_PollVMsAndContainersEfficient_Extra(t *testing.T) {
metricsHistory: NewMetricsHistory(100, time.Hour),
alertManager: alerts.NewManager(),
stalenessTracker: NewStalenessTracker(nil),
nodeRRDMemCache: make(map[string]rrdMemCacheEntry),
vmRRDMemCache: make(map[string]rrdMemCacheEntry),
vmAgentMemCache: make(map[string]agentMemCacheEntry),
}
defer m.alertManager.Stop()
@ -505,6 +512,10 @@ func (m *mockPVEClientStorage) GetStorageContent(ctx context.Context, node, stor
return m.content, nil
}
func (m *mockPVEClientStorage) GetVMMemAvailableFromAgent(ctx context.Context, node string, vmid int) (uint64, error) {
return 0, fmt.Errorf("not implemented")
}
func TestMonitor_RetryPVEPortFallback_Extra(t *testing.T) {
m := &Monitor{
config: &config.Config{},
@ -865,6 +876,10 @@ func (m *mockPVEClientFailNodes) GetNodes(ctx context.Context) ([]proxmox.Node,
return nil, fmt.Errorf("nodes failed")
}
func (m *mockPVEClientFailNodes) GetVMMemAvailableFromAgent(ctx context.Context, node string, vmid int) (uint64, error) {
return 0, fmt.Errorf("not implemented")
}
type mockExecutor struct {
executed []PollTask
}

View file

@ -27,6 +27,10 @@ type mockPVEClient struct {
func (m *mockPVEClient) GetNodes(ctx context.Context) ([]proxmox.Node, error) { return nil, nil }
func (m *mockPVEClient) GetVMMemAvailableFromAgent(ctx context.Context, node string, vmid int) (uint64, error) {
return 0, fmt.Errorf("not implemented")
}
func TestMonitor_GetConnectionStatuses(t *testing.T) {
// Real Mode
m := &Monitor{
@ -698,6 +702,10 @@ func (m *mockPVEClientExtended) GetVMAgentVersion(ctx context.Context, node stri
return "", nil
}
func (m *mockPVEClientExtended) GetVMMemAvailableFromAgent(ctx context.Context, node string, vmid int) (uint64, error) {
return 0, fmt.Errorf("not implemented")
}
func (m *mockPVEClientExtended) GetZFSPoolStatus(ctx context.Context, node string) ([]proxmox.ZFSPoolStatus, error) {
return nil, nil
}

View file

@ -2,6 +2,7 @@ package monitoring
import (
"context"
"fmt"
"math"
"testing"
"time"
@ -116,6 +117,10 @@ func (s *stubPVEClient) GetVMAgentVersion(ctx context.Context, node string, vmid
return "", nil
}
func (s *stubPVEClient) GetVMMemAvailableFromAgent(ctx context.Context, node string, vmid int) (uint64, error) {
return 0, fmt.Errorf("not implemented")
}
func (s *stubPVEClient) GetZFSPoolStatus(ctx context.Context, node string) ([]proxmox.ZFSPoolStatus, error) {
return nil, nil
}

View file

@ -371,6 +371,22 @@ func (m *Monitor) pollVMsWithNodes(ctx context.Context, instanceName string, clu
// and makes the frontend's balloon marker logic ineffective.
// Refs: #1070
// Fallback: try RRD memavailable (cached). Refs: #1270
if memAvailable == 0 {
if rrdAvailable, rrdErr := m.getVMRRDMetrics(ctx, client, instanceName, n.Node, vm.VMID); rrdErr == nil && rrdAvailable > 0 {
memAvailable = rrdAvailable
memorySource = "rrd-memavailable"
guestRaw.MemInfoAvailable = memAvailable
log.Debug().
Str("vm", vm.Name).
Str("node", n.Node).
Int("vmid", vm.VMID).
Uint64("total", memTotal).
Uint64("available", memAvailable).
Msg("QEMU memory: using RRD memavailable fallback (excludes reclaimable cache)")
}
}
// Fallback: use linked Pulse host agent's memory data.
// gopsutil's Used = Total - Available (excludes page cache),
// so we can derive accurate available memory. Refs: #1270
@ -395,6 +411,23 @@ func (m *Monitor) pollVMsWithNodes(ctx context.Context, instanceName string, clu
}
}
// Last-resort fallback before status-mem: read /proc/meminfo via the
// QEMU guest agent file-read endpoint. Refs: #1270
if memAvailable == 0 && status.Agent.Value > 0 {
if agentAvail, agentErr := m.getVMAgentMemAvailable(ctx, client, instanceName, n.Node, vm.VMID); agentErr == nil && agentAvail > 0 {
memAvailable = agentAvail
memorySource = "guest-agent-meminfo"
guestRaw.MemInfoAvailable = memAvailable
log.Debug().
Str("vm", vm.Name).
Str("node", n.Node).
Int("vmid", vm.VMID).
Uint64("total", memTotal).
Uint64("available", memAvailable).
Msg("QEMU memory: using guest agent /proc/meminfo fallback (excludes reclaimable cache)")
}
}
switch {
case memAvailable > 0:
if memAvailable > memTotal {
@ -411,7 +444,7 @@ func (m *Monitor) pollVMsWithNodes(ctx context.Context, instanceName string, clu
// Refs: #1185
memUsed = vmStatus.Mem
memorySource = "status-mem"
case vmStatus.FreeMem > 0:
case vmStatus.FreeMem > 0 && memTotal >= vmStatus.FreeMem:
memUsed = memTotal - vmStatus.FreeMem
memorySource = "status-freemem"
default:

View file

@ -27,6 +27,10 @@ func (m *mockPVEClientSnapshots) GetContainerSnapshots(ctx context.Context, node
return m.snapshots, nil
}
func (m *mockPVEClientSnapshots) GetVMMemAvailableFromAgent(ctx context.Context, node string, vmid int) (uint64, error) {
return 0, fmt.Errorf("not implemented")
}
func TestMonitor_PollGuestSnapshots_Coverage(t *testing.T) {
m := &Monitor{
state: models.NewState(),

View file

@ -2,6 +2,7 @@ package monitoring
import (
"context"
"fmt"
"testing"
"github.com/rcourtman/pulse-go-rewrite/internal/models"
@ -86,6 +87,9 @@ func (f fakeSnapshotClient) GetVMAgentInfo(ctx context.Context, node string, vmi
func (f fakeSnapshotClient) GetVMAgentVersion(ctx context.Context, node string, vmid int) (string, error) {
return "", nil
}
func (f fakeSnapshotClient) GetVMMemAvailableFromAgent(ctx context.Context, node string, vmid int) (uint64, error) {
return 0, fmt.Errorf("not implemented")
}
func (f fakeSnapshotClient) GetZFSPoolStatus(ctx context.Context, node string) ([]proxmox.ZFSPoolStatus, error) {
return nil, nil
}

View file

@ -30,6 +30,10 @@ func (m *mockPVEClientForStorage) GetStorage(ctx context.Context, node string) (
return m.Storages, nil
}
func (m *mockPVEClientForStorage) GetVMMemAvailableFromAgent(ctx context.Context, node string, vmid int) (uint64, error) {
return 0, fmt.Errorf("not implemented")
}
func (m *mockPVEClientForStorage) GetStorageContent(ctx context.Context, node, storage string) ([]proxmox.StorageContent, error) {
if m.StorageToFail != nil && m.StorageToFail[storage] {
return nil, fmt.Errorf("failed to get content")

View file

@ -116,6 +116,10 @@ func (f *fakeStorageClient) GetVMAgentVersion(ctx context.Context, node string,
return "", nil
}
func (f *fakeStorageClient) GetVMMemAvailableFromAgent(ctx context.Context, node string, vmid int) (uint64, error) {
return 0, fmt.Errorf("not implemented")
}
func (f *fakeStorageClient) GetZFSPoolStatus(ctx context.Context, node string) ([]proxmox.ZFSPoolStatus, error) {
return nil, nil
}

View file

@ -1655,6 +1655,52 @@ func (c *Client) GetVMNetworkInterfaces(ctx context.Context, node string, vmid i
return result.Data.Result, nil
}
// GetVMMemAvailableFromAgent reads /proc/meminfo via the QEMU guest agent's
// file-read endpoint and returns MemAvailable in bytes. This is a fallback for
// VMs where the balloon driver does not populate the meminfo field in the
// status endpoint. Returns an error if the guest agent is unavailable, the
// file cannot be read, or MemAvailable is not present (e.g. Windows VMs).
func (c *Client) GetVMMemAvailableFromAgent(ctx context.Context, node string, vmid int) (uint64, error) {
fileParam := url.QueryEscape("/proc/meminfo")
resp, err := c.get(ctx, fmt.Sprintf("/nodes/%s/qemu/%d/agent/file-read?file=%s", node, vmid, fileParam))
if err != nil {
return 0, fmt.Errorf("guest agent file-read /proc/meminfo: %w", err)
}
defer resp.Body.Close()
var result struct {
Data struct {
Content string `json:"content"`
Truncated *bool `json:"truncated,omitempty"`
} `json:"data"`
}
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
return 0, fmt.Errorf("decode file-read response: %w", err)
}
// Parse MemAvailable from /proc/meminfo (format: "MemAvailable: 12345 kB")
for _, line := range strings.Split(result.Data.Content, "\n") {
line = strings.TrimSpace(line)
if !strings.HasPrefix(line, "MemAvailable:") {
continue
}
fields := strings.Fields(line)
if len(fields) < 2 {
continue
}
kB, err := strconv.ParseUint(fields[1], 10, 64)
if err != nil {
return 0, fmt.Errorf("parse MemAvailable value %q: %w", fields[1], err)
}
if kB > math.MaxUint64/1024 {
return 0, fmt.Errorf("MemAvailable value %d kB overflows uint64", kB)
}
return kB * 1024, nil // Convert kB to bytes
}
return 0, fmt.Errorf("MemAvailable not found in /proc/meminfo")
}
// GetVMStatus returns detailed VM status including balloon info
func (c *Client) GetVMStatus(ctx context.Context, node string, vmid int) (*VMStatus, error) {
// Note: Proxmox 9.x removed support for the "full" parameter

View file

@ -1245,6 +1245,20 @@ func (cc *ClusterClient) GetVMNetworkInterfaces(ctx context.Context, node string
return result, err
}
// GetVMMemAvailableFromAgent reads /proc/meminfo via the QEMU guest agent to get MemAvailable.
func (cc *ClusterClient) GetVMMemAvailableFromAgent(ctx context.Context, node string, vmid int) (uint64, error) {
var result uint64
err := cc.executeWithFailover(ctx, func(client *Client) error {
available, err := client.GetVMMemAvailableFromAgent(ctx, node, vmid)
if err != nil {
return err
}
result = available
return nil
})
return result, err
}
// GetClusterResources returns all resources (VMs, containers) across the cluster in a single call
func (cc *ClusterClient) GetClusterResources(ctx context.Context, resourceType string) ([]ClusterResource, error) {
var result []ClusterResource