//go:build integration

package monitoring

import (
	"context"
	"fmt"
	"os"
	"runtime"
	"strings"
	"time"

	"github.com/rcourtman/pulse-go-rewrite/internal/config"
	"github.com/rcourtman/pulse-go-rewrite/pkg/pbs"
	"github.com/rcourtman/pulse-go-rewrite/pkg/pmg"
	"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
)

// FailureType describes scripted failure behaviour used by the harness.
type FailureType int

const (
	FailureNone FailureType = iota
	FailureTransient
	FailurePermanent
)

// HarnessScenario captures the configuration for an integration run.
type HarnessScenario struct {
	Instances      []InstanceConfig
	Duration       time.Duration
	WarmupDuration time.Duration
}

// InstanceConfig models a single synthetic instance executed during the run.
type InstanceConfig struct {
	Type        string
	Name        string
	SuccessRate float64
	FailureSeq  []FailureType
	BaseLatency time.Duration
}
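
// An illustrative scenario, for orientation only; the values below are
// arbitrary examples, not recommended defaults:
//
//	scenario := HarnessScenario{
//		Duration:       30 * time.Second,
//		WarmupDuration: 5 * time.Second,
//		Instances: []InstanceConfig{
//			{Type: "pve", Name: "pve-1", SuccessRate: 0.95, BaseLatency: 50 * time.Millisecond},
//			{Type: "pbs", Name: "pbs-1", FailureSeq: []FailureType{FailureTransient, FailureNone}},
//		},
//	}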

// InstanceStats aggregates execution data per instance.
type InstanceStats struct {
	Total             int
	Successes         int
	Failures          int
	TransientFailures int
	PermanentFailures int
	AverageLatency    time.Duration
	LastError         string
	LastSuccessAt     time.Time
}

// QueueStats summarises task queue behaviour.
type QueueStats struct {
	MaxDepth     int
	AverageDepth float64
	Samples      int
	FinalDepth   int
}

// StalenessStats captures staleness score distribution.
type StalenessStats struct {
	Max     float64
	Average float64
	Samples int
}

// ResourceStats samples runtime resource usage.
type ResourceStats struct {
	GoroutinesStart int
	GoroutinesEnd   int
	HeapAllocStart  uint64
	HeapAllocEnd    uint64
	StackInuseStart uint64
	StackInuseEnd   uint64
	GCCountStart    uint32
	GCCountEnd      uint32
}

// HarnessReport is returned after a harness run completes.
type HarnessReport struct {
	Scenario         HarnessScenario
	PerInstanceStats map[string]InstanceStats
	QueueStats       QueueStats
	StalenessStats   StalenessStats
	ResourceStats    ResourceStats
	Health           SchedulerHealthResponse
	MaxStaleness     time.Duration
	RuntimeSamples   []runtimeSnapshot
}

// Harness orchestrates the integration run.
type Harness struct {
	Monitor           *Monitor
	Executor          *fakeExecutor
	cancel            context.CancelFunc
	scenario          HarnessScenario
	dataPath          string
	queueMax          int
	queueSum          int
	queueSamples      int
	maxStaleness      time.Duration
	sampleEvery       time.Duration
	runtimeSamples    []runtimeSnapshot
	lastRuntimeSample time.Time
}

// NewHarness constructs a harness configured for the provided scenario.
func NewHarness(scenario HarnessScenario) *Harness {
	if scenario.Duration <= 0 {
		scenario.Duration = 30 * time.Second
	}
	if scenario.WarmupDuration <= 0 {
		scenario.WarmupDuration = 5 * time.Second
	}

	tempDir, err := os.MkdirTemp("", "pulse-harness-*")
	if err != nil {
		panic(fmt.Errorf("create harness data dir: %w", err))
	}

	baseInterval := 3 * time.Second
	minInterval := 750 * time.Millisecond
	maxInterval := 8 * time.Second

	cfg := &config.Config{
		DataPath:                    tempDir,
		AdaptivePollingEnabled:      true,
		AdaptivePollingBaseInterval: baseInterval,
		AdaptivePollingMinInterval:  minInterval,
		AdaptivePollingMaxInterval:  maxInterval,
		FrontendPort:                7655,
		PublicURL:                   "http://127.0.0.1",
	}

	monitor, err := New(cfg)
	if err != nil {
		panic(fmt.Errorf("create monitor for harness: %w", err))
	}

	// Populate synthetic client entries so scheduler inventory is aware of instances.
	for _, inst := range scenario.Instances {
		switch strings.ToLower(inst.Type) {
		case "pve":
			monitor.pveClients[inst.Name] = noopPVEClient{}
		case "pbs":
			monitor.pbsClients[inst.Name] = &pbs.Client{}
		case "pmg":
			monitor.pmgClients[inst.Name] = &pmg.Client{}
		default:
			// Unsupported types are ignored for now.
		}
	}

	exec := newFakeExecutor(monitor, scenario)
	monitor.SetExecutor(exec)

	return &Harness{
		Monitor:      monitor,
		Executor:     exec,
		scenario:     scenario,
		dataPath:     tempDir,
		maxStaleness: cfg.AdaptivePollingMaxInterval,
		sampleEvery:  15 * time.Second,
	}
}
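
// Typical use from an integration test, sketched here for context only (the
// surrounding testing boilerplate is assumed, not part of this file):
//
//	h := NewHarness(scenario)
//	report := h.Run(context.Background())
//	_ = report.QueueStats.MaxDepth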

// Run executes the scenario and returns a report of collected statistics.
func (h *Harness) Run(ctx context.Context) HarnessReport {
	h.runtimeSamples = nil
	runtimeStart := sampleRuntime()
	h.runtimeSamples = append(h.runtimeSamples, runtimeStart)
	h.lastRuntimeSample = runtimeStart.Timestamp

	runCtx, cancel := context.WithCancel(ctx)
	h.cancel = cancel

	workerCount := len(h.scenario.Instances)
	if workerCount < 1 {
		workerCount = 1
	}

	h.Monitor.startTaskWorkers(runCtx, workerCount)
	h.schedule(time.Now())

	ticker := time.NewTicker(500 * time.Millisecond)
	defer ticker.Stop()

	runEnd := time.Now().Add(h.scenario.WarmupDuration + h.scenario.Duration)

loop:
	for {
		select {
		case <-runCtx.Done():
			break loop
		case <-ticker.C:
			now := time.Now()
			h.schedule(now)
			if h.sampleEvery > 0 && time.Since(h.lastRuntimeSample) >= h.sampleEvery {
				h.recordRuntimeSample()
			}
			if now.After(runEnd) {
				cancel()
			}
		}
	}

	// Allow in-flight work to finish.
	time.Sleep(500 * time.Millisecond)

	instanceStats := h.Executor.InstanceReport()
	queueAverage := 0.0
	if h.queueSamples > 0 {
		queueAverage = float64(h.queueSum) / float64(h.queueSamples)
	}

	finalQueueDepth := h.Monitor.taskQueue.Size()
	health := h.Monitor.SchedulerHealth()
	runtimeEnd := sampleRuntime()
	h.runtimeSamples = append(h.runtimeSamples, runtimeEnd)
	staleness := computeStalenessStats(h.Monitor)

	h.Monitor.Stop()
	runtimeSamplesCopy := append([]runtimeSnapshot(nil), h.runtimeSamples...)
	h.cleanup()
	if len(runtimeSamplesCopy) == 0 {
		runtimeSamplesCopy = append(runtimeSamplesCopy, runtimeStart, runtimeEnd)
	}
	startSample := runtimeSamplesCopy[0]
	endSample := runtimeSamplesCopy[len(runtimeSamplesCopy)-1]

	report := HarnessReport{
		Scenario:         h.scenario,
		PerInstanceStats: instanceStats,
		QueueStats: QueueStats{
			MaxDepth:     h.queueMax,
			AverageDepth: queueAverage,
			Samples:      h.queueSamples,
			FinalDepth:   finalQueueDepth,
		},
		StalenessStats: staleness,
		ResourceStats: ResourceStats{
			GoroutinesStart: startSample.Goroutines,
			GoroutinesEnd:   endSample.Goroutines,
			HeapAllocStart:  startSample.HeapAlloc,
			HeapAllocEnd:    endSample.HeapAlloc,
			StackInuseStart: startSample.StackInuse,
			StackInuseEnd:   endSample.StackInuse,
			GCCountStart:    startSample.NumGC,
			GCCountEnd:      endSample.NumGC,
		},
		Health:         health,
		MaxStaleness:   h.maxStaleness,
		RuntimeSamples: runtimeSamplesCopy,
	}

	return report
}
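
// schedule records the current queue depth and, once the queue has drained,
// rebuilds the scheduled task list for the given time and re-enqueues it.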
func (h *Harness) schedule(now time.Time) {
	if h.Monitor == nil {
		return
	}

	currentDepth := h.Monitor.taskQueue.Size()
	if currentDepth > 0 {
		h.recordQueueDepth(currentDepth)
		return
	}

	tasks := h.Monitor.buildScheduledTasks(now)
	if len(tasks) == 0 {
		h.recordQueueDepth(currentDepth)
		return
	}

	for _, task := range tasks {
		h.Monitor.taskQueue.Upsert(task)
	}

	h.recordQueueDepth(h.Monitor.taskQueue.Size())
}
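
// recordQueueDepth tracks queue depth samples for the report and mirrors the
// value into the monitor's poll metrics when they are available.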
func (h *Harness) recordQueueDepth(depth int) {
	h.queueSamples++
	h.queueSum += depth
	if depth > h.queueMax {
		h.queueMax = depth
	}
	if h.Monitor != nil && h.Monitor.pollMetrics != nil {
		h.Monitor.pollMetrics.SetQueueDepth(depth)
	}
}
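
// recordRuntimeSample appends a fresh runtime snapshot to the sample history.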
func (h *Harness) recordRuntimeSample() {
	snap := sampleRuntime()
	h.runtimeSamples = append(h.runtimeSamples, snap)
	h.lastRuntimeSample = snap.Timestamp
}
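
// cleanup cancels any in-flight run, clears collected samples, and removes the
// temporary data directory created by NewHarness.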
func (h *Harness) cleanup() {
	if h.cancel != nil {
		h.cancel()
		h.cancel = nil
	}
	h.runtimeSamples = nil
	h.lastRuntimeSample = time.Time{}
	if h.dataPath != "" {
		_ = os.RemoveAll(h.dataPath)
		h.dataPath = ""
	}
}
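
// runtimeSnapshot is a point-in-time capture of goroutine count and memory
// statistics used to compare resource usage at the start and end of a run.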
type runtimeSnapshot struct {
	Timestamp  time.Time
	Goroutines int
	HeapAlloc  uint64
	StackInuse uint64
	NumGC      uint32
}
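
// sampleRuntime captures the current goroutine count and memory statistics.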
func sampleRuntime() runtimeSnapshot {
	var ms runtime.MemStats
	runtime.ReadMemStats(&ms)
	return runtimeSnapshot{
		Timestamp:  time.Now(),
		Goroutines: runtime.NumGoroutine(),
		HeapAlloc:  ms.HeapAlloc,
		StackInuse: ms.StackInuse,
		NumGC:      ms.NumGC,
	}
}
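
// computeStalenessStats summarises the staleness tracker's snapshot into
// maximum, average, and sample count, returning zero values when the monitor
// or tracker is absent.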
func computeStalenessStats(m *Monitor) StalenessStats {
	if m == nil || m.stalenessTracker == nil {
		return StalenessStats{}
	}

	snapshots := m.stalenessTracker.Snapshot()
	if len(snapshots) == 0 {
		return StalenessStats{}
	}

	var sum float64
	maxScore := 0.0
	for _, snap := range snapshots {
		sum += snap.Score
		if snap.Score > maxScore {
			maxScore = snap.Score
		}
	}

	avg := sum / float64(len(snapshots))
	return StalenessStats{
		Max:     maxScore,
		Average: avg,
		Samples: len(snapshots),
	}
}
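
// toInstanceType maps a scenario instance type string onto the scheduler's
// InstanceType, falling back to the lower-cased raw value for unknown types.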
func toInstanceType(value string) InstanceType {
	switch strings.ToLower(value) {
	case "pve":
		return InstanceTypePVE
	case "pbs":
		return InstanceTypePBS
	case "pmg":
		return InstanceTypePMG
	default:
		return InstanceType(strings.ToLower(value))
	}
}
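
// noopPVEClient provides no-op implementations of the Proxmox client methods
// the monitor expects, so harness instances can be registered as inventory
// entries without issuing real API calls.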
type noopPVEClient struct{}

func (noopPVEClient) GetNodes(ctx context.Context) ([]proxmox.Node, error) { return nil, nil }
func (noopPVEClient) GetNodeStatus(ctx context.Context, node string) (*proxmox.NodeStatus, error) {
	return nil, nil
}
func (noopPVEClient) GetNodeRRDData(ctx context.Context, node string, timeframe string, cf string, ds []string) ([]proxmox.NodeRRDPoint, error) {
	return nil, nil
}
func (noopPVEClient) GetLXCRRDData(ctx context.Context, node string, vmid int, timeframe string, cf string, ds []string) ([]proxmox.GuestRRDPoint, error) {
	return nil, nil
}
func (noopPVEClient) GetVMRRDData(ctx context.Context, node string, vmid int, timeframe string, cf string, ds []string) ([]proxmox.GuestRRDPoint, error) {
	return nil, nil
}
func (noopPVEClient) GetVMs(ctx context.Context, node string) ([]proxmox.VM, error) { return nil, nil }
func (noopPVEClient) GetContainers(ctx context.Context, node string) ([]proxmox.Container, error) {
	return nil, nil
}
func (noopPVEClient) GetStorage(ctx context.Context, node string) ([]proxmox.Storage, error) {
	return nil, nil
}
func (noopPVEClient) GetAllStorage(ctx context.Context) ([]proxmox.Storage, error) { return nil, nil }
func (noopPVEClient) GetBackupTasks(ctx context.Context) ([]proxmox.Task, error) { return nil, nil }
func (noopPVEClient) GetReplicationStatus(ctx context.Context) ([]proxmox.ReplicationJob, error) {
	return nil, nil
}
func (noopPVEClient) GetStorageContent(ctx context.Context, node, storage string) ([]proxmox.StorageContent, error) {
	return nil, nil
}
func (noopPVEClient) GetVMSnapshots(ctx context.Context, node string, vmid int) ([]proxmox.Snapshot, error) {
	return nil, nil
}
func (noopPVEClient) GetContainerSnapshots(ctx context.Context, node string, vmid int) ([]proxmox.Snapshot, error) {
	return nil, nil
}
func (noopPVEClient) GetVMStatus(ctx context.Context, node string, vmid int) (*proxmox.VMStatus, error) {
	return nil, nil
}
func (noopPVEClient) GetContainerStatus(ctx context.Context, node string, vmid int) (*proxmox.Container, error) {
	return nil, nil
}
func (noopPVEClient) GetContainerConfig(ctx context.Context, node string, vmid int) (map[string]interface{}, error) {
	return nil, nil
}
func (noopPVEClient) GetContainerInterfaces(ctx context.Context, node string, vmid int) ([]proxmox.ContainerInterface, error) {
	return nil, nil
}
func (noopPVEClient) GetClusterResources(ctx context.Context, resourceType string) ([]proxmox.ClusterResource, error) {
	return nil, nil
}
func (noopPVEClient) IsClusterMember(ctx context.Context) (bool, error) { return false, nil }
func (noopPVEClient) GetVMFSInfo(ctx context.Context, node string, vmid int) ([]proxmox.VMFileSystem, error) {
	return nil, nil
}
func (noopPVEClient) GetVMNetworkInterfaces(ctx context.Context, node string, vmid int) ([]proxmox.VMNetworkInterface, error) {
	return nil, nil
}
func (noopPVEClient) GetVMAgentInfo(ctx context.Context, node string, vmid int) (map[string]interface{}, error) {
	return map[string]interface{}{}, nil
}

func (noopPVEClient) GetVMAgentVersion(ctx context.Context, node string, vmid int) (string, error) {
	return "", nil
}
func (noopPVEClient) GetZFSPoolStatus(ctx context.Context, node string) ([]proxmox.ZFSPoolStatus, error) {
	return nil, nil
}
func (noopPVEClient) GetZFSPoolsWithDetails(ctx context.Context, node string) ([]proxmox.ZFSPoolInfo, error) {
	return nil, nil
}
func (noopPVEClient) GetDisks(ctx context.Context, node string) ([]proxmox.Disk, error) {
	return nil, nil
}
func (noopPVEClient) GetCephStatus(ctx context.Context) (*proxmox.CephStatus, error) { return nil, nil }
func (noopPVEClient) GetCephDF(ctx context.Context) (*proxmox.CephDF, error) { return nil, nil }