Pulse/internal/monitoring/monitor.go
2026-04-11 16:47:37 +01:00

5582 lines
170 KiB
Go

package monitoring
import (
"context"
"encoding/json"
stderrors "errors"
"fmt"
"math"
"math/rand"
"os"
"path/filepath"
"sort"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/rcourtman/pulse-go-rewrite/internal/ai/memory"
"github.com/rcourtman/pulse-go-rewrite/internal/alerts"
"github.com/rcourtman/pulse-go-rewrite/internal/config"
"github.com/rcourtman/pulse-go-rewrite/internal/discovery"
"github.com/rcourtman/pulse-go-rewrite/internal/logging"
"github.com/rcourtman/pulse-go-rewrite/internal/mock"
"github.com/rcourtman/pulse-go-rewrite/internal/models"
"github.com/rcourtman/pulse-go-rewrite/internal/monitoring/errors"
"github.com/rcourtman/pulse-go-rewrite/internal/notifications"
recoverymanager "github.com/rcourtman/pulse-go-rewrite/internal/recovery/manager"
"github.com/rcourtman/pulse-go-rewrite/internal/system"
"github.com/rcourtman/pulse-go-rewrite/internal/unifiedresources"
"github.com/rcourtman/pulse-go-rewrite/internal/websocket"
"github.com/rcourtman/pulse-go-rewrite/pkg/metrics"
"github.com/rcourtman/pulse-go-rewrite/pkg/pbs"
"github.com/rcourtman/pulse-go-rewrite/pkg/pmg"
"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
)
// Bounds for per-task polling timeouts. defaultTaskTimeout applies when no
// explicit timeout is configured; min/max presumably clamp configured values
// — the clamping site is outside this chunk, so confirm before relying on it.
const (
	defaultTaskTimeout = 90 * time.Second
	minTaskTimeout     = 30 * time.Second
	maxTaskTimeout     = 3 * time.Minute
)
// mockKeepRealPollingEnv toggles whether real polling keeps running while
// mock mode is active.
const mockKeepRealPollingEnv = "PULSE_MOCK_KEEP_REAL_POLLING"

// keepRealPollingInMockMode reports whether the PULSE_MOCK_KEEP_REAL_POLLING
// environment variable is set to an explicit truthy value ("1", "true",
// "yes", "on"; whitespace and case are ignored). Any other value — including
// unset, explicit negatives, and garbage — disables real polling.
func keepRealPollingInMockMode() bool {
	value := strings.ToLower(strings.TrimSpace(os.Getenv(mockKeepRealPollingEnv)))
	switch value {
	case "1", "true", "yes", "on":
		return true
	default:
		return false
	}
}
// newProxmoxClientFunc is a variable that holds the function to create a new Proxmox client.
// It is used to allow mocking the client creation in tests. The default
// implementation simply delegates to proxmox.NewClient.
var newProxmoxClientFunc = func(cfg proxmox.ClientConfig) (PVEClientInterface, error) {
	return proxmox.NewClient(cfg)
}
// PVEClientInterface defines the interface for PVE clients (both regular and cluster).
// It abstracts the subset of the proxmox package client that the monitor uses,
// enabling test doubles via newProxmoxClientFunc.
type PVEClientInterface interface {
	// Node inventory and metrics.
	GetNodes(ctx context.Context) ([]proxmox.Node, error)
	GetNodeStatus(ctx context.Context, node string) (*proxmox.NodeStatus, error)
	GetNodeRRDData(ctx context.Context, node string, timeframe string, cf string, ds []string) ([]proxmox.NodeRRDPoint, error)
	// Guest (VM/LXC) RRD history.
	GetLXCRRDData(ctx context.Context, node string, vmid int, timeframe string, cf string, ds []string) ([]proxmox.GuestRRDPoint, error)
	GetVMRRDData(ctx context.Context, node string, vmid int, timeframe string, cf string, ds []string) ([]proxmox.GuestRRDPoint, error)
	// Guest inventory.
	GetVMs(ctx context.Context, node string) ([]proxmox.VM, error)
	GetContainers(ctx context.Context, node string) ([]proxmox.Container, error)
	// Storage, backup, and replication.
	GetStorage(ctx context.Context, node string) ([]proxmox.Storage, error)
	GetAllStorage(ctx context.Context) ([]proxmox.Storage, error)
	GetBackupTasks(ctx context.Context) ([]proxmox.Task, error)
	GetReplicationStatus(ctx context.Context) ([]proxmox.ReplicationJob, error)
	GetStorageContent(ctx context.Context, node, storage string) ([]proxmox.StorageContent, error)
	// Snapshots and per-guest status/config.
	GetVMSnapshots(ctx context.Context, node string, vmid int) ([]proxmox.Snapshot, error)
	GetContainerSnapshots(ctx context.Context, node string, vmid int) ([]proxmox.Snapshot, error)
	GetVMStatus(ctx context.Context, node string, vmid int) (*proxmox.VMStatus, error)
	GetContainerStatus(ctx context.Context, node string, vmid int) (*proxmox.Container, error)
	GetContainerConfig(ctx context.Context, node string, vmid int) (map[string]interface{}, error)
	GetContainerInterfaces(ctx context.Context, node string, vmid int) ([]proxmox.ContainerInterface, error)
	// Cluster-level queries.
	GetClusterResources(ctx context.Context, resourceType string) ([]proxmox.ClusterResource, error)
	IsClusterMember(ctx context.Context) (bool, error)
	// QEMU guest-agent queries.
	GetVMFSInfo(ctx context.Context, node string, vmid int) ([]proxmox.VMFileSystem, error)
	GetVMNetworkInterfaces(ctx context.Context, node string, vmid int) ([]proxmox.VMNetworkInterface, error)
	GetVMAgentInfo(ctx context.Context, node string, vmid int) (map[string]interface{}, error)
	GetVMAgentVersion(ctx context.Context, node string, vmid int) (string, error)
	// Disk, ZFS, update, and Ceph health queries.
	GetZFSPoolStatus(ctx context.Context, node string) ([]proxmox.ZFSPoolStatus, error)
	GetZFSPoolsWithDetails(ctx context.Context, node string) ([]proxmox.ZFSPoolInfo, error)
	GetDisks(ctx context.Context, node string) ([]proxmox.Disk, error)
	GetNodePendingUpdates(ctx context.Context, node string) ([]proxmox.AptPackage, error)
	GetCephStatus(ctx context.Context) (*proxmox.CephStatus, error)
	GetCephDF(ctx context.Context) (*proxmox.CephDF, error)
}
// ResourceStoreInterface provides methods for polling optimization and resource access.
// When an agent is monitoring a node, we can reduce API polling for that node.
type ResourceStoreInterface interface {
	// ShouldSkipAPIPolling returns true if API polling should be skipped for the hostname
	// because an agent is providing richer data.
	ShouldSkipAPIPolling(hostname string) bool
	// GetPollingRecommendations returns a map of hostname -> polling multiplier.
	// 0 = skip entirely, 0.5 = half frequency, 1 = normal.
	GetPollingRecommendations() map[string]float64
	// GetAll returns all resources in the store (for WebSocket broadcasts).
	GetAll() []unifiedresources.Resource
	// PopulateFromSnapshot updates the store with data from a StateSnapshot.
	PopulateFromSnapshot(snapshot models.StateSnapshot)
}
// SupplementalRecordStore is an optional extension for resource stores that can
// ingest source-native unified records in addition to legacy snapshots.
type SupplementalRecordStore interface {
	PopulateSupplementalRecords(source unifiedresources.DataSource, records []unifiedresources.IngestRecord)
}

// AtomicSnapshotResourceStore is an optional extension for stores that can
// atomically replace their canonical registry from a snapshot plus
// supplemental records in a single swap (avoiding readers observing a
// half-updated registry between two separate populate calls).
type AtomicSnapshotResourceStore interface {
	PopulateSnapshotAndSupplemental(snapshot models.StateSnapshot, recordsBySource map[unifiedresources.DataSource][]unifiedresources.IngestRecord)
}

// MetricsTargetResourceStore optionally resolves the history/metrics target for
// a canonical resource in the live unified store. A nil return means no
// target is known for the given resource ID.
type MetricsTargetResourceStore interface {
	MetricsTargetForResource(resourceID string) *unifiedresources.MetricsTarget
}

// UnifiedResourceFreshnessStore is an optional extension for stores that track
// their own canonical-resource freshness independent of state.LastUpdate.
type UnifiedResourceFreshnessStore interface {
	UnifiedResourceFreshness() time.Time
}

// MonitorSupplementalRecordsProvider emits source-native records outside the
// poll-provider scheduling path (for example, dedicated background pollers).
type MonitorSupplementalRecordsProvider interface {
	SupplementalRecords(m *Monitor, orgID string) []unifiedresources.IngestRecord
}

// MonitorSupplementalChangesProvider optionally emits canonical resource
// timeline changes alongside supplemental records.
type MonitorSupplementalChangesProvider interface {
	SupplementalChanges(m *Monitor, orgID string) []unifiedresources.ResourceChange
}

// MonitorSupplementalInventoryReadinessProvider optionally reports when a
// supplemental provider's current org-scoped inventory is settled enough to be
// consumed by billing and monitored-system admission boundaries.
//
// Providers that suppress snapshot-owned sources must implement this contract
// so the monitor can fail closed until the canonical store has been rebuilt
// from a settled provider baseline. The bool result indicates whether a
// settled baseline exists at all.
type MonitorSupplementalInventoryReadinessProvider interface {
	SupplementalInventoryReadyAt(m *Monitor, orgID string) (time.Time, bool)
}

// MonitorPhysicalDiskTemperatureHistoryProvider optionally exposes source-native
// physical-disk temperature history through the canonical monitoring chart
// boundary when Pulse's own stored history is shallow.
type MonitorPhysicalDiskTemperatureHistoryProvider interface {
	PhysicalDiskTemperatureHistory(m *Monitor, orgID string, duration time.Duration) map[string][]MetricPoint
}
// getNodeDisplayName picks the friendliest display label for a node.
// Preference order differs by topology: for cluster instances the per-endpoint
// label wins, then the node's own name, then the instance's friendly name;
// for standalone instances the friendly name wins, then the node name, then a
// non-IP hostname derived from the configured endpoint. When nodeName is
// blank the placeholder "unknown-node" is used as the last resort.
func getNodeDisplayName(instance *config.PVEInstance, nodeName string) string {
	name := strings.TrimSpace(nodeName)
	if name == "" {
		name = "unknown-node"
	}
	if instance == nil {
		return name
	}
	friendly := strings.TrimSpace(instance.Name)

	if instance.IsCluster {
		if label := lookupClusterEndpointLabel(instance, nodeName); label != "" {
			return label
		}
		if name != "unknown-node" {
			return name
		}
		if friendly != "" {
			return friendly
		}
		return name
	}

	if friendly != "" {
		return friendly
	}
	if name != "unknown-node" {
		return name
	}
	// Fall back to the endpoint host, but only when it looks like a hostname
	// rather than a raw IP address.
	if host := normalizeEndpointHost(instance.Host); host != "" && !isLikelyIPAddress(host) {
		return host
	}
	return name
}
// getInstanceConfig returns a copy of the configured PVE instance whose name
// matches instanceName case-insensitively, or nil when the monitor, its
// config, or a matching instance is absent. Returning a copy keeps callers
// from mutating shared configuration.
func (m *Monitor) getInstanceConfig(instanceName string) *config.PVEInstance {
	if m == nil {
		return nil
	}
	m.mu.RLock()
	defer m.mu.RUnlock()
	if m.config == nil {
		return nil
	}
	for _, inst := range m.config.PVEInstances {
		if strings.EqualFold(inst.Name, instanceName) {
			found := inst
			return &found
		}
	}
	return nil
}
// totalClientCount reports how many instances are registered across every
// poll provider, built-ins included. Nil providers are ignored.
func (m *Monitor) totalClientCount() int {
	if m == nil {
		return 0
	}
	count := 0
	for _, provider := range m.pollProviderSnapshotWithBuiltins() {
		if provider != nil {
			count += len(provider.ListInstances(m))
		}
	}
	return count
}
// getPVEClient looks up the PVE client registered under name.
// The second result reports whether the client exists. Safe on a nil monitor.
func (m *Monitor) getPVEClient(name string) (PVEClientInterface, bool) {
	if m == nil {
		return nil, false
	}
	m.mu.RLock()
	client, ok := m.pveClients[name]
	m.mu.RUnlock()
	return client, ok
}
// getPBSClient looks up the PBS client registered under name.
// The second result reports whether the client exists. Safe on a nil monitor.
func (m *Monitor) getPBSClient(name string) (*pbs.Client, bool) {
	if m == nil {
		return nil, false
	}
	m.mu.RLock()
	client, ok := m.pbsClients[name]
	m.mu.RUnlock()
	return client, ok
}
// getPMGClient looks up the PMG client registered under name.
// The second result reports whether the client exists. Safe on a nil monitor.
func (m *Monitor) getPMGClient(name string) (*pmg.Client, bool) {
	if m == nil {
		return nil, false
	}
	m.mu.RLock()
	client, ok := m.pmgClients[name]
	m.mu.RUnlock()
	return client, ok
}
// mergeNVMeTempsIntoDisks merges per-node temperature readings into the
// physical-disk list. SMART readings are the preferred source, matched per
// disk by WWN, then serial, then normalized device path; legacy NVMe
// readings are applied afterwards, paired by sorted index, and only to NVMe
// disks that still have no temperature. The input slice is never mutated:
// when any temperature data exists a modified copy is returned.
func mergeNVMeTempsIntoDisks(disks []models.PhysicalDisk, nodes []models.Node) []models.PhysicalDisk {
	if len(disks) == 0 || len(nodes) == 0 {
		return disks
	}
	// Build temperature maps by node for both SMART and legacy NVMe data
	smartTempsByNode := make(map[string][]models.DiskTemp)
	nvmeTempsByNode := make(map[string][]models.NVMeTemp)
	for _, node := range nodes {
		log.Debug().
			Str("nodeName", node.Name).
			Bool("hasTemp", node.Temperature != nil).
			Bool("tempAvailable", node.Temperature != nil && node.Temperature.Available).
			Int("smartCount", func() int {
				if node.Temperature != nil {
					return len(node.Temperature.SMART)
				}
				return 0
			}()).
			Msg("mergeNVMeTempsIntoDisks: checking node temperature")
		if node.Temperature == nil || !node.Temperature.Available {
			continue
		}
		// Collect SMART temps (preferred source)
		if len(node.Temperature.SMART) > 0 {
			temps := make([]models.DiskTemp, len(node.Temperature.SMART))
			copy(temps, node.Temperature.SMART)
			smartTempsByNode[node.Name] = temps
			log.Debug().
				Str("nodeName", node.Name).
				Int("smartTempCount", len(temps)).
				Msg("mergeNVMeTempsIntoDisks: collected SMART temps for node")
		}
		// Collect legacy NVMe temps as fallback; sorted by device name so the
		// index-based pairing with sorted disk paths below is deterministic.
		if len(node.Temperature.NVMe) > 0 {
			temps := make([]models.NVMeTemp, len(node.Temperature.NVMe))
			copy(temps, node.Temperature.NVMe)
			sort.Slice(temps, func(i, j int) bool {
				return temps[i].Device < temps[j].Device
			})
			nvmeTempsByNode[node.Name] = temps
		}
	}
	if len(smartTempsByNode) == 0 && len(nvmeTempsByNode) == 0 {
		log.Debug().
			Int("diskCount", len(disks)).
			Msg("mergeNVMeTempsIntoDisks: no SMART or NVMe temperature data available")
		return disks
	}
	log.Debug().
		Int("smartNodeCount", len(smartTempsByNode)).
		Int("nvmeNodeCount", len(nvmeTempsByNode)).
		Int("diskCount", len(disks)).
		Msg("mergeNVMeTempsIntoDisks: starting disk temperature merge")
	// Work on a copy so the caller's slice is left untouched.
	updated := make([]models.PhysicalDisk, len(disks))
	copy(updated, disks)
	// Process SMART temperatures first (preferred method)
	for i := range updated {
		smartTemps, ok := smartTempsByNode[updated[i].Node]
		log.Debug().
			Str("diskDevPath", updated[i].DevPath).
			Str("diskNode", updated[i].Node).
			Bool("hasSMARTData", ok).
			Int("smartTempCount", len(smartTemps)).
			Msg("mergeNVMeTempsIntoDisks: checking disk for SMART temp match")
		if !ok || len(smartTemps) == 0 {
			continue
		}
		// Try to match by WWN (most reliable)
		// NOTE(review): this loop and the serial loop below `continue` after a
		// match instead of breaking like the device-path loop does, so a later
		// duplicate entry would overwrite an earlier one — confirm intended.
		if updated[i].WWN != "" {
			for _, temp := range smartTemps {
				if temp.WWN != "" && strings.EqualFold(temp.WWN, updated[i].WWN) {
					// Ignore zero readings and drives skipped in standby.
					if temp.Temperature > 0 && !temp.StandbySkipped {
						updated[i].Temperature = temp.Temperature
						log.Debug().
							Str("disk", updated[i].DevPath).
							Str("wwn", updated[i].WWN).
							Int("temp", temp.Temperature).
							Msg("Matched SMART temperature by WWN")
					}
					continue
				}
			}
		}
		// Fall back to serial number match (case-insensitive)
		if updated[i].Serial != "" && updated[i].Temperature == 0 {
			for _, temp := range smartTemps {
				if temp.Serial != "" && strings.EqualFold(temp.Serial, updated[i].Serial) {
					if temp.Temperature > 0 && !temp.StandbySkipped {
						updated[i].Temperature = temp.Temperature
						log.Debug().
							Str("disk", updated[i].DevPath).
							Str("serial", updated[i].Serial).
							Int("temp", temp.Temperature).
							Msg("Matched SMART temperature by serial")
					}
					continue
				}
			}
		}
		// Last resort: match by device path (normalized, "/dev/" prefix removed)
		if updated[i].Temperature == 0 {
			normalizedDevPath := strings.TrimPrefix(updated[i].DevPath, "/dev/")
			for _, temp := range smartTemps {
				normalizedTempDev := strings.TrimPrefix(temp.Device, "/dev/")
				if normalizedTempDev == normalizedDevPath {
					if temp.Temperature > 0 && !temp.StandbySkipped {
						updated[i].Temperature = temp.Temperature
						log.Debug().
							Str("disk", updated[i].DevPath).
							Int("temp", temp.Temperature).
							Msg("Matched SMART temperature by device path")
					}
					break
				}
			}
		}
	}
	// Process legacy NVMe temperatures for disks that didn't get SMART data
	disksByNode := make(map[string][]int)
	for i := range updated {
		if strings.EqualFold(updated[i].Type, "nvme") && updated[i].Temperature == 0 {
			disksByNode[updated[i].Node] = append(disksByNode[updated[i].Node], i)
		}
	}
	for nodeName, diskIndexes := range disksByNode {
		temps, ok := nvmeTempsByNode[nodeName]
		if !ok || len(temps) == 0 {
			continue
		}
		// Sort disk paths so they pair positionally with the device-sorted temps.
		sort.Slice(diskIndexes, func(i, j int) bool {
			return updated[diskIndexes[i]].DevPath < updated[diskIndexes[j]].DevPath
		})
		for idx, diskIdx := range diskIndexes {
			if idx >= len(temps) {
				break
			}
			tempVal := temps[idx].Temp
			// Skip non-positive and NaN readings.
			if tempVal <= 0 || math.IsNaN(tempVal) {
				continue
			}
			updated[diskIdx].Temperature = int(math.Round(tempVal))
			log.Debug().
				Str("disk", updated[diskIdx].DevPath).
				Int("temp", updated[diskIdx].Temperature).
				Msg("Matched legacy NVMe temperature by index")
		}
	}
	return updated
}
// mergeHostAgentSMARTIntoDisks merges SMART temperature data from linked host agents
// into physical disks for Proxmox nodes. This allows disk temps collected by the
// pulse-agent running on a PVE node to populate the Physical Disks view.
//
// Disks are matched by WWN, then serial, then normalized device path. Entries
// reporting standby are skipped entirely. The input slice is not mutated; a
// modified copy is returned when any linked agent has SMART data.
func mergeHostAgentSMARTIntoDisks(disks []models.PhysicalDisk, nodes []models.Node, hosts []models.Host) []models.PhysicalDisk {
	if len(disks) == 0 || len(nodes) == 0 || len(hosts) == 0 {
		return disks
	}
	// Build a map of host ID to host for quick lookup
	hostByID := make(map[string]*models.Host, len(hosts))
	for i := range hosts {
		hostByID[hosts[i].ID] = &hosts[i]
	}
	// Build a map of node name to linked host's SMART data
	smartByNodeName := make(map[string][]models.HostDiskSMART)
	for _, node := range nodes {
		if node.LinkedAgentID == "" {
			continue
		}
		host, ok := hostByID[node.LinkedAgentID]
		if !ok || len(host.Sensors.SMART) == 0 {
			continue
		}
		smartByNodeName[node.Name] = host.Sensors.SMART
		log.Debug().
			Str("nodeName", node.Name).
			Str("hostAgentID", node.LinkedAgentID).
			Int("smartDiskCount", len(host.Sensors.SMART)).
			Msg("mergeHostAgentSMARTIntoDisks: found linked host agent with SMART data")
	}
	if len(smartByNodeName) == 0 {
		return disks
	}
	// Work on a copy so the caller's slice is left untouched.
	updated := make([]models.PhysicalDisk, len(disks))
	copy(updated, disks)
	for i := range updated {
		smartData, ok := smartByNodeName[updated[i].Node]
		if !ok || len(smartData) == 0 {
			continue
		}
		// Find matching SMART entry by WWN, serial, or device path
		var matched *models.HostDiskSMART
		// Try to match by WWN (most reliable)
		if updated[i].WWN != "" {
			for j := range smartData {
				if smartData[j].WWN != "" && strings.EqualFold(smartData[j].WWN, updated[i].WWN) {
					matched = &smartData[j]
					break
				}
			}
		}
		// Fall back to serial number match
		if matched == nil && updated[i].Serial != "" {
			for j := range smartData {
				if smartData[j].Serial != "" && strings.EqualFold(smartData[j].Serial, updated[i].Serial) {
					matched = &smartData[j]
					break
				}
			}
		}
		// Last resort: match by device path
		if matched == nil {
			normalizedDevPath := strings.TrimPrefix(updated[i].DevPath, "/dev/")
			for j := range smartData {
				normalizedDiskDev := strings.TrimPrefix(smartData[j].Device, "/dev/")
				if normalizedDiskDev == normalizedDevPath {
					matched = &smartData[j]
					break
				}
			}
		}
		// Drives in standby report no useful data; skip them.
		if matched == nil || matched.Standby {
			continue
		}
		// Merge temperature if not already set
		if updated[i].Temperature == 0 && matched.Temperature > 0 {
			updated[i].Temperature = matched.Temperature
			log.Debug().
				Str("device", updated[i].DevPath).
				Int("temp", matched.Temperature).
				Msg("Matched host agent SMART temperature")
		}
		// Always merge SMART attributes from host agent
		if matched.Attributes != nil {
			updated[i].SmartAttributes = smartAttributesCopy(matched.Attributes)
			// Negative wearout means "unknown"; try to derive it from the
			// agent-reported percentage-used attribute.
			if updated[i].Wearout < 0 {
				if derivedWearout := deriveWearoutFromSMARTAttributes(matched.Attributes); derivedWearout >= 0 {
					updated[i].Wearout = derivedWearout
				}
			}
		}
		// Only adopt the agent's health string when the disk has none (or "unknown").
		if (strings.TrimSpace(updated[i].Health) == "" || strings.EqualFold(updated[i].Health, "unknown")) && strings.TrimSpace(matched.Health) != "" {
			updated[i].Health = matched.Health
		}
	}
	return updated
}
// deriveWearoutFromSMARTAttributes converts the NVMe "percentage used"
// attribute into remaining life: 100 - used, with used clamped to [0, 100].
// Returns -1 when the attribute (or attrs itself) is unavailable.
func deriveWearoutFromSMARTAttributes(attrs *models.SMARTAttributes) int {
	if attrs == nil || attrs.PercentageUsed == nil {
		return -1
	}
	used := *attrs.PercentageUsed
	switch {
	case used < 0:
		used = 0
	case used > 100:
		used = 100
	}
	return 100 - used
}
// physicalDiskFromReadStateView flattens a unified-store physical-disk view
// into the models.PhysicalDisk shape used by legacy state snapshots.
// A nil view yields the zero value.
func physicalDiskFromReadStateView(view *unifiedresources.PhysicalDiskView) models.PhysicalDisk {
	if view == nil {
		return models.PhysicalDisk{}
	}
	return models.PhysicalDisk{
		ID:              view.ID(),
		Node:            view.Node(),
		Instance:        view.Instance(),
		DevPath:         view.DevPath(),
		Model:           view.Model(),
		Serial:          view.Serial(),
		WWN:             view.WWN(),
		Type:            view.DiskType(),
		Size:            view.SizeBytes(),
		Health:          view.Health(),
		Wearout:         view.Wearout(),
		Temperature:     view.Temperature(),
		RPM:             view.RPM(),
		Used:            view.Used(),
		SmartAttributes: smartAttributesFromUnifiedMeta(view.SMART()),
		LastChecked:     view.LastSeen(),
	}
}
// smartAttributesFromUnifiedMeta converts unified-store SMART metadata into
// the models representation. Zero-valued counters are treated as "not
// reported" and left nil; when nothing at all was reported the function
// returns nil so callers can distinguish "no SMART data" from "all zeros".
func smartAttributesFromUnifiedMeta(in *unifiedresources.SMARTMeta) *models.SMARTAttributes {
	if in == nil {
		return nil
	}
	out := &models.SMARTAttributes{}
	populated := false
	if in.PowerOnHours != 0 {
		v := in.PowerOnHours
		out.PowerOnHours = &v
		populated = true
	}
	if in.PowerCycles != 0 {
		v := in.PowerCycles
		out.PowerCycles = &v
		populated = true
	}
	if in.ReallocatedSectors != 0 {
		v := in.ReallocatedSectors
		out.ReallocatedSectors = &v
		populated = true
	}
	if in.PendingSectors != 0 {
		v := in.PendingSectors
		out.PendingSectors = &v
		populated = true
	}
	if in.OfflineUncorrectable != 0 {
		v := in.OfflineUncorrectable
		out.OfflineUncorrectable = &v
		populated = true
	}
	if in.UDMACRCErrors != 0 {
		v := in.UDMACRCErrors
		out.UDMACRCErrors = &v
		populated = true
	}
	if in.PercentageUsed != 0 {
		v := in.PercentageUsed
		out.PercentageUsed = &v
		populated = true
	}
	if in.AvailableSpare != 0 {
		v := in.AvailableSpare
		out.AvailableSpare = &v
		populated = true
	}
	if in.MediaErrors != 0 {
		v := in.MediaErrors
		out.MediaErrors = &v
		populated = true
	}
	if in.UnsafeShutdowns != 0 {
		v := in.UnsafeShutdowns
		out.UnsafeShutdowns = &v
		populated = true
	}
	if !populated {
		return nil
	}
	return out
}
// physicalDisksForInstanceFromReadState collects every physical disk in the
// unified read state belonging to the given instance, converted to the
// models representation. A nil read state yields nil; no matches yield an
// empty (non-nil) slice.
func physicalDisksForInstanceFromReadState(readState unifiedresources.ReadState, instance string) []models.PhysicalDisk {
	if readState == nil {
		return nil
	}
	views := readState.PhysicalDisks()
	out := make([]models.PhysicalDisk, 0, len(views))
	for _, view := range views {
		if view == nil || view.Instance() != instance {
			continue
		}
		out = append(out, physicalDiskFromReadStateView(view))
	}
	return out
}
// nodesForInstanceFromReadState collects every node in the unified read
// state belonging to the given instance, converted to the models
// representation. A nil read state yields nil; no matches yield an empty
// (non-nil) slice.
func nodesForInstanceFromReadState(readState unifiedresources.ReadState, instance string) []models.Node {
	if readState == nil {
		return nil
	}
	views := readState.Nodes()
	out := make([]models.Node, 0, len(views))
	for _, view := range views {
		if view == nil || view.Instance() != instance {
			continue
		}
		out = append(out, nodeFromReadStateView(view))
	}
	return out
}
// hostsFromReadState converts every non-nil host view in the unified read
// state into its models representation. A nil read state yields nil; an
// empty store yields an empty (non-nil) slice.
func hostsFromReadState(readState unifiedresources.ReadState) []models.Host {
	if readState == nil {
		return nil
	}
	views := readState.Hosts()
	out := make([]models.Host, 0, len(views))
	for _, view := range views {
		if view != nil {
			out = append(out, hostFromReadStateView(view))
		}
	}
	return out
}
// writeSMARTMetrics writes SMART temperature history to the in-memory chart
// buffer and persists SMART attributes when the metrics store is enabled.
// All writes are suppressed while native mock-state metric writes are
// disabled (see shouldSkipNativeMockStateMetricWrites), and silently skipped
// when the disk cannot be mapped to a metric resource ID.
func (m *Monitor) writeSMARTMetrics(disk models.PhysicalDisk, now time.Time) {
	if shouldSkipNativeMockStateMetricWrites() {
		return
	}
	// The series key is derived from the disk's identity; without it there is
	// nowhere to attach the metrics.
	resourceID := unifiedresources.PhysicalDiskMetricID(disk)
	if resourceID == "" {
		return
	}
	// Temperature (always write if > 0)
	if disk.Temperature > 0 {
		if m.metricsHistory != nil {
			m.metricsHistory.AddDiskMetric(resourceID, "smart_temp", float64(disk.Temperature), now)
		}
		if m.metricsStore != nil {
			m.metricsStore.Write("disk", resourceID, "smart_temp", float64(disk.Temperature), now)
		}
	}
	// Attribute persistence needs both the attributes and the durable store.
	attrs := disk.SmartAttributes
	if attrs == nil || m.metricsStore == nil {
		return
	}
	// Common
	if attrs.PowerOnHours != nil {
		m.metricsStore.Write("disk", resourceID, "smart_power_on_hours", float64(*attrs.PowerOnHours), now)
	}
	if attrs.PowerCycles != nil {
		m.metricsStore.Write("disk", resourceID, "smart_power_cycles", float64(*attrs.PowerCycles), now)
	}
	// SATA-specific
	if attrs.ReallocatedSectors != nil {
		m.metricsStore.Write("disk", resourceID, "smart_reallocated_sectors", float64(*attrs.ReallocatedSectors), now)
	}
	if attrs.PendingSectors != nil {
		m.metricsStore.Write("disk", resourceID, "smart_pending_sectors", float64(*attrs.PendingSectors), now)
	}
	if attrs.OfflineUncorrectable != nil {
		m.metricsStore.Write("disk", resourceID, "smart_offline_uncorrectable", float64(*attrs.OfflineUncorrectable), now)
	}
	if attrs.UDMACRCErrors != nil {
		m.metricsStore.Write("disk", resourceID, "smart_crc_errors", float64(*attrs.UDMACRCErrors), now)
	}
	// NVMe-specific
	if attrs.PercentageUsed != nil {
		m.metricsStore.Write("disk", resourceID, "smart_percentage_used", float64(*attrs.PercentageUsed), now)
	}
	if attrs.AvailableSpare != nil {
		m.metricsStore.Write("disk", resourceID, "smart_available_spare", float64(*attrs.AvailableSpare), now)
	}
	if attrs.MediaErrors != nil {
		m.metricsStore.Write("disk", resourceID, "smart_media_errors", float64(*attrs.MediaErrors), now)
	}
	if attrs.UnsafeShutdowns != nil {
		m.metricsStore.Write("disk", resourceID, "smart_unsafe_shutdowns", float64(*attrs.UnsafeShutdowns), now)
	}
}
// PollExecutor defines the contract for executing polling tasks.
type PollExecutor interface {
	Execute(ctx context.Context, task PollTask)
}

// realExecutor is the production PollExecutor; it dispatches tasks to the
// owning Monitor's per-instance-type poll routines.
type realExecutor struct {
	monitor *Monitor
}

// newRealExecutor wraps m in a realExecutor.
func newRealExecutor(m *Monitor) PollExecutor {
	return &realExecutor{monitor: m}
}
// Execute dispatches a poll task. Tasks carrying an explicit Run callback are
// invoked directly; otherwise the (case-insensitive) instance type selects the
// PVE, PBS, or PMG polling path. Tasks missing the required client are logged
// and dropped; unknown types are debug-logged.
func (r *realExecutor) Execute(ctx context.Context, task PollTask) {
	if r == nil || r.monitor == nil {
		return
	}
	if task.Run != nil {
		task.Run(ctx)
		return
	}
	warnNilClient := func(msg string) {
		log.Warn().
			Str("instance", task.InstanceName).
			Msg(msg)
	}
	switch strings.ToLower(task.InstanceType) {
	case "pve":
		if task.PVEClient == nil {
			warnNilClient("PollExecutor received nil PVE client")
			return
		}
		r.monitor.pollPVEInstance(ctx, task.InstanceName, task.PVEClient)
	case "pbs":
		if task.PBSClient == nil {
			warnNilClient("PollExecutor received nil PBS client")
			return
		}
		r.monitor.pollPBSInstance(ctx, task.InstanceName, task.PBSClient)
	case "pmg":
		if task.PMGClient == nil {
			warnNilClient("PollExecutor received nil PMG client")
			return
		}
		r.monitor.pollPMGInstance(ctx, task.InstanceName, task.PMGClient)
	default:
		if logging.IsLevelEnabled(zerolog.DebugLevel) {
			log.Debug().
				Str("instance", task.InstanceName).
				Str("type", task.InstanceType).
				Msg("PollExecutor received unsupported task type")
		}
	}
}
// instanceInfo is cached descriptive metadata for a monitored instance.
type instanceInfo struct {
	Key         string
	Type        InstanceType
	DisplayName string
	Connection  string
	Metadata    map[string]string
}

// pollStatus tracks per-instance success/failure bookkeeping for polling.
type pollStatus struct {
	LastSuccess         time.Time
	LastErrorAt         time.Time
	LastErrorMessage    string
	LastErrorCategory   string
	ConsecutiveFailures int
	FirstFailureAt      time.Time
}

// dlqInsight records why and when an instance entered the dead-letter queue,
// plus its retry bookkeeping.
type dlqInsight struct {
	Reason       string
	FirstAttempt time.Time
	LastAttempt  time.Time
	RetryCount   int
	NextRetry    time.Time
}
// ErrorDetail captures a single polling error: when it occurred, the message,
// and its classified category.
type ErrorDetail struct {
	At       time.Time `json:"at"`
	Message  string    `json:"message"`
	Category string    `json:"category"`
}

// InstancePollStatus is the JSON-facing summary of poll outcomes for one instance.
type InstancePollStatus struct {
	LastSuccess         *time.Time   `json:"lastSuccess,omitempty"`
	LastError           *ErrorDetail `json:"lastError,omitempty"`
	ConsecutiveFailures int          `json:"consecutiveFailures"`
	FirstFailureAt      *time.Time   `json:"firstFailureAt,omitempty"`
}

// InstanceBreaker describes the circuit-breaker state for one instance.
type InstanceBreaker struct {
	State          string     `json:"state"`
	Since          *time.Time `json:"since,omitempty"`
	LastTransition *time.Time `json:"lastTransition,omitempty"`
	RetryAt        *time.Time `json:"retryAt,omitempty"`
	FailureCount   int        `json:"failureCount"`
}

// InstanceDLQ reports whether an instance currently sits in the dead-letter
// queue and, when present, its retry bookkeeping.
type InstanceDLQ struct {
	Present      bool       `json:"present"`
	Reason       string     `json:"reason,omitempty"`
	FirstAttempt *time.Time `json:"firstAttempt,omitempty"`
	LastAttempt  *time.Time `json:"lastAttempt,omitempty"`
	RetryCount   int        `json:"retryCount,omitempty"`
	NextRetry    *time.Time `json:"nextRetry,omitempty"`
}

// InstanceHealth is the aggregate health report for a single monitored
// instance: poll status, breaker state, DLQ membership, and warnings.
type InstanceHealth struct {
	Key         string             `json:"key"`
	Type        string             `json:"type"`
	DisplayName string             `json:"displayName"`
	Instance    string             `json:"instance"`
	Connection  string             `json:"connection"`
	PollStatus  InstancePollStatus `json:"pollStatus"`
	Breaker     InstanceBreaker    `json:"breaker"`
	DeadLetter  InstanceDLQ        `json:"deadLetter"`
	Warnings    []string           `json:"warnings"`
}

// NormalizeCollections ensures slice-valued fields are non-nil so JSON
// encoding emits [] rather than null. It operates on a copy of the receiver.
func (h InstanceHealth) NormalizeCollections() InstanceHealth {
	if h.Warnings == nil {
		h.Warnings = make([]string, 0)
	}
	return h
}
// Monitor handles all monitoring operations. Its client maps and config are
// guarded by mu; several auxiliary caches carry their own dedicated mutexes
// (diagMu, rrdCacheMu, guestMetadataMu, agentProfileCacheMu, clusterSensorsMu,
// mockChartCacheMu).
type Monitor struct {
	config                     *config.Config // active configuration; read under mu (see getInstanceConfig)
	state                      *models.State
	orgID                      string                        // Organization ID for tenant isolation (empty = default/legacy)
	pveClients                 map[string]PVEClientInterface // keyed by instance name; guarded by mu (see getPVEClient)
	pbsClients                 map[string]*pbs.Client        // keyed by instance name; guarded by mu (see getPBSClient)
	pmgClients                 map[string]*pmg.Client        // keyed by instance name; guarded by mu (see getPMGClient)
	pollProviders              map[InstanceType]PollProvider
	pollMetrics                *PollMetrics
	scheduler                  *AdaptiveScheduler
	stalenessTracker           *StalenessTracker
	taskQueue                  *TaskQueue
	pollTimeout                time.Duration
	circuitBreakers            map[string]*circuitBreaker
	deadLetterQueue            *TaskQueue
	failureCounts              map[string]int
	lastOutcome                map[string]taskOutcome
	backoffCfg                 backoffConfig
	rng                        *rand.Rand
	maxRetryAttempts           int
	tempCollector              *TemperatureCollector // SSH-based temperature collector
	guestMetadataStore         *config.GuestMetadataStore
	dockerMetadataStore        *config.DockerMetadataStore
	hostMetadataStore          *config.HostMetadataStore
	mu                         sync.RWMutex // Guards config and the PVE/PBS/PMG client maps
	startTime                  time.Time
	rateTracker                *RateTracker
	metricsHistory             *MetricsHistory
	metricsStore               *metrics.Store // Persistent SQLite metrics storage
	alertManager               *alerts.Manager
	alertResolvedAICallback    func(*alerts.Alert)
	alertTriggeredAICallback   func(*alerts.Alert)
	incidentStore              *memory.IncidentStore
	notificationMgr            *notifications.NotificationManager
	configPersist              *config.ConfigPersistence
	discoveryService           *discovery.Service // Background discovery service
	activePollCount            int32              // Number of active polling operations
	pollCounter                int64              // Counter for polling cycles
	authFailures               map[string]int       // Track consecutive auth failures per node
	lastAuthAttempt            map[string]time.Time // Track last auth attempt time
	lastClusterCheck           map[string]time.Time // Track last cluster check for standalone nodes
	lastPhysicalDiskPoll       map[string]time.Time // Track last physical disk poll time per instance
	lastPVEBackupPoll          map[string]time.Time // Track last PVE backup poll per instance
	lastPBSBackupPoll          map[string]time.Time // Track last PBS backup poll per instance
	backupPermissionWarnings   map[string]string    // Track backup permission issues per instance (instance -> warning message)
	persistence                *config.ConfigPersistence // Add persistence for saving updated configs
	pbsBackupPollers           map[string]bool           // Track PBS backup polling goroutines per instance
	pbsBackupCacheTime         map[string]map[pbsBackupGroupKey]time.Time // Track when each PBS backup group was last fetched
	runtimeCtx                 context.Context // Context used while monitor is running
	wsHub                      *websocket.Hub  // Hub used for broadcasting state
	diagMu                     sync.RWMutex    // Protects diagnostic snapshot maps
	nodeSnapshots              map[string]NodeMemorySnapshot
	guestSnapshots             map[string]GuestMemorySnapshot
	rrdCacheMu                 sync.RWMutex // Protects short-lived guest memory caches.
	nodeRRDMemCache            map[string]rrdMemCacheEntry
	vmRRDMemCache              map[string]rrdMemCacheEntry
	vmAgentMemCache            map[string]agentMemCacheEntry
	removedDockerHosts         map[string]time.Time // Track deliberately removed Docker hosts (ID -> removal time)
	dockerTokenBindings        map[string]string    // Track token ID -> agent ID bindings to enforce uniqueness
	removedKubernetesClusters  map[string]time.Time // Track deliberately removed Kubernetes clusters (ID -> removal time)
	kubernetesTokenBindings    map[string]string    // Track token ID -> agent ID bindings to enforce uniqueness
	removedHostAgents          map[string]time.Time // Track deliberately removed host agents (ID -> removal time)
	hostTokenBindings          map[string]string    // Track tokenID:hostname -> host identity bindings
	dockerCommands             map[string]*dockerHostCommand
	dockerCommandIndex         map[string]string
	guestMetadataMu            sync.RWMutex // Guards guestMetadataCache
	guestMetadataCache         map[string]guestMetadataCacheEntry
	guestMetadataLimiterMu     sync.Mutex // Guards guestMetadataLimiter
	guestMetadataLimiter       map[string]time.Time
	guestMetadataSlots         chan struct{}
	guestMetadataMinRefresh    time.Duration
	guestMetadataRefreshJitter time.Duration
	guestMetadataRetryBackoff  time.Duration
	guestMetadataHoldDuration  time.Duration
	// Configurable guest agent timeouts (refs #592)
	guestAgentFSInfoTimeout  time.Duration
	guestAgentNetworkTimeout time.Duration
	guestAgentOSInfoTimeout  time.Duration
	guestAgentVersionTimeout time.Duration
	guestAgentRetries        int
	executor                 PollExecutor
	breakerBaseRetry         time.Duration
	breakerMaxDelay          time.Duration
	breakerHalfOpenWindow    time.Duration
	instanceInfoCache        map[string]*instanceInfo
	pollStatusMap            map[string]*pollStatus
	dlqInsightMap            map[string]*dlqInsight
	nodeLastOnline           map[string]time.Time       // Track last time each node was seen online (for grace period)
	nodePendingUpdatesCache  map[string]pendingUpdatesCache // Cache pending updates per node (checked every 30 min)
	resourceStore            ResourceStoreInterface         // Optional unified resource store for polling optimization
	supplementalProviders    map[unifiedresources.DataSource]MonitorSupplementalRecordsProvider
	recoveryManager          *recoverymanager.Manager // Optional recovery store manager for backup rollups
	mockMetricsCancel        context.CancelFunc
	mockMetricsWg            sync.WaitGroup
	dockerChecker            DockerChecker // Optional Docker checker for LXC containers
	// Agent profile cache to avoid disk I/O on every report (refs #1094)
	agentProfileCacheMu sync.RWMutex
	agentProfileCache   *agentProfileCacheEntry
	// Cluster sensor cache: temperature data collected by an agent on one Proxmox
	// cluster node via SSH to its siblings. Keyed by lowercase node name.
	clusterSensorsMu    sync.RWMutex
	clusterSensorsCache map[string]clusterSensorsCacheEntry
	mockChartCacheMu    sync.RWMutex
	mockChartMapCache   map[mockChartMetricMapCacheKey]map[string][]MetricPoint
}
// clusterSensorsCacheEntry stores temperature data collected by a sibling agent via SSH.
type clusterSensorsCacheEntry struct {
	sensors   models.HostSensorSummary
	updatedAt time.Time // when the entry was stored
}
// rrdMemCacheEntry is a short-lived cached memory/network sample
// (used by the node/VM RRD caches guarded by rrdCacheMu).
type rrdMemCacheEntry struct {
	available uint64
	used      uint64
	total     uint64
	netIn     float64
	netOut    float64
	hasNetIn  bool // whether netIn carried a real value
	hasNetOut bool // whether netOut carried a real value
	fetchedAt time.Time
}
// pendingUpdatesCache caches apt pending updates count per node
type pendingUpdatesCache struct {
	count     int       // pending package count at last check
	checkedAt time.Time // when count was last refreshed
}

// TTL for pending updates cache (30 minutes - balance between freshness and API load)
const pendingUpdatesCacheTTL = 30 * time.Minute
// agentProfileCacheEntry caches agent profiles and assignments to avoid disk I/O on every agent report.
// TTL is 60 seconds to balance freshness with performance.
type agentProfileCacheEntry struct {
	// profiles holds the loaded agent profiles.
	profiles []models.AgentProfile
	// assignments maps agents to profiles as loaded from disk.
	assignments []models.AgentProfileAssignment
	// loadedAt is when this snapshot was read; staleness is judged against
	// agentProfileCacheTTL.
	loadedAt time.Time
}

const agentProfileCacheTTL = 60 * time.Second
// shouldRunBackupPoll determines whether a backup polling cycle should execute.
// Returns whether polling should run, a human-readable skip reason, and the timestamp to record.
func (m *Monitor) shouldRunBackupPoll(last time.Time, now time.Time) (bool, string, time.Time) {
	// Without configuration there is nothing sensible to decide.
	if m == nil || m.config == nil {
		return false, "configuration unavailable", last
	}
	if !m.config.EnableBackupPolling {
		return false, "backup polling globally disabled", last
	}
	// Interval-based scheduling takes precedence when an interval is set.
	if every := m.config.BackupPollingInterval; every > 0 {
		if !last.IsZero() && now.Before(last.Add(every)) {
			due := last.Add(every)
			return false, fmt.Sprintf("next run scheduled for %s", due.Format(time.RFC3339)), last
		}
		return true, "", now
	}
	// Otherwise fall back to cycle-count scheduling (default: every 10 cycles).
	// NOTE: pollCounter is read without m.mu here; presumably slight staleness
	// is acceptable to callers — verify if exact cadence matters.
	stride := int64(m.config.BackupPollingCycles)
	if stride <= 0 {
		stride = 10
	}
	if m.pollCounter == 1 || m.pollCounter%stride == 0 {
		return true, "", now
	}
	remaining := stride - m.pollCounter%stride
	return false, fmt.Sprintf("next run in %d polling cycles", remaining), last
}
// Connection-status key prefixes plus offline-detection tuning for
// agent-reported resources and Proxmox nodes.
const (
	// Prefixes used when keying connection/command state by resource ID.
	dockerConnectionPrefix     = "docker-"
	kubernetesConnectionPrefix = "kubernetes-"
	hostConnectionPrefix       = "host-"
	// Grace multipliers and min/max health windows — presumably scaled
	// against each agent's report interval before a resource is marked
	// offline; verify at the evaluate*Agents call sites.
	dockerOfflineGraceMultiplier     = 4
	dockerMinimumHealthWindow        = 30 * time.Second
	dockerMaximumHealthWindow        = 10 * time.Minute
	kubernetesOfflineGraceMultiplier = 4
	kubernetesMinimumHealthWindow    = 30 * time.Second
	kubernetesMaximumHealthWindow    = 10 * time.Minute
	hostOfflineGraceMultiplier       = 6
	hostMinimumHealthWindow          = 60 * time.Second
	hostMaximumHealthWindow          = 10 * time.Minute
	nodeOfflineGracePeriod           = 60 * time.Second // Grace period before marking Proxmox nodes offline
	// Node/VM RRD lookups are cached for nodeRRDCacheTTL and each live
	// request is time-boxed to nodeRRDRequestTimeout.
	nodeRRDCacheTTL       = 30 * time.Second
	nodeRRDRequestTimeout = 2 * time.Second
)
// taskOutcome records the result of the most recent execution of a scheduled
// polling task; rescheduleTask consults it to decide between normal
// rescheduling, backoff retry, and dead-lettering.
type taskOutcome struct {
	// success reports whether the last attempt completed without error.
	success bool
	// transient marks a failure as retryable (permanent failures are
	// dead-lettered immediately).
	transient bool
	// err is the error from the last attempt, if any.
	err error
	// recordedAt is when this outcome was captured.
	recordedAt time.Time
}
// getNodeRRDMetrics fetches node-level RRD memory and network series for a
// single node, caching the result for nodeRRDCacheTTL so repeated polls do
// not hammer the Proxmox API.
//
// Points are walked newest-first and the most recent non-NaN, positive sample
// is kept per memory series (network samples only need to be non-NaN). An
// error is returned when no usable sample exists at all.
func (m *Monitor) getNodeRRDMetrics(ctx context.Context, client PVEClientInterface, nodeName string) (rrdMemCacheEntry, error) {
	if client == nil || nodeName == "" {
		return rrdMemCacheEntry{}, fmt.Errorf("invalid arguments for RRD lookup")
	}
	now := time.Now()
	// Fast path: serve a fresh cached entry.
	m.rrdCacheMu.RLock()
	if entry, ok := m.nodeRRDMemCache[nodeName]; ok && now.Sub(entry.fetchedAt) < nodeRRDCacheTTL {
		m.rrdCacheMu.RUnlock()
		return entry, nil
	}
	m.rrdCacheMu.RUnlock()
	// Live fetch, time-boxed so a slow node cannot stall the poll cycle.
	requestCtx, cancel := context.WithTimeout(ctx, nodeRRDRequestTimeout)
	defer cancel()
	points, err := client.GetNodeRRDData(requestCtx, nodeName, "hour", "AVERAGE", []string{"memavailable", "memused", "memtotal", "netin", "netout"})
	if err != nil {
		return rrdMemCacheEntry{}, err
	}
	var memAvailable uint64
	var memUsed uint64
	var memTotal uint64
	var netIn float64
	var netOut float64
	var hasNetIn bool
	var hasNetOut bool
	for i := len(points) - 1; i >= 0; i-- {
		point := points[i]
		if memTotal == 0 && point.MemTotal != nil && !math.IsNaN(*point.MemTotal) && *point.MemTotal > 0 {
			memTotal = uint64(math.Round(*point.MemTotal))
		}
		if memAvailable == 0 && point.MemAvailable != nil && !math.IsNaN(*point.MemAvailable) && *point.MemAvailable > 0 {
			memAvailable = uint64(math.Round(*point.MemAvailable))
		}
		if memUsed == 0 && point.MemUsed != nil && !math.IsNaN(*point.MemUsed) && *point.MemUsed > 0 {
			memUsed = uint64(math.Round(*point.MemUsed))
		}
		if !hasNetIn && point.NetIn != nil && !math.IsNaN(*point.NetIn) {
			netIn = *point.NetIn
			hasNetIn = true
		}
		if !hasNetOut && point.NetOut != nil && !math.IsNaN(*point.NetOut) {
			netOut = *point.NetOut
			hasNetOut = true
		}
	}
	// Clamp memory samples so available/used never exceed the reported total.
	if memTotal > 0 {
		if memAvailable > memTotal {
			memAvailable = memTotal
		}
		if memUsed > memTotal {
			memUsed = memTotal
		}
	}
	if memAvailable == 0 && memUsed == 0 && !hasNetIn && !hasNetOut {
		return rrdMemCacheEntry{}, fmt.Errorf("rrd node metrics not present")
	}
	entry := rrdMemCacheEntry{
		available: memAvailable,
		used:      memUsed,
		total:     memTotal,
		netIn:     netIn,
		netOut:    netOut,
		hasNetIn:  hasNetIn,
		hasNetOut: hasNetOut,
		fetchedAt: now,
	}
	m.rrdCacheMu.Lock()
	// Defensive init mirrors getVMRRDMetrics; New() normally allocates this
	// map, but a zero-value Monitor would otherwise panic on write.
	if m.nodeRRDMemCache == nil {
		m.nodeRRDMemCache = make(map[string]rrdMemCacheEntry)
	}
	m.nodeRRDMemCache[nodeName] = entry
	m.rrdCacheMu.Unlock()
	return entry, nil
}
// getVMRRDMetrics fetches Proxmox RRD memavailable for a single VM with a
// short-lived cache to avoid a live API call on every poll for VMs that
// consistently lack guest-agent memory data (e.g. Windows VMs).
func (m *Monitor) getVMRRDMetrics(ctx context.Context, client PVEClientInterface, instanceName, node string, vmid int) (uint64, error) {
	if client == nil || node == "" || vmid <= 0 {
		return 0, fmt.Errorf("invalid arguments for VM RRD lookup")
	}
	key := guestMemoryCacheKey(instanceName, node, vmid)
	now := time.Now()
	// Serve from the short-lived cache when the entry is still fresh.
	m.rrdCacheMu.RLock()
	cached, hit := m.vmRRDMemCache[key]
	m.rrdCacheMu.RUnlock()
	if hit && now.Sub(cached.fetchedAt) < nodeRRDCacheTTL {
		return cached.available, nil
	}
	// Live fetch, time-boxed to keep slow nodes from stalling the poll.
	requestCtx, cancel := context.WithTimeout(ctx, nodeRRDRequestTimeout)
	defer cancel()
	points, err := client.GetVMRRDData(requestCtx, node, vmid, "hour", "AVERAGE", []string{"memavailable"})
	if err != nil {
		return 0, err
	}
	if len(points) == 0 {
		return 0, fmt.Errorf("no RRD points for VM %s/%d", node, vmid)
	}
	// Walk newest-first and take the most recent usable sample.
	var memAvailable uint64
	for i := len(points) - 1; i >= 0; i-- {
		if v := points[i].MemAvailable; v != nil && !math.IsNaN(*v) && *v > 0 {
			memAvailable = uint64(math.Round(*v))
			break
		}
	}
	if memAvailable == 0 {
		return 0, fmt.Errorf("rrd memavailable not present for VM %s/%d", node, vmid)
	}
	m.rrdCacheMu.Lock()
	if m.vmRRDMemCache == nil {
		m.vmRRDMemCache = make(map[string]rrdMemCacheEntry)
	}
	m.vmRRDMemCache[key] = rrdMemCacheEntry{available: memAvailable, fetchedAt: now}
	m.rrdCacheMu.Unlock()
	return memAvailable, nil
}
// GetConnectionStatuses reports connectivity per monitored connection, keyed
// by a type-prefixed identifier ("pve-<name>", "pbs-<name>", "pmg-<name>",
// "docker-<id>") and, where available, by host address. In mock mode the
// result is derived from the current fixture graph instead of live providers.
func (m *Monitor) GetConnectionStatuses() map[string]bool {
	if m == nil {
		return map[string]bool{}
	}
	if mock.IsMockEnabled() {
		statuses := make(map[string]bool)
		state := mock.CurrentFixtureGraph().State
		// PVE nodes are connected only when explicitly "online".
		for _, node := range state.Nodes {
			key := "pve-" + node.Name
			statuses[key] = strings.ToLower(node.Status) == "online"
			if node.Host != "" {
				statuses[node.Host] = strings.ToLower(node.Status) == "online"
			}
		}
		// PBS/PMG fixtures count as connected unless explicitly "offline".
		for _, pbsInst := range state.PBSInstances {
			key := "pbs-" + pbsInst.Name
			statuses[key] = strings.ToLower(pbsInst.Status) != "offline"
			if pbsInst.Host != "" {
				statuses[pbsInst.Host] = strings.ToLower(pbsInst.Status) != "offline"
			}
		}
		for _, pmgInst := range state.PMGInstances {
			key := "pmg-" + pmgInst.Name
			statuses[key] = strings.ToLower(pmgInst.Status) != "offline"
			if pmgInst.Host != "" {
				statuses[pmgInst.Host] = strings.ToLower(pmgInst.Status) != "offline"
			}
		}
		for _, dockerHost := range state.DockerHosts {
			key := dockerConnectionPrefix + dockerHost.ID
			statuses[key] = strings.ToLower(dockerHost.Status) == "online"
		}
		return statuses
	}
	// Live mode: merge statuses from every registered poll provider,
	// dropping blank keys.
	statuses := make(map[string]bool)
	for _, provider := range m.pollProviderSnapshotWithBuiltins() {
		for key, connected := range m.providerConnectionStatuses(provider) {
			if strings.TrimSpace(key) == "" {
				continue
			}
			statuses[key] = connected
		}
	}
	return statuses
}
// checkContainerizedTempMonitoring logs a security warning if Pulse is running
// in a container with SSH-based temperature monitoring enabled.
//
// Detection is heuristic: running-in-container is taken from PULSE_DOCKER or
// system.InContainer(), and "temperature monitoring configured" is inferred
// from the presence of an SSH private key under the home directory.
func checkContainerizedTempMonitoring() {
	// Only relevant when running inside a container.
	isContainer := os.Getenv("PULSE_DOCKER") == "true" || system.InContainer()
	if !isContainer {
		return
	}
	homeDir := os.Getenv("HOME")
	if homeDir == "" {
		homeDir = "/home/pulse"
	}
	// Build the path with filepath.Join for consistency with New(), which
	// constructs its sensors key path the same way.
	sshKeyPath := filepath.Join(homeDir, ".ssh", "id_ed25519")
	if _, err := os.Stat(sshKeyPath); err != nil {
		// No SSH key found, temperature monitoring not configured
		return
	}
	// Log warning
	log.Warn().
		Msg("SECURITY NOTICE: Pulse is running in a container with SSH-based temperature monitoring enabled. " +
			"SSH private keys are stored inside the container, which could be a security risk if the container is compromised. " +
			"Future versions will use agent-based architecture for better security. " +
			"See documentation for hardening recommendations.")
}
// New creates a new Monitor instance
//
// It wires together every subsystem the monitor owns: temperature collector,
// staleness tracker, task and dead-letter queues, circuit-breaker state,
// retry backoff, the persistent SQLite metrics store, alert/notification
// managers, and the many per-instance caches. PVE/PBS/PMG client
// initialization is skipped in mock mode unless real polling is explicitly
// kept enabled. Returns an error only when cfg is nil or never here after
// validation — store/config load failures are logged and degraded gracefully.
func New(cfg *config.Config) (*Monitor, error) {
	if cfg == nil {
		return nil, fmt.Errorf("config cannot be nil")
	}
	// Initialize temperature collector with sensors SSH key
	// Will use root user for now - can be made configurable later
	homeDir := os.Getenv("HOME")
	if homeDir == "" {
		homeDir = "/home/pulse"
	}
	sshKeyPath := filepath.Join(homeDir, ".ssh/id_ed25519_sensors")
	tempCollector := NewTemperatureCollectorWithPort("root", sshKeyPath, cfg.SSHPort)
	// Security warning if running in container with SSH temperature monitoring
	checkContainerizedTempMonitoring()
	stalenessTracker := NewStalenessTracker(getPollMetrics())
	stalenessTracker.SetBounds(cfg.AdaptivePollingBaseInterval, cfg.AdaptivePollingMaxInterval)
	taskQueue := NewTaskQueue()
	deadLetterQueue := NewTaskQueue()
	breakers := make(map[string]*circuitBreaker)
	failureCounts := make(map[string]int)
	lastOutcome := make(map[string]taskOutcome)
	backoff := backoffConfig{
		Initial:    5 * time.Second,
		Multiplier: 2,
		Jitter:     0.2,
		Max:        5 * time.Minute,
	}
	// Tighten retry backoff when adaptive polling runs at a short cadence.
	if cfg.AdaptivePollingEnabled && cfg.AdaptivePollingMaxInterval > 0 && cfg.AdaptivePollingMaxInterval <= 15*time.Second {
		backoff.Initial = 750 * time.Millisecond
		backoff.Max = 6 * time.Second
	}
	var scheduler *AdaptiveScheduler
	if cfg.AdaptivePollingEnabled {
		scheduler = NewAdaptiveScheduler(SchedulerConfig{
			BaseInterval: cfg.AdaptivePollingBaseInterval,
			MinInterval:  cfg.AdaptivePollingMinInterval,
			MaxInterval:  cfg.AdaptivePollingMaxInterval,
		}, stalenessTracker, nil, nil)
	}
	// Guest metadata refresh tuning, with fallbacks to package defaults.
	minRefresh := cfg.GuestMetadataMinRefreshInterval
	if minRefresh <= 0 {
		minRefresh = config.DefaultGuestMetadataMinRefresh
	}
	jitter := cfg.GuestMetadataRefreshJitter
	if jitter < 0 {
		jitter = 0
	}
	retryBackoff := cfg.GuestMetadataRetryBackoff
	if retryBackoff <= 0 {
		retryBackoff = config.DefaultGuestMetadataRetryBackoff
	}
	concurrency := cfg.GuestMetadataMaxConcurrent
	if concurrency <= 0 {
		concurrency = config.DefaultGuestMetadataMaxConcurrent
	}
	holdDuration := defaultGuestMetadataHold
	// Load guest agent timeout configuration from environment variables (refs #592)
	guestAgentFSInfoTimeout := parsePositiveDurationEnv("GUEST_AGENT_FSINFO_TIMEOUT", defaultGuestAgentFSInfoTimeout)
	guestAgentNetworkTimeout := parsePositiveDurationEnv("GUEST_AGENT_NETWORK_TIMEOUT", defaultGuestAgentNetworkTimeout)
	guestAgentOSInfoTimeout := parsePositiveDurationEnv("GUEST_AGENT_OSINFO_TIMEOUT", defaultGuestAgentOSInfoTimeout)
	guestAgentVersionTimeout := parsePositiveDurationEnv("GUEST_AGENT_VERSION_TIMEOUT", defaultGuestAgentVersionTimeout)
	guestAgentRetries := parseNonNegativeIntEnv("GUEST_AGENT_RETRIES", defaultGuestAgentRetries)
	// Initialize persistent metrics store (SQLite) with configurable retention
	var metricsStore *metrics.Store
	metricsStoreConfig := metrics.DefaultConfig(cfg.DataPath)
	// Override retention settings from config (allows tier-based pricing in future)
	if cfg.MetricsRetentionRawHours > 0 {
		metricsStoreConfig.RetentionRaw = time.Duration(cfg.MetricsRetentionRawHours) * time.Hour
	}
	if cfg.MetricsRetentionMinuteHours > 0 {
		metricsStoreConfig.RetentionMinute = time.Duration(cfg.MetricsRetentionMinuteHours) * time.Hour
	}
	if cfg.MetricsRetentionHourlyDays > 0 {
		metricsStoreConfig.RetentionHourly = time.Duration(cfg.MetricsRetentionHourlyDays) * 24 * time.Hour
	}
	if cfg.MetricsRetentionDailyDays > 0 {
		metricsStoreConfig.RetentionDaily = time.Duration(cfg.MetricsRetentionDailyDays) * 24 * time.Hour
	}
	// In mock mode, extend hourly/daily retention to 90 days to match the
	// seeded data range (seeds write directly to hourly+daily tiers).
	// Raw and minute tiers keep production defaults — seeded data doesn't
	// use them, and live mock ticks at 2s intervals would bloat the DB
	// (the old 90-day raw retention caused metrics.db to grow to ~2 GB).
	if mock.IsMockEnabled() {
		metricsStoreConfig.WriteBufferSize = 2000
		metricsStoreConfig.RetentionHourly = 90 * 24 * time.Hour
		metricsStoreConfig.RetentionDaily = 90 * 24 * time.Hour
	}
	ms, err := metrics.NewStore(metricsStoreConfig)
	if err != nil {
		// Do not automatically delete the DB on error, as it causes data loss on transient errors (e.g. locks).
		// If the DB is truly corrupted, the user should manually remove it.
		log.Error().Err(err).Msg("failed to initialize persistent metrics store - continuing without metrics persistence")
	} else {
		if mock.IsMockEnabled() {
			ms.SetMaxOpenConns(10)
		}
		metricsStore = ms
		log.Info().
			Str("path", metricsStoreConfig.DBPath).
			Dur("retentionRaw", metricsStoreConfig.RetentionRaw).
			Dur("retentionMinute", metricsStoreConfig.RetentionMinute).
			Dur("retentionHourly", metricsStoreConfig.RetentionHourly).
			Dur("retentionDaily", metricsStoreConfig.RetentionDaily).
			Msg("Persistent metrics store initialized with configurable retention")
	}
	incidentStore := memory.NewIncidentStore(memory.IncidentStoreConfig{
		DataDir: cfg.DataPath,
	})
	m := &Monitor{
		config:                     cfg,
		state:                      models.NewState(),
		pveClients:                 make(map[string]PVEClientInterface),
		pbsClients:                 make(map[string]*pbs.Client),
		pmgClients:                 make(map[string]*pmg.Client),
		pollProviders:              make(map[InstanceType]PollProvider),
		pollMetrics:                getPollMetrics(),
		scheduler:                  scheduler,
		stalenessTracker:           stalenessTracker,
		taskQueue:                  taskQueue,
		pollTimeout:                derivePollTimeout(cfg),
		deadLetterQueue:            deadLetterQueue,
		circuitBreakers:            breakers,
		failureCounts:              failureCounts,
		lastOutcome:                lastOutcome,
		backoffCfg:                 backoff,
		rng:                        rand.New(rand.NewSource(time.Now().UnixNano())),
		maxRetryAttempts:           5,
		tempCollector:              tempCollector,
		guestMetadataStore:         config.NewGuestMetadataStore(cfg.DataPath, nil),
		dockerMetadataStore:        config.NewDockerMetadataStore(cfg.DataPath, nil),
		hostMetadataStore:          config.NewHostMetadataStore(cfg.DataPath, nil),
		startTime:                  time.Now(),
		rateTracker:                NewRateTracker(),
		metricsHistory:             NewMetricsHistory(1000, 24*time.Hour), // Keep up to 1000 points (~8h @ 30s)
		metricsStore:               metricsStore,                          // Persistent SQLite storage
		alertManager:               alerts.NewManagerWithDataDir(cfg.DataPath),
		incidentStore:              incidentStore,
		notificationMgr:            notifications.NewNotificationManagerWithDataDir(cfg.PublicURL, cfg.DataPath),
		configPersist:              config.NewConfigPersistence(cfg.DataPath),
		discoveryService:           nil, // Will be initialized in Start()
		authFailures:               make(map[string]int),
		lastAuthAttempt:            make(map[string]time.Time),
		lastClusterCheck:           make(map[string]time.Time),
		lastPhysicalDiskPoll:       make(map[string]time.Time),
		lastPVEBackupPoll:          make(map[string]time.Time),
		lastPBSBackupPoll:          make(map[string]time.Time),
		backupPermissionWarnings:   make(map[string]string),
		persistence:                config.NewConfigPersistence(cfg.DataPath),
		pbsBackupPollers:           make(map[string]bool),
		pbsBackupCacheTime:         make(map[string]map[pbsBackupGroupKey]time.Time),
		nodeSnapshots:              make(map[string]NodeMemorySnapshot),
		guestSnapshots:             make(map[string]GuestMemorySnapshot),
		nodeRRDMemCache:            make(map[string]rrdMemCacheEntry),
		vmRRDMemCache:              make(map[string]rrdMemCacheEntry),
		vmAgentMemCache:            make(map[string]agentMemCacheEntry),
		removedDockerHosts:         make(map[string]time.Time),
		dockerTokenBindings:        make(map[string]string),
		removedKubernetesClusters:  make(map[string]time.Time),
		kubernetesTokenBindings:    make(map[string]string),
		removedHostAgents:          make(map[string]time.Time),
		hostTokenBindings:          make(map[string]string),
		clusterSensorsCache:        make(map[string]clusterSensorsCacheEntry),
		dockerCommands:             make(map[string]*dockerHostCommand),
		dockerCommandIndex:         make(map[string]string),
		guestMetadataCache:         make(map[string]guestMetadataCacheEntry),
		guestMetadataLimiter:       make(map[string]time.Time),
		guestMetadataMinRefresh:    minRefresh,
		guestMetadataRefreshJitter: jitter,
		guestMetadataRetryBackoff:  retryBackoff,
		guestMetadataHoldDuration:  holdDuration,
		guestAgentFSInfoTimeout:    guestAgentFSInfoTimeout,
		guestAgentNetworkTimeout:   guestAgentNetworkTimeout,
		guestAgentOSInfoTimeout:    guestAgentOSInfoTimeout,
		guestAgentVersionTimeout:   guestAgentVersionTimeout,
		guestAgentRetries:          guestAgentRetries,
		instanceInfoCache:          make(map[string]*instanceInfo),
		pollStatusMap:              make(map[string]*pollStatus),
		dlqInsightMap:              make(map[string]*dlqInsight),
		nodeLastOnline:             make(map[string]time.Time),
		nodePendingUpdatesCache:    make(map[string]pendingUpdatesCache),
		supplementalProviders:      make(map[unifiedresources.DataSource]MonitorSupplementalRecordsProvider),
	}
	// Circuit-breaker timing, tightened for short adaptive polling cadences.
	m.breakerBaseRetry = 5 * time.Second
	m.breakerMaxDelay = 5 * time.Minute
	m.breakerHalfOpenWindow = 30 * time.Second
	if cfg.AdaptivePollingEnabled && cfg.AdaptivePollingMaxInterval > 0 && cfg.AdaptivePollingMaxInterval <= 15*time.Second {
		m.breakerBaseRetry = 2 * time.Second
		m.breakerMaxDelay = 10 * time.Second
		m.breakerHalfOpenWindow = 2 * time.Second
	}
	m.executor = newRealExecutor(m)
	m.registerBuiltInPollProviders()
	m.buildInstanceInfoCache(cfg)
	// Initialize state with config values
	m.state.TemperatureMonitoringEnabled = cfg.TemperatureMonitoringEnabled
	if m.pollMetrics != nil {
		m.pollMetrics.ResetQueueDepth(0)
	}
	// Load saved configurations
	if alertConfig, err := m.configPersist.LoadAlertConfig(); err == nil {
		m.alertManager.UpdateConfig(*alertConfig)
		// Apply schedule settings to notification manager
		m.notificationMgr.SetEnabled(alertConfig.Enabled && alertConfig.ActivationState == alerts.ActivationActive)
		m.notificationMgr.SetCooldown(alertConfig.Schedule.Cooldown)
		m.notificationMgr.SetGroupingWindow(alertConfig.Schedule.Grouping.Window)
		m.notificationMgr.SetGroupingOptions(
			alertConfig.Schedule.Grouping.ByNode,
			alertConfig.Schedule.Grouping.ByGuest,
		)
		m.notificationMgr.SetNotifyOnResolve(alertConfig.Schedule.NotifyOnResolve)
	} else {
		log.Warn().Err(err).Msg("failed to load alert configuration")
	}
	if emailConfig, err := m.configPersist.LoadEmailConfig(); err == nil {
		m.notificationMgr.SetEmailConfig(*emailConfig)
	} else {
		log.Warn().Err(err).Msg("failed to load email configuration")
	}
	// Bounded semaphore for concurrent guest metadata refreshes.
	if concurrency > 0 {
		m.guestMetadataSlots = make(chan struct{}, concurrency)
	}
	if appriseConfig, err := m.configPersist.LoadAppriseConfig(); err == nil {
		m.notificationMgr.SetAppriseConfig(*appriseConfig)
	} else {
		log.Warn().Err(err).Msg("failed to load Apprise configuration")
	}
	// Migrate webhooks if needed (from unencrypted to encrypted)
	if err := m.configPersist.MigrateWebhooksIfNeeded(); err != nil {
		log.Warn().Err(err).Msg("failed to migrate webhooks")
	}
	if webhooks, err := m.configPersist.LoadWebhooks(); err == nil {
		for _, webhook := range webhooks {
			m.notificationMgr.AddWebhook(webhook)
		}
	} else {
		log.Warn().Err(err).Msg("failed to load webhook configuration")
	}
	// In mock mode the canonical sampler owns demo chart history by default.
	// Support-only hybrid runs can opt back into real client initialization.
	mockEnabled := mock.IsMockEnabled()
	if mockEnabled && !keepRealPollingInMockMode() {
		log.Info().Msg("mock mode enabled - real client initialization disabled")
	} else {
		m.initPVEClients(cfg)
		m.initPBSClients(cfg)
		m.initPMGClients(cfg)
	}
	// Initialize state stats
	m.state.Stats = models.Stats{
		StartTime: m.startTime,
		Version:   "2.0.0-go",
	}
	return m, nil
}
// SetExecutor allows tests to override the poll executor; passing nil restores the default executor.
func (m *Monitor) SetExecutor(exec PollExecutor) {
	if m == nil {
		return
	}
	m.mu.Lock()
	defer m.mu.Unlock()
	if exec != nil {
		m.executor = exec
		return
	}
	// nil restores the built-in real executor.
	m.executor = newRealExecutor(m)
}
// buildInstanceInfoCache seeds the instance info cache from the registered
// poll providers. The cfg parameter is currently unused but kept for
// signature stability with existing call sites.
func (m *Monitor) buildInstanceInfoCache(cfg *config.Config) {
	_ = cfg
	m.refreshInstanceInfoCacheFromProviders()
}
// getExecutor returns the currently configured poll executor under a read lock.
func (m *Monitor) getExecutor() PollExecutor {
	m.mu.RLock()
	defer m.mu.RUnlock()
	return m.executor
}
func clampInterval(value, min, max time.Duration) time.Duration {
if value <= 0 {
return min
}
if min > 0 && value < min {
return min
}
if max > 0 && value > max {
return max
}
return value
}
// effectivePVEPollingInterval returns the configured PVE polling cadence
// clamped to [10s, 1h]; missing or non-positive config falls back to the
// 10-second floor.
func (m *Monitor) effectivePVEPollingInterval() time.Duration {
	const (
		floor   = 10 * time.Second
		ceiling = time.Hour
	)
	interval := floor
	if m != nil && m.config != nil && m.config.PVEPollingInterval > 0 {
		interval = m.config.PVEPollingInterval
	}
	if interval < floor {
		return floor
	}
	if interval > ceiling {
		return ceiling
	}
	return interval
}
// baseIntervalForInstanceType resolves the default polling cadence for the
// given instance type: a registered poll provider's BaseInterval wins, then
// type-specific config intervals, then the scheduler default.
func (m *Monitor) baseIntervalForInstanceType(instanceType InstanceType) time.Duration {
	// NOTE(review): getPollProvider is invoked before the nil checks below;
	// it presumably tolerates a nil receiver — verify.
	if provider := m.getPollProvider(instanceType); provider != nil {
		if interval := provider.BaseInterval(m); interval > 0 {
			return interval
		}
	}
	if m == nil || m.config == nil {
		return DefaultSchedulerConfig().BaseInterval
	}
	switch instanceType {
	case InstanceTypePVE:
		return m.effectivePVEPollingInterval()
	case InstanceTypePBS:
		// PBS/PMG cadences are clamped to [10s, 1h], mirroring the PVE bounds.
		return clampInterval(m.config.PBSPollingInterval, 10*time.Second, time.Hour)
	case InstanceTypePMG:
		return clampInterval(m.config.PMGPollingInterval, 10*time.Second, time.Hour)
	default:
		// Other types use the adaptive base interval with a 1-second floor
		// and no upper bound.
		base := m.config.AdaptivePollingBaseInterval
		if base <= 0 {
			base = DefaultSchedulerConfig().BaseInterval
		}
		return clampInterval(base, time.Second, 0)
	}
}
// Start begins the monitoring loop
//
// It wires alert-manager callbacks, optionally starts the discovery service
// and mock samplers, launches the task worker pool, and then runs the
// poll/broadcast ticker loop until ctx is cancelled. Blocks until ctx is done.
func (m *Monitor) Start(ctx context.Context, wsHub *websocket.Hub) {
	// Consolidate any duplicate cluster instances before starting
	// This fixes the case where multiple agents registered from the same cluster
	m.consolidateDuplicateClusters()
	pollingInterval := m.effectivePVEPollingInterval()
	log.Info().
		Dur("pollingInterval", pollingInterval).
		Msg("Starting monitoring loop")
	m.mu.Lock()
	m.runtimeCtx = ctx
	m.wsHub = wsHub
	m.mu.Unlock()
	defer m.stopMockMetricsSampler()
	// Best-effort startup cleanup: when direct PBS is configured, remove legacy
	// PVE-proxied PBS backup points to prevent duplicate recovery entries.
	m.purgeStalePVEPBSBackupsBestEffort(ctx)
	if mock.IsMockEnabled() {
		m.startMockMetricsSampler(ctx)
	}
	// Initialize and start discovery service if enabled
	if mock.IsMockEnabled() {
		log.Info().Msg("mock mode enabled - skipping discovery service")
		m.discoveryService = nil
	} else if m.config.DiscoveryEnabled {
		discoverySubnet := m.config.DiscoverySubnet
		if discoverySubnet == "" {
			discoverySubnet = "auto"
		}
		// cfgProvider hands the discovery service a fresh config snapshot on
		// demand, guarded by the monitor's read lock.
		cfgProvider := func() config.DiscoveryConfig {
			m.mu.RLock()
			defer m.mu.RUnlock()
			if m.config == nil {
				return config.DefaultDiscoveryConfig()
			}
			cfg := config.CloneDiscoveryConfig(m.config.Discovery)
			// Auto-populate IPBlocklist with configured Proxmox host IPs to avoid
			// probing hosts we already know about (reduces PBS auth failure log spam)
			cfg.IPBlocklist = m.getConfiguredHostIPs()
			return cfg
		}
		m.discoveryService = discovery.NewService(wsHub, 5*time.Minute, discoverySubnet, cfgProvider)
		if m.discoveryService != nil {
			m.discoveryService.Start(ctx)
			log.Info().Msg("discovery service initialized and started")
		} else {
			log.Error().Msg("failed to initialize discovery service")
		}
	} else {
		log.Info().Msg("discovery service disabled by configuration")
		m.discoveryService = nil
	}
	// Set up alert callbacks
	m.alertManager.SetAlertCallback(func(alert *alerts.Alert) {
		m.handleAlertFired(alert)
	})
	// Set up AI analysis callback - this bypasses activation state and other notification suppression
	// so AI can analyze alerts even during pending_review setup phase
	m.alertManager.SetAlertForAICallback(func(alert *alerts.Alert) {
		log.Debug().Str("alertID", alert.ID).Msg("AI alert callback invoked (bypassing notification suppression)")
		if m.alertTriggeredAICallback != nil {
			m.alertTriggeredAICallback(alert)
		}
	})
	m.alertManager.SetResolvedCallback(func(alertID string) {
		m.handleAlertResolved(alertID)
		// Don't broadcast full state here - it causes a cascade with many guests.
		// The frontend will get the updated alerts through the regular broadcast ticker.
	})
	m.alertManager.SetAcknowledgedCallback(func(alert *alerts.Alert, user string) {
		m.handleAlertAcknowledged(alert, user)
	})
	m.alertManager.SetUnacknowledgedCallback(func(alert *alerts.Alert, user string) {
		m.handleAlertUnacknowledged(alert, user)
	})
	m.alertManager.SetEscalateCallback(func(alert *alerts.Alert, level int) {
		log.Info().
			Str("alertID", alert.ID).
			Int("level", level).
			Msg("Alert escalated - sending notifications")
		// Get escalation config
		config := m.alertManager.GetConfig()
		if level <= 0 || level > len(config.Schedule.Escalation.Levels) {
			return
		}
		escalationLevel := config.Schedule.Escalation.Levels[level-1]
		// Send notifications based on escalation level
		switch escalationLevel.Notify {
		case "email":
			// Only send email
			if emailConfig := m.notificationMgr.GetEmailConfig(); emailConfig.Enabled {
				m.notificationMgr.SendAlert(alert)
			}
		case "webhook":
			// Only send webhooks
			for _, webhook := range m.notificationMgr.GetWebhooks() {
				if webhook.Enabled {
					m.notificationMgr.SendAlert(alert)
					break
				}
			}
		case "all":
			// Send all notifications
			m.notificationMgr.SendAlert(alert)
		}
		// Update WebSocket with escalation
		m.broadcastEscalatedAlert(wsHub, alert)
	})
	// Create separate tickers for polling and broadcasting using the configured cadence
	workerCount := m.totalClientCount()
	m.startTaskWorkers(ctx, workerCount)
	pollTicker := time.NewTicker(pollingInterval)
	defer pollTicker.Stop()
	broadcastTicker := time.NewTicker(pollingInterval)
	defer broadcastTicker.Stop()
	keepRealPolling := keepRealPollingInMockMode()
	// Start connection retry mechanism for failed clients
	// This handles cases where network/Proxmox isn't ready on initial startup
	if !mock.IsMockEnabled() || keepRealPolling {
		go m.retryFailedConnections(ctx)
	}
	// Do an immediate poll on start.
	if mock.IsMockEnabled() {
		if keepRealPolling {
			log.Info().Msg("mock mode enabled - running mock alerts and real metric polling")
			go m.checkMockAlerts()
			go m.poll(ctx, wsHub)
		} else {
			log.Info().Msg("mock mode enabled - skipping real node polling")
			go m.checkMockAlerts()
		}
	} else {
		go m.poll(ctx, wsHub)
	}
	for {
		select {
		case <-pollTicker.C:
			// Periodic housekeeping before kicking off the next poll cycle.
			now := time.Now()
			m.evaluateDockerAgents(now)
			m.evaluateKubernetesAgents(now)
			m.evaluateHostAgents(now)
			m.cleanupRemovedDockerHosts(now)
			m.cleanupRemovedKubernetesClusters(now)
			m.cleanupRemovedHostAgents(now)
			m.cleanupGuestMetadataCache(now)
			m.cleanupDiagnosticSnapshots(now)
			m.cleanupRRDCache(now)
			m.cleanupTrackingMaps(now)
			m.cleanupMetricsHistory()
			m.cleanupRateTracker(now)
			if mock.IsMockEnabled() {
				// In mock mode, keep synthetic alerts fresh
				go m.checkMockAlerts()
				if keepRealPolling {
					// Keep real metrics flowing while mock UI mode is active.
					go m.poll(ctx, wsHub)
				}
			} else {
				// Poll real infrastructure
				go m.poll(ctx, wsHub)
			}
		case <-broadcastTicker.C:
			// Broadcast current state regardless of polling status
			// Use GetState() instead of m.state.GetSnapshot() to respect mock mode
			state := m.GetState()
			log.Info().
				Int("nodes", len(state.Nodes)).
				Int("vms", len(state.VMs)).
				Int("containers", len(state.Containers)).
				Int("hosts", len(state.Hosts)).
				Int("pbs", len(state.PBSInstances)).
				Int("pbsBackups", len(state.Backups.PBS)).
				Int("physicalDisks", len(state.PhysicalDisks)).
				Msg("Broadcasting state update (ticker)")
			frontendState := m.buildBroadcastFrontendStateFromSnapshot(state)
			// Use tenant-aware broadcast method
			m.broadcastState(wsHub, frontendState)
		case <-ctx.Done():
			log.Info().Msg("monitoring loop stopped")
			return
		}
	}
}
// poll fetches data from all configured instances
//
// The context parameter is currently unused; task workers carry their own
// contexts. At most two poll cycles may run concurrently — additional
// invocations return immediately.
func (m *Monitor) poll(_ context.Context, wsHub *websocket.Hub) {
	defer recoverFromPanic("poll")
	// Limit concurrent polls to 2 to prevent resource exhaustion
	currentCount := atomic.AddInt32(&m.activePollCount, 1)
	if currentCount > 2 {
		atomic.AddInt32(&m.activePollCount, -1)
		if logging.IsLevelEnabled(zerolog.DebugLevel) {
			log.Debug().Int32("activePolls", currentCount-1).Msg("too many concurrent polls, skipping")
		}
		return
	}
	defer atomic.AddInt32(&m.activePollCount, -1)
	if logging.IsLevelEnabled(zerolog.DebugLevel) {
		log.Debug().Msg("starting polling cycle")
	}
	startTime := time.Now()
	now := startTime
	// Enqueue (or refresh) this cycle's scheduled tasks; the task workers
	// pick them up asynchronously.
	plannedTasks := m.buildScheduledTasks(now)
	for _, task := range plannedTasks {
		m.taskQueue.Upsert(task)
	}
	m.updateQueueDepthMetric()
	// Update performance metrics atomically to prevent data races when
	// multiple poll() goroutines run concurrently (e.g. mock mode transitions).
	wsClients := 0
	if wsHub != nil {
		wsClients = wsHub.GetClientCount()
	}
	m.state.UpdatePollStats(
		time.Since(startTime).Seconds(),
		int64(time.Since(m.startTime).Seconds()),
		wsClients,
	)
	// Sync alert state so broadcasts include the latest acknowledgement data
	m.syncAlertsToState()
	// Increment poll counter
	m.mu.Lock()
	m.pollCounter++
	m.mu.Unlock()
	if logging.IsLevelEnabled(zerolog.DebugLevel) {
		log.Debug().Dur("duration", time.Since(startTime)).Msg("polling cycle completed")
	}
	// Broadcasting is now handled by the timer in Start()
}
// startTaskWorkers launches the polling worker pool, clamping the pool size
// to the range [1, 10]. It is a no-op when the task queue is absent.
func (m *Monitor) startTaskWorkers(ctx context.Context, workers int) {
	if m.taskQueue == nil {
		return
	}
	switch {
	case workers < 1:
		workers = 1
	case workers > 10:
		workers = 10
	}
	for id := 0; id < workers; id++ {
		go m.taskWorker(ctx, id)
	}
}
// taskWorker drains the task queue until the context is cancelled, executing
// each scheduled task and rescheduling it afterwards.
func (m *Monitor) taskWorker(ctx context.Context, id int) {
	defer recoverFromPanic(fmt.Sprintf("taskWorker-%d", id))
	if logging.IsLevelEnabled(zerolog.DebugLevel) {
		log.Debug().Int("worker", id).Msg("task worker started")
	}
	for {
		task, ok := m.taskQueue.WaitNext(ctx)
		if !ok {
			// Queue closed or context cancelled.
			break
		}
		m.executeScheduledTask(ctx, task)
		m.rescheduleTask(task)
		m.updateQueueDepthMetric()
	}
	if logging.IsLevelEnabled(zerolog.DebugLevel) {
		log.Debug().Int("worker", id).Msg("task worker stopping")
	}
}
// derivePollTimeout computes the per-task polling timeout: twice the
// configured connection timeout, clamped to at least minTaskTimeout and at
// most the configured (or default) maximum.
func derivePollTimeout(cfg *config.Config) time.Duration {
	timeout := defaultTaskTimeout
	if cfg != nil && cfg.ConnectionTimeout > 0 {
		timeout = 2 * cfg.ConnectionTimeout
	}
	if timeout < minTaskTimeout {
		timeout = minTaskTimeout
	}
	// Use configurable max timeout from config (set via MAX_POLL_TIMEOUT env var)
	// Falls back to hardcoded maxTaskTimeout if config is nil or MaxPollTimeout not set
	ceiling := maxTaskTimeout
	if cfg != nil && cfg.MaxPollTimeout > 0 {
		ceiling = cfg.MaxPollTimeout
	}
	if timeout > ceiling {
		return ceiling
	}
	return timeout
}
// taskExecutionTimeout returns the timeout applied to a single scheduled task
// execution; the instance type is currently ignored. Falls back to
// defaultTaskTimeout when the monitor is nil or unconfigured.
func (m *Monitor) taskExecutionTimeout(_ InstanceType) time.Duration {
	if m == nil || m.pollTimeout <= 0 {
		return defaultTaskTimeout
	}
	return m.pollTimeout
}
// executeScheduledTask runs one scheduled polling task: it honors the circuit
// breaker, records queue-wait metrics, builds the concrete poll task via the
// type's provider, and executes it under the per-task timeout.
func (m *Monitor) executeScheduledTask(ctx context.Context, task ScheduledTask) {
	// Circuit breaker may veto execution for instances that keep failing.
	if !m.allowExecution(task) {
		if logging.IsLevelEnabled(zerolog.DebugLevel) {
			log.Debug().
				Str("instance", task.InstanceName).
				Str("type", string(task.InstanceType)).
				Msg("Task blocked by circuit breaker")
		}
		return
	}
	// Record how long the task sat in the queue past its scheduled time.
	if m.pollMetrics != nil {
		wait := time.Duration(0)
		if !task.NextRun.IsZero() {
			wait = time.Since(task.NextRun)
			if wait < 0 {
				wait = 0
			}
		}
		instanceType := string(task.InstanceType)
		if strings.TrimSpace(instanceType) == "" {
			instanceType = "unknown"
		}
		m.pollMetrics.RecordQueueWait(instanceType, wait)
	}
	executor := m.getExecutor()
	if executor == nil {
		log.Error().
			Str("instance", task.InstanceName).
			Str("type", string(task.InstanceType)).
			Msg("No poll executor configured; skipping task")
		return
	}
	pollTask, ok := m.buildPollTask(task)
	if !ok {
		return
	}
	// Bound the execution time so a hung instance cannot pin a worker.
	taskCtx := ctx
	var cancel context.CancelFunc
	timeout := m.taskExecutionTimeout(task.InstanceType)
	if timeout > 0 {
		taskCtx, cancel = context.WithTimeout(ctx, timeout)
		defer cancel()
	}
	executor.Execute(taskCtx, pollTask)
	// Surface timeouts; rescheduling itself happens in the caller (taskWorker).
	if timeout > 0 && stderrors.Is(taskCtx.Err(), context.DeadlineExceeded) {
		log.Warn().
			Str("instance", task.InstanceName).
			Str("type", string(task.InstanceType)).
			Dur("timeout", timeout).
			Msg("Polling task timed out; rescheduling with fresh worker")
	}
}
// buildPollTask resolves the provider registered for the task's instance
// type and asks it to construct a concrete PollTask. It returns false
// (logging the reason) when the type is unsupported or the provider fails.
// Blank name/type fields on the provider's result are backfilled from the
// scheduled task so downstream consumers always see both populated.
func (m *Monitor) buildPollTask(task ScheduledTask) (PollTask, bool) {
	provider := m.getPollProvider(task.InstanceType)
	if provider == nil {
		log.Debug().
			Str("instance", task.InstanceName).
			Str("type", string(task.InstanceType)).
			Msg("Skipping unsupported task type")
		return PollTask{}, false
	}
	built, err := provider.BuildPollTask(m, task.InstanceName)
	if err != nil {
		log.Warn().
			Str("instance", task.InstanceName).
			Str("type", string(task.InstanceType)).
			Err(err).
			Msg("Skipping scheduled task")
		return PollTask{}, false
	}
	if strings.TrimSpace(built.InstanceName) == "" {
		built.InstanceName = task.InstanceName
	}
	if strings.TrimSpace(built.InstanceType) == "" {
		built.InstanceType = string(task.InstanceType)
	}
	return built, true
}
// rescheduleTask decides when a just-finished task should run next.
// Failure path: transient failures are retried with jittered backoff;
// permanent failures or exhausted retry budgets go to the dead-letter
// queue. Success (or no recorded outcome): either a fixed-interval
// reschedule (no adaptive scheduler) or a scheduler-built plan.
func (m *Monitor) rescheduleTask(task ScheduledTask) {
	if m.taskQueue == nil {
		return
	}
	key := schedulerKey(task.InstanceType, task.InstanceName)
	// Snapshot outcome state under the lock, then release before any
	// queue or scheduler work.
	m.mu.Lock()
	outcome, hasOutcome := m.lastOutcome[key]
	failureCount := m.failureCounts[key]
	m.mu.Unlock()
	if hasOutcome && !outcome.success {
		// Permanent errors, or transient ones past the retry budget,
		// are routed to the dead-letter queue instead of retried.
		if !outcome.transient || failureCount >= m.maxRetryAttempts {
			m.sendToDeadLetter(task, outcome.err)
			return
		}
		// Jittered exponential backoff; failureCount-1 because the first
		// retry uses the base delay.
		delay := m.backoffCfg.nextDelay(failureCount-1, m.randomFloat())
		if delay <= 0 {
			delay = 5 * time.Second
		}
		// With aggressive adaptive polling (max interval <= 15s), cap the
		// retry delay so recovery detection stays responsive.
		if m.config != nil && m.config.AdaptivePollingEnabled && m.config.AdaptivePollingMaxInterval > 0 && m.config.AdaptivePollingMaxInterval <= 15*time.Second {
			maxDelay := 4 * time.Second
			if delay > maxDelay {
				delay = maxDelay
			}
		}
		next := task
		next.Interval = delay
		next.NextRun = time.Now().Add(delay)
		m.taskQueue.Upsert(next)
		return
	}
	// No adaptive scheduler: reschedule at the last interval, falling back
	// to the per-type base interval, then the scheduler default.
	if m.scheduler == nil {
		baseInterval := m.baseIntervalForInstanceType(task.InstanceType)
		nextInterval := task.Interval
		if nextInterval <= 0 {
			nextInterval = baseInterval
		}
		if nextInterval <= 0 {
			nextInterval = DefaultSchedulerConfig().BaseInterval
		}
		next := task
		next.NextRun = time.Now().Add(nextInterval)
		next.Interval = nextInterval
		m.taskQueue.Upsert(next)
		return
	}
	// Adaptive path: describe this instance (including staleness data when
	// available) and let the scheduler build the next plan.
	desc := InstanceDescriptor{
		Name:          task.InstanceName,
		Type:          task.InstanceType,
		LastInterval:  task.Interval,
		LastScheduled: task.NextRun,
	}
	if m.stalenessTracker != nil {
		if snap, ok := m.stalenessTracker.snapshot(task.InstanceType, task.InstanceName); ok {
			desc.LastSuccess = snap.LastSuccess
			desc.LastFailure = snap.LastError
			if snap.ChangeHash != "" {
				desc.Metadata = TaskMetadata{ChangeHash: snap.ChangeHash}
			}
		}
	}
	tasks := m.scheduler.BuildPlan(time.Now(), []InstanceDescriptor{desc}, m.taskQueue.Size())
	if len(tasks) == 0 {
		// Scheduler produced nothing; fall back to a direct reschedule so
		// the instance is never silently dropped from polling.
		next := task
		nextInterval := task.Interval
		if nextInterval <= 0 && m.config != nil {
			nextInterval = m.config.AdaptivePollingBaseInterval
		}
		if nextInterval <= 0 {
			nextInterval = DefaultSchedulerConfig().BaseInterval
		}
		next.Interval = nextInterval
		next.NextRun = time.Now().Add(nextInterval)
		m.taskQueue.Upsert(next)
		return
	}
	for _, next := range tasks {
		m.taskQueue.Upsert(next)
	}
}
// sendToDeadLetter parks a repeatedly failing task in the dead-letter
// queue with a long (30m) retry interval and records per-instance DLQ
// insight (reason, attempt timestamps, retry count) for the health API.
// If no dead-letter queue is configured the task is dropped with an error log.
func (m *Monitor) sendToDeadLetter(task ScheduledTask, err error) {
	if m.deadLetterQueue == nil {
		log.Error().
			Str("instance", task.InstanceName).
			Str("type", string(task.InstanceType)).
			Err(err).
			Msg("Dead-letter queue unavailable; dropping task")
		return
	}
	log.Error().
		Str("instance", task.InstanceName).
		Str("type", string(task.InstanceType)).
		Err(err).
		Msg("Routing task to dead-letter queue after repeated failures")
	// Copy the task so the caller's value is untouched; DLQ entries retry
	// on a slow fixed cadence.
	next := task
	next.Interval = 30 * time.Minute
	next.NextRun = time.Now().Add(next.Interval)
	m.deadLetterQueue.Upsert(next)
	m.updateDeadLetterMetrics()
	key := schedulerKey(task.InstanceType, task.InstanceName)
	now := time.Now()
	// Update the DLQ insight record under the monitor lock.
	m.mu.Lock()
	if m.dlqInsightMap == nil {
		m.dlqInsightMap = make(map[string]*dlqInsight)
	}
	info, ok := m.dlqInsightMap[key]
	if !ok {
		info = &dlqInsight{}
		m.dlqInsightMap[key] = info
	}
	if info.FirstAttempt.IsZero() {
		info.FirstAttempt = now
	}
	info.LastAttempt = now
	info.RetryCount++
	info.NextRetry = next.NextRun
	if err != nil {
		info.Reason = classifyDLQReason(err)
	}
	m.mu.Unlock()
}
// classifyDLQReason maps a task's terminal error onto the reason tag stored
// with its dead-letter entry: a retryable error means the retry budget was
// exhausted, anything else is treated as a permanent failure. A nil error
// yields an empty reason.
func classifyDLQReason(err error) string {
	switch {
	case err == nil:
		return ""
	case errors.IsRetryableError(err):
		return "max_retry_attempts"
	default:
		return "permanent_failure"
	}
}
// updateDeadLetterMetrics pushes the current dead-letter queue contents into
// the poll metrics. An empty queue clears the counts with a nil slice; both
// the metrics sink and the queue must be configured for this to do anything.
func (m *Monitor) updateDeadLetterMetrics() {
	if m.pollMetrics == nil || m.deadLetterQueue == nil {
		return
	}
	count := m.deadLetterQueue.Size()
	if count <= 0 {
		m.pollMetrics.UpdateDeadLetterCounts(nil)
		return
	}
	m.pollMetrics.UpdateDeadLetterCounts(m.deadLetterQueue.PeekAll(count))
}
// updateBreakerMetric publishes one circuit breaker's current state, failure
// count, and retry deadline to the poll metrics. No-ops when either the
// breaker or the metrics sink is absent.
func (m *Monitor) updateBreakerMetric(instanceType InstanceType, instance string, breaker *circuitBreaker) {
	if breaker == nil || m.pollMetrics == nil {
		return
	}
	st, failureCount, retryTime, _, _ := breaker.stateDetails()
	m.pollMetrics.SetBreakerState(string(instanceType), instance, st, failureCount, retryTime)
}
// randomFloat returns a uniform value in [0, 1) used to jitter retry delays.
// The generator is lazily seeded from the wall clock on first use.
// NOTE(review): neither the lazy init nor the Float64 call is guarded by a
// lock here; if randomFloat is reachable from multiple goroutines this is a
// data race on m.rng — confirm callers serialize access.
func (m *Monitor) randomFloat() float64 {
	if m.rng == nil {
		m.rng = rand.New(rand.NewSource(time.Now().UnixNano()))
	}
	return m.rng.Float64()
}
// updateQueueDepthMetric publishes the task queue's current depth and full
// snapshot to the poll metrics. Requires both the queue and metrics sink.
func (m *Monitor) updateQueueDepthMetric() {
	if m.taskQueue == nil || m.pollMetrics == nil {
		return
	}
	snap := m.taskQueue.Snapshot()
	m.pollMetrics.SetQueueDepth(snap.Depth)
	m.pollMetrics.UpdateQueueSnapshot(snap)
}
// allowExecution asks the task's circuit breaker whether this run may
// proceed, refreshing the breaker metric either way. When breakers are
// disabled entirely (nil map) every task is allowed.
func (m *Monitor) allowExecution(task ScheduledTask) bool {
	if m.circuitBreakers == nil {
		return true
	}
	breaker := m.ensureBreaker(schedulerKey(task.InstanceType, task.InstanceName))
	permitted := breaker.allow(time.Now())
	m.updateBreakerMetric(task.InstanceType, task.InstanceName, breaker)
	return permitted
}
// ensureBreaker returns the circuit breaker for the given scheduler key,
// creating (and caching) one on first use. New breakers trip after 3
// failures and use the monitor's configured retry/backoff windows, with
// hard-coded fallbacks when those are unset. Safe for concurrent use.
func (m *Monitor) ensureBreaker(key string) *circuitBreaker {
	m.mu.Lock()
	defer m.mu.Unlock()
	if m.circuitBreakers == nil {
		m.circuitBreakers = make(map[string]*circuitBreaker)
	}
	if existing, ok := m.circuitBreakers[key]; ok {
		return existing
	}
	retryBase := m.breakerBaseRetry
	if retryBase <= 0 {
		retryBase = 5 * time.Second
	}
	retryCap := m.breakerMaxDelay
	if retryCap <= 0 {
		retryCap = 5 * time.Minute
	}
	probeWindow := m.breakerHalfOpenWindow
	if probeWindow <= 0 {
		probeWindow = 30 * time.Second
	}
	created := newCircuitBreaker(3, retryBase, retryCap, probeWindow)
	m.circuitBreakers[key] = created
	return created
}
// recordTaskResult records the outcome of one poll attempt for an instance:
// it updates failure counters, the last-outcome map, the per-instance poll
// status, and the circuit breaker. The monitor lock is released before the
// breaker is touched (breakers carry their own synchronization).
func (m *Monitor) recordTaskResult(instanceType InstanceType, instance string, pollErr error) {
	if m == nil {
		return
	}
	key := schedulerKey(instanceType, instance)
	now := time.Now()
	breaker := m.ensureBreaker(key)
	m.mu.Lock()
	status, ok := m.pollStatusMap[key]
	if !ok {
		status = &pollStatus{}
		m.pollStatusMap[key] = status
	}
	if pollErr == nil {
		// Success: clear failure bookkeeping and mark a fresh success time.
		if m.failureCounts != nil {
			m.failureCounts[key] = 0
		}
		if m.lastOutcome != nil {
			m.lastOutcome[key] = taskOutcome{
				success:    true,
				transient:  true,
				err:        nil,
				recordedAt: now,
			}
		}
		status.LastSuccess = now
		status.ConsecutiveFailures = 0
		status.FirstFailureAt = time.Time{}
		m.mu.Unlock()
		if breaker != nil {
			breaker.recordSuccess()
			m.updateBreakerMetric(instanceType, instance, breaker)
		}
		return
	}
	// Failure: classify it so reschedule/DLQ logic can distinguish retryable
	// errors from permanent ones.
	transient := isTransientError(pollErr)
	category := "permanent"
	if transient {
		category = "transient"
	}
	if m.failureCounts != nil {
		m.failureCounts[key] = m.failureCounts[key] + 1
	}
	if m.lastOutcome != nil {
		m.lastOutcome[key] = taskOutcome{
			success:    false,
			transient:  transient,
			err:        pollErr,
			recordedAt: now,
		}
	}
	status.LastErrorAt = now
	status.LastErrorMessage = pollErr.Error()
	status.LastErrorCategory = category
	status.ConsecutiveFailures++
	// First failure in the current streak anchors FirstFailureAt.
	if status.ConsecutiveFailures == 1 {
		status.FirstFailureAt = now
	}
	m.mu.Unlock()
	if breaker != nil {
		breaker.recordFailure(now)
		m.updateBreakerMetric(instanceType, instance, breaker)
	}
}
// SchedulerHealthResponse contains complete scheduler health data for API exposure.
type SchedulerHealthResponse struct {
	UpdatedAt  time.Time            `json:"updatedAt"`  // when this snapshot was assembled
	Enabled    bool                 `json:"enabled"`    // whether adaptive polling is enabled
	Queue      QueueSnapshot        `json:"queue"`      // live task queue state
	DeadLetter DeadLetterSnapshot   `json:"deadLetter"` // dead-letter queue contents
	Breakers   []BreakerSnapshot    `json:"breakers"`   // non-default circuit breaker states
	Staleness  []StalenessSnapshot  `json:"staleness"`  // per-instance staleness data
	Instances  []InstanceHealth     `json:"instances"`  // merged per-instance health records
}
// DeadLetterSnapshot contains dead-letter queue data.
type DeadLetterSnapshot struct {
	Count int              `json:"count"` // total entries in the dead-letter queue
	Tasks []DeadLetterTask `json:"tasks"` // sample of entries (may be capped below Count)
}
// emptyDeadLetterSnapshot returns a zero-count snapshot whose Tasks slice is
// non-nil so it serializes as [] rather than null.
func emptyDeadLetterSnapshot() DeadLetterSnapshot {
	var snap DeadLetterSnapshot
	snap.Tasks = []DeadLetterTask{}
	return snap
}
// emptySchedulerHealthResponse builds a response stamped with the current
// time in which every collection field is a non-nil empty value, so the JSON
// encoding never contains null arrays.
func emptySchedulerHealthResponse(enabled bool) SchedulerHealthResponse {
	resp := SchedulerHealthResponse{
		UpdatedAt:  time.Now(),
		Enabled:    enabled,
		Queue:      emptyQueueSnapshot(),
		DeadLetter: emptyDeadLetterSnapshot(),
	}
	resp.Breakers = []BreakerSnapshot{}
	resp.Staleness = []StalenessSnapshot{}
	resp.Instances = []InstanceHealth{}
	return resp
}
// SchedulerHealth returns a complete snapshot of scheduler health for API exposure.
// It aggregates queue depth, dead-letter contents, circuit-breaker states,
// staleness data, and merged per-instance health into one response, and
// refreshes the corresponding poll metrics as a side effect.
func (m *Monitor) SchedulerHealth() SchedulerHealthResponse {
	response := emptySchedulerHealthResponse(m.config != nil && m.config.AdaptivePollingEnabled)
	m.refreshInstanceInfoCacheFromProviders()
	// Queue snapshot
	if m.taskQueue != nil {
		response.Queue = m.taskQueue.Snapshot()
		if m.pollMetrics != nil {
			m.pollMetrics.UpdateQueueSnapshot(response.Queue)
		}
	}
	// Dead-letter queue snapshot
	if m.deadLetterQueue != nil {
		deadLetterTasks := m.deadLetterQueue.PeekAll(25) // limit to top 25
		// Enrich the sampled tasks with last-error and failure-count data
		// from the outcome maps, under a read lock.
		m.mu.RLock()
		for i := range deadLetterTasks {
			key := schedulerKey(InstanceType(deadLetterTasks[i].Type), deadLetterTasks[i].Instance)
			if outcome, ok := m.lastOutcome[key]; ok && outcome.err != nil {
				deadLetterTasks[i].LastError = outcome.err.Error()
			}
			if count, ok := m.failureCounts[key]; ok {
				deadLetterTasks[i].Failures = count
			}
		}
		m.mu.RUnlock()
		response.DeadLetter = DeadLetterSnapshot{
			Count: m.deadLetterQueue.Size(),
			Tasks: deadLetterTasks,
		}
		m.updateDeadLetterMetrics()
	}
	// Circuit breaker snapshots
	m.mu.RLock()
	breakerSnapshots := make([]BreakerSnapshot, 0, len(m.circuitBreakers))
	for key, breaker := range m.circuitBreakers {
		state, failures, retryAt := breaker.State()
		// Only include breakers that are not in default closed state with 0 failures
		if state != "closed" || failures > 0 {
			// Parse instance type and name from key
			parts := strings.SplitN(key, "::", 2)
			instanceType, instanceName := "unknown", key
			if len(parts) == 2 {
				instanceType, instanceName = parts[0], parts[1]
			}
			breakerSnapshots = append(breakerSnapshots, BreakerSnapshot{
				Instance: instanceName,
				Type:     instanceType,
				State:    state,
				Failures: failures,
				RetryAt:  retryAt,
			})
		}
	}
	m.mu.RUnlock()
	response.Breakers = breakerSnapshots
	// Staleness snapshots
	if m.stalenessTracker != nil {
		response.Staleness = m.stalenessTracker.Snapshot()
	}
	// Copy all per-instance maps out under one read lock so the assembly
	// below can run lock-free on stable data.
	instanceInfos := make(map[string]*instanceInfo)
	pollStatuses := make(map[string]pollStatus)
	dlqInsights := make(map[string]dlqInsight)
	breakerRefs := make(map[string]*circuitBreaker)
	m.mu.RLock()
	for k, v := range m.instanceInfoCache {
		if v == nil {
			continue
		}
		copyVal := *v
		instanceInfos[k] = &copyVal
	}
	for k, v := range m.pollStatusMap {
		if v == nil {
			continue
		}
		pollStatuses[k] = *v
	}
	for k, v := range m.dlqInsightMap {
		if v == nil {
			continue
		}
		dlqInsights[k] = *v
	}
	for k, v := range m.circuitBreakers {
		if v != nil {
			breakerRefs[k] = v
		}
	}
	m.mu.RUnlock()
	// Refresh breaker metrics for every known breaker (not just tripped ones).
	for key, breaker := range breakerRefs {
		instanceType := InstanceType("unknown")
		instanceName := key
		if parts := strings.SplitN(key, "::", 2); len(parts) == 2 {
			if parts[0] != "" {
				instanceType = InstanceType(parts[0])
			}
			if parts[1] != "" {
				instanceName = parts[1]
			}
		}
		m.updateBreakerMetric(instanceType, instanceName, breaker)
	}
	// Build the union of every key seen in any of the data sources so each
	// instance gets exactly one merged health record.
	keySet := make(map[string]struct{})
	for k := range instanceInfos {
		if k != "" {
			keySet[k] = struct{}{}
		}
	}
	for k := range pollStatuses {
		if k != "" {
			keySet[k] = struct{}{}
		}
	}
	for k := range dlqInsights {
		if k != "" {
			keySet[k] = struct{}{}
		}
	}
	for k := range breakerRefs {
		if k != "" {
			keySet[k] = struct{}{}
		}
	}
	for _, task := range response.DeadLetter.Tasks {
		if task.Instance == "" {
			continue
		}
		keySet[schedulerKey(InstanceType(task.Type), task.Instance)] = struct{}{}
	}
	for _, snap := range response.Staleness {
		if snap.Instance == "" {
			continue
		}
		keySet[schedulerKey(InstanceType(snap.Type), snap.Instance)] = struct{}{}
	}
	if len(keySet) > 0 {
		// Sort keys for deterministic API output.
		keys := make([]string, 0, len(keySet))
		for k := range keySet {
			keys = append(keys, k)
		}
		sort.Strings(keys)
		instances := make([]InstanceHealth, 0, len(keys))
		for _, key := range keys {
			// Derive type/name from the "type::name" key, then prefer any
			// richer data from the instance-info cache.
			instType := "unknown"
			instName := key
			if parts := strings.SplitN(key, "::", 2); len(parts) == 2 {
				if parts[0] != "" {
					instType = parts[0]
				}
				if parts[1] != "" {
					instName = parts[1]
				}
			}
			instType = strings.TrimSpace(instType)
			instName = strings.TrimSpace(instName)
			info := instanceInfos[key]
			display := instName
			connection := ""
			if info != nil {
				if instType == "unknown" || instType == "" {
					if info.Type != "" {
						instType = string(info.Type)
					}
				}
				if strings.Contains(info.Key, "::") {
					if parts := strings.SplitN(info.Key, "::", 2); len(parts) == 2 {
						if instName == key {
							instName = parts[1]
						}
						if (instType == "" || instType == "unknown") && parts[0] != "" {
							instType = parts[0]
						}
					}
				}
				if info.DisplayName != "" {
					display = info.DisplayName
				}
				if info.Connection != "" {
					connection = info.Connection
				}
			}
			display = strings.TrimSpace(display)
			connection = strings.TrimSpace(connection)
			if display == "" {
				display = instName
			}
			if display == "" {
				display = connection
			}
			if instType == "" {
				instType = "unknown"
			}
			if instName == "" {
				instName = key
			}
			status, hasStatus := pollStatuses[key]
			instanceStatus := InstancePollStatus{}
			if hasStatus {
				instanceStatus.ConsecutiveFailures = status.ConsecutiveFailures
				instanceStatus.LastSuccess = timePtr(status.LastSuccess)
				if !status.FirstFailureAt.IsZero() {
					instanceStatus.FirstFailureAt = timePtr(status.FirstFailureAt)
				}
				if !status.LastErrorAt.IsZero() && status.LastErrorMessage != "" {
					instanceStatus.LastError = &ErrorDetail{
						At:       status.LastErrorAt,
						Message:  status.LastErrorMessage,
						Category: status.LastErrorCategory,
					}
				}
			}
			// Default breaker view is closed/zero when no breaker exists.
			breakerInfo := InstanceBreaker{
				State:        "closed",
				FailureCount: 0,
			}
			if br, ok := breakerRefs[key]; ok && br != nil {
				state, failures, retryAt, since, lastTransition := br.stateDetails()
				if state != "" {
					breakerInfo.State = state
				}
				breakerInfo.FailureCount = failures
				breakerInfo.RetryAt = timePtr(retryAt)
				breakerInfo.Since = timePtr(since)
				breakerInfo.LastTransition = timePtr(lastTransition)
			}
			dlqInfo := InstanceDLQ{Present: false}
			if dlq, ok := dlqInsights[key]; ok {
				dlqInfo.Present = true
				dlqInfo.Reason = dlq.Reason
				dlqInfo.FirstAttempt = timePtr(dlq.FirstAttempt)
				dlqInfo.LastAttempt = timePtr(dlq.LastAttempt)
				dlqInfo.RetryCount = dlq.RetryCount
				dlqInfo.NextRetry = timePtr(dlq.NextRetry)
			}
			// Collect any warnings for this instance
			var warnings []string
			if instType == "pve" {
				// NOTE(review): backupPermissionWarnings is read here without
				// holding m.mu — confirm the map is only written at startup or
				// is otherwise safe for concurrent reads.
				if warning, ok := m.backupPermissionWarnings[instName]; ok {
					warnings = append(warnings, warning)
				}
			}
			instances = append(instances, InstanceHealth{
				Key:         key,
				Type:        instType,
				DisplayName: display,
				Instance:    instName,
				Connection:  connection,
				PollStatus:  instanceStatus,
				Breaker:     breakerInfo,
				DeadLetter:  dlqInfo,
				Warnings:    warnings,
			}.NormalizeCollections())
		}
		response.Instances = instances
	} else {
		response.Instances = []InstanceHealth{}
	}
	return response
}
// isTransientError reports whether a poll error should be retried rather
// than treated as permanent. A nil error is considered transient (callers
// record success outcomes with transient=true). Retryable errors and
// context cancellation/deadline expiry are transient; everything else is not.
func isTransientError(err error) bool {
	switch {
	case err == nil:
		return true
	case errors.IsRetryableError(err):
		return true
	case stderrors.Is(err, context.Canceled), stderrors.Is(err, context.DeadlineExceeded):
		return true
	default:
		return false
	}
}
// GetState returns the current infrastructure state snapshot. In mock mode
// the fixture graph's state is served (with alert snapshots populated
// lazily); otherwise the cached state is returned with its alert arrays
// refreshed from the live alert manager. A nil monitor or nil state yields
// an empty snapshot.
func (m *Monitor) GetState() models.StateSnapshot {
	if m == nil {
		return models.StateSnapshot{}
	}
	// Check if mock mode is enabled
	if mock.IsMockEnabled() {
		state := mock.CurrentFixtureGraph().State
		if state.ActiveAlerts == nil && m.alertManager != nil {
			// Populate snapshot lazily if the cache hasn't been filled yet.
			mock.UpdateAlertSnapshots(m.alertManager.GetActiveAlerts(), m.alertManager.GetRecentlyResolved())
			state = mock.CurrentFixtureGraph().State
		}
		return state
	}
	if m.state == nil {
		return models.StateSnapshot{}
	}
	state := m.state.GetSnapshot()
	// Keep externally served alert arrays aligned with the live alert manager
	// even between explicit sync points, so APIs do not expose stale alert
	// counts or recently resolved incidents from cached state.
	state.ActiveAlerts = m.activeAlertsSnapshot()
	state.RecentlyResolved = m.recentlyResolvedAlertsSnapshot()
	return state
}
// ReadSnapshot returns a snapshot of the current infrastructure state,
// respecting mock mode when enabled.
//
// This is the preferred accessor for consumer code that needs the full
// StateSnapshot (e.g., chart rendering, reporting, AI state queries).
// This method satisfies models.SnapshotProvider — the single canonical
// interface that all consumer packages depend on. Fields available
// via ReadState should be accessed there instead when practical.
// It is a thin alias for GetState and inherits its nil-safety.
func (m *Monitor) ReadSnapshot() models.StateSnapshot {
	return m.GetState()
}
// BackupsSnapshot returns the current backup state.
// It delegates to GetState, so mock mode and live alert overlays apply.
func (m *Monitor) BackupsSnapshot() models.Backups {
	return m.GetState().Backups
}
// PBSInstancesSnapshot returns the current PBS instances, materialized from
// the unified read state. It returns nil when the monitor is nil, the read
// state is unavailable, or no PBS instances are known; nil view entries are
// skipped.
func (m *Monitor) PBSInstancesSnapshot() []models.PBSInstance {
	if m == nil {
		return nil
	}
	readState := m.GetUnifiedReadStateOrSnapshot()
	if readState == nil {
		return nil
	}
	views := readState.PBSInstances()
	if len(views) == 0 {
		return nil
	}
	out := make([]models.PBSInstance, 0, len(views))
	for _, view := range views {
		if view == nil {
			continue
		}
		out = append(out, pbsInstanceFromReadStateView(view))
	}
	return out
}
// ReplicationJobsSnapshot returns the current replication jobs.
// It delegates to GetState, so mock mode and live alert overlays apply.
func (m *Monitor) ReplicationJobsSnapshot() []models.ReplicationJob {
	return m.GetState().ReplicationJobs
}
// ConnectionHealthSnapshot returns the current connection health map.
// It delegates to GetState, so mock mode and live alert overlays apply.
func (m *Monitor) ConnectionHealthSnapshot() map[string]bool {
	return m.GetState().ConnectionHealth
}
// HostsSnapshot returns the current hosts, materialized from the unified
// read state. It returns nil when the monitor is nil, the read state is
// unavailable, or no hosts are known; nil view entries are skipped.
func (m *Monitor) HostsSnapshot() []models.Host {
	if m == nil {
		return nil
	}
	readState := m.GetUnifiedReadStateOrSnapshot()
	if readState == nil {
		return nil
	}
	views := readState.Hosts()
	if len(views) == 0 {
		return nil
	}
	out := make([]models.Host, 0, len(views))
	for _, view := range views {
		if view == nil {
			continue
		}
		out = append(out, hostFromReadStateView(view))
	}
	return out
}
// VMsSnapshot returns the current VMs, materialized from the unified read
// state. It returns nil when the monitor is nil, the read state is
// unavailable, or no VMs are known; nil view entries are skipped.
func (m *Monitor) VMsSnapshot() []models.VM {
	if m == nil {
		return nil
	}
	readState := m.GetUnifiedReadStateOrSnapshot()
	if readState == nil {
		return nil
	}
	views := readState.VMs()
	if len(views) == 0 {
		return nil
	}
	out := make([]models.VM, 0, len(views))
	for _, view := range views {
		if view == nil {
			continue
		}
		out = append(out, vmFromReadStateView(view))
	}
	return out
}
// ContainersSnapshot returns the current system containers, materialized
// from the unified read state. It returns nil when the monitor is nil, the
// read state is unavailable, or no containers are known; nil view entries
// are skipped.
func (m *Monitor) ContainersSnapshot() []models.Container {
	if m == nil {
		return nil
	}
	readState := m.GetUnifiedReadStateOrSnapshot()
	if readState == nil {
		return nil
	}
	views := readState.Containers()
	if len(views) == 0 {
		return nil
	}
	out := make([]models.Container, 0, len(views))
	for _, view := range views {
		if view == nil {
			continue
		}
		out = append(out, containerFromReadStateView(view))
	}
	return out
}
// NodesSnapshot returns the current Proxmox nodes, materialized from the
// unified read state. It returns nil when the monitor is nil, the read
// state is unavailable, or no nodes are known; nil view entries are skipped.
func (m *Monitor) NodesSnapshot() []models.Node {
	if m == nil {
		return nil
	}
	readState := m.GetUnifiedReadStateOrSnapshot()
	if readState == nil {
		return nil
	}
	views := readState.Nodes()
	if len(views) == 0 {
		return nil
	}
	out := make([]models.Node, 0, len(views))
	for _, view := range views {
		if view == nil {
			continue
		}
		out = append(out, nodeFromReadStateView(view))
	}
	return out
}
// DockerHostsSnapshot returns the current Docker hosts, materialized from
// the unified read state. It returns nil when the monitor is nil, the read
// state is unavailable, or no Docker hosts are known; nil view entries are
// skipped.
func (m *Monitor) DockerHostsSnapshot() []models.DockerHost {
	if m == nil {
		return nil
	}
	readState := m.GetUnifiedReadStateOrSnapshot()
	if readState == nil {
		return nil
	}
	views := readState.DockerHosts()
	if len(views) == 0 {
		return nil
	}
	out := make([]models.DockerHost, 0, len(views))
	for _, view := range views {
		if view == nil {
			continue
		}
		out = append(out, dockerHostFromReadStateView(view))
	}
	return out
}
// StorageSnapshot returns the current storage pools, materialized from the
// unified read state. It returns nil when the monitor is nil, the read
// state is unavailable, or no pools are known; nil pool entries are skipped.
func (m *Monitor) StorageSnapshot() []models.Storage {
	if m == nil {
		return nil
	}
	readState := m.GetUnifiedReadStateOrSnapshot()
	if readState == nil {
		return nil
	}
	pools := readState.StoragePools()
	if len(pools) == 0 {
		return nil
	}
	out := make([]models.Storage, 0, len(pools))
	for _, pool := range pools {
		if pool == nil {
			continue
		}
		out = append(out, storageFromReadStateView(pool))
	}
	return out
}
// storageFromReadStateView converts a unified storage-pool view into the
// models.Storage shape served by the API. The ID prefers the source ID over
// the unified ID, and Free is derived as Total-Used clamped at zero so a
// stale Used value can never produce a negative free figure.
func storageFromReadStateView(view *unifiedresources.StoragePoolView) models.Storage {
	if view == nil {
		return models.Storage{}
	}
	storageID := strings.TrimSpace(view.SourceID())
	if storageID == "" {
		storageID = strings.TrimSpace(view.ID())
	}
	nodes := view.AccessibleNodes()
	nodeIDs := storageNodeIDsFromReadState(view.Instance(), nodes)
	total := view.DiskTotal()
	used := view.DiskUsed()
	free := total - used
	if free < 0 {
		free = 0
	}
	return models.Storage{
		ID:        storageID,
		Name:      view.Name(),
		Node:      view.Node(),
		Instance:  view.Instance(),
		Nodes:     nodes,
		NodeIDs:   nodeIDs,
		NodeCount: len(nodes),
		Type:      view.StorageType(),
		Status:    string(view.Status()),
		Path:      view.Path(),
		Total:     total,
		Used:      used,
		Free:      free,
		Usage:     view.DiskPercent(),
		Content:   view.Content(),
		Shared:    view.Shared(),
		Enabled:   view.Enabled(),
		Active:    view.Active(),
		ZFSPool:   storageZFSPoolFromReadStateView(view),
	}
}
// nodeFromReadStateView converts a unified node view into models.Node.
// DisplayName is only populated when the view's display name differs from
// the underlying node name; memory/disk Free values are clamped at zero.
func nodeFromReadStateView(view *unifiedresources.NodeView) models.Node {
	if view == nil {
		return models.Node{}
	}
	name := view.NodeName()
	displayName := ""
	if trimmed := strings.TrimSpace(view.Name()); trimmed != "" && trimmed != name {
		displayName = trimmed
	}
	return models.Node{
		ID:                           firstNonEmptyString(view.SourceID(), view.ID()),
		Name:                         name,
		DisplayName:                  displayName,
		Instance:                     view.Instance(),
		Host:                         view.HostURL(),
		GuestURL:                     view.GuestURL(),
		Status:                       string(view.Status()),
		Type:                         "node",
		CPU:                          view.CPUPercent(),
		Memory:                       models.Memory{Used: view.MemoryUsed(), Total: view.MemoryTotal(), Free: maxInt64(0, view.MemoryTotal()-view.MemoryUsed()), Usage: view.MemoryPercent()},
		Disk:                         models.Disk{Used: view.DiskUsed(), Total: view.DiskTotal(), Free: maxInt64(0, view.DiskTotal()-view.DiskUsed()), Usage: view.DiskPercent()},
		Uptime:                       view.Uptime(),
		LoadAverage:                  view.LoadAverage(),
		KernelVersion:                view.KernelVersion(),
		PVEVersion:                   view.PVEVersion(),
		CPUInfo:                      view.CPUInfo(),
		Temperature:                  view.TemperatureDetails(),
		TemperatureMonitoringEnabled: view.TemperatureMonitoringEnabled(),
		LastSeen:                     view.LastSeen(),
		ConnectionHealth:             view.ConnectionHealth(),
		IsClusterMember:              view.IsClusterMember(),
		ClusterName:                  view.ClusterName(),
		PendingUpdates:               view.PendingUpdates(),
		PendingUpdatesCheckedAt:      view.PendingUpdatesCheckedAt(),
		LinkedAgentID:                view.LinkedAgentID(),
	}
}
// hostFromReadStateView converts a unified host view into models.Host.
// The ID prefers the agent ID over the unified ID; DisplayName is only set
// when the view's name differs from the hostname. Nested collections are
// converted through the dedicated host*FromReadStateView helpers.
func hostFromReadStateView(view *unifiedresources.HostView) models.Host {
	if view == nil {
		return models.Host{}
	}
	displayName := ""
	if trimmed := strings.TrimSpace(view.Name()); trimmed != "" && trimmed != view.Hostname() {
		displayName = trimmed
	}
	return models.Host{
		ID:                firstNonEmptyString(view.AgentID(), view.ID()),
		Hostname:          view.Hostname(),
		DisplayName:       displayName,
		Platform:          view.Platform(),
		OSName:            view.OSName(),
		OSVersion:         view.OSVersion(),
		KernelVersion:     view.KernelVersion(),
		Architecture:      view.Architecture(),
		CPUCount:          view.CPUCount(),
		CPUUsage:          view.CPUPercent(),
		Memory:            hostMemoryFromReadStateView(view),
		LoadAverage:       view.LoadAverage(),
		Disks:             hostDisksFromReadStateView(view.Disks()),
		DiskIO:            hostDiskIOFromReadStateView(view.DiskIO()),
		NetworkInterfaces: hostNetworkInterfacesFromReadStateView(view.NetworkInterfaces()),
		Sensors:           hostSensorsFromReadStateView(view.Sensors()),
		RAID:              hostRAIDFromReadStateView(view.RAID()),
		Unraid:            hostUnraidFromReadStateView(view.Unraid()),
		Ceph:              hostCephFromReadStateView(view.Ceph()),
		Status:            string(view.Status()),
		UptimeSeconds:     view.UptimeSeconds(),
		IntervalSeconds:   view.IntervalSeconds(),
		LastSeen:          view.LastSeen(),
		AgentVersion:      view.AgentVersion(),
		MachineID:         view.MachineID(),
		CommandsEnabled:   view.CommandsEnabled(),
		ReportIP:          view.ReportIP(),
		TokenID:           view.TokenID(),
		TokenName:         view.TokenName(),
		TokenHint:         view.TokenHint(),
		TokenLastUsedAt:   view.TokenLastUsedAt(),
		Tags:              view.Tags(),
		DiskExclude:       view.DiskExclude(),
		IsLegacy:          view.IsLegacy(),
		NetInRate:         view.NetInRate(),
		NetOutRate:        view.NetOutRate(),
		DiskReadRate:      view.DiskReadRate(),
		DiskWriteRate:     view.DiskWriteRate(),
		LinkedNodeID:      view.LinkedNodeID(),
		LinkedVMID:        view.LinkedVMID(),
		LinkedContainerID: view.LinkedContainerID(),
	}
}
// vmFromReadStateView converts a unified VM view into models.VM.
// Memory/disk Free values are clamped at zero, and the unsigned network and
// disk I/O counters are converted to int64 with a zero floor so overflow can
// never surface as a negative rate.
func vmFromReadStateView(view *unifiedresources.VMView) models.VM {
	if view == nil {
		return models.VM{}
	}
	totalMemory := view.MemoryTotal()
	usedMemory := view.MemoryUsed()
	totalDisk := view.DiskTotal()
	usedDisk := view.DiskUsed()
	return models.VM{
		ID:                firstNonEmptyString(view.SourceID(), view.ID()),
		VMID:              view.VMID(),
		Name:              view.Name(),
		Node:              view.Node(),
		Instance:          view.Instance(),
		Status:            string(view.Status()),
		Type:              "qemu",
		CPU:               view.CPUPercent(),
		CPUs:              view.CPUs(),
		Memory:            models.Memory{Total: totalMemory, Used: usedMemory, Free: maxInt64(0, totalMemory-usedMemory), Usage: view.MemoryPercent()},
		Disk:              models.Disk{Used: usedDisk, Total: totalDisk, Free: maxInt64(0, totalDisk-usedDisk), Usage: view.DiskPercent()},
		Disks:             guestDisksFromReadStateView(view.Disks()),
		DiskStatusReason:  view.DiskStatusReason(),
		IPAddresses:       view.IPAddresses(),
		OSName:            view.OSName(),
		OSVersion:         view.OSVersion(),
		AgentVersion:      view.AgentVersion(),
		NetworkInterfaces: guestNetworkInterfacesFromReadStateView(view.NetworkInterfaces()),
		NetworkIn:         maxInt64(0, int64(view.NetIn())),
		NetworkOut:        maxInt64(0, int64(view.NetOut())),
		DiskRead:          maxInt64(0, int64(view.DiskRead())),
		DiskWrite:         maxInt64(0, int64(view.DiskWrite())),
		Uptime:            view.Uptime(),
		Template:          view.Template(),
		LastBackup:        view.LastBackup(),
		Tags:              view.Tags(),
		Lock:              view.Lock(),
		LastSeen:          view.LastSeen(),
	}
}
// containerFromReadStateView converts a unified container view into
// models.Container. Type falls back to "lxc" when the view does not report
// one; Free values are clamped at zero and unsigned I/O counters are
// converted to int64 with a zero floor, mirroring vmFromReadStateView.
func containerFromReadStateView(view *unifiedresources.ContainerView) models.Container {
	if view == nil {
		return models.Container{}
	}
	totalMemory := view.MemoryTotal()
	usedMemory := view.MemoryUsed()
	totalDisk := view.DiskTotal()
	usedDisk := view.DiskUsed()
	return models.Container{
		ID:                firstNonEmptyString(view.SourceID(), view.ID()),
		VMID:              view.VMID(),
		Name:              view.Name(),
		Node:              view.Node(),
		Instance:          view.Instance(),
		Status:            string(view.Status()),
		Type:              firstNonEmptyString(view.ContainerType(), "lxc"),
		CPU:               view.CPUPercent(),
		CPUs:              view.CPUs(),
		Memory:            models.Memory{Total: totalMemory, Used: usedMemory, Free: maxInt64(0, totalMemory-usedMemory), Usage: view.MemoryPercent()},
		Disk:              models.Disk{Used: usedDisk, Total: totalDisk, Free: maxInt64(0, totalDisk-usedDisk), Usage: view.DiskPercent()},
		Disks:             guestDisksFromReadStateView(view.Disks()),
		NetworkIn:         maxInt64(0, int64(view.NetIn())),
		NetworkOut:        maxInt64(0, int64(view.NetOut())),
		DiskRead:          maxInt64(0, int64(view.DiskRead())),
		DiskWrite:         maxInt64(0, int64(view.DiskWrite())),
		Uptime:            view.Uptime(),
		Template:          view.Template(),
		LastBackup:        view.LastBackup(),
		Tags:              view.Tags(),
		Lock:              view.Lock(),
		LastSeen:          view.LastSeen(),
		IPAddresses:       view.IPAddresses(),
		NetworkInterfaces: guestNetworkInterfacesFromReadStateView(view.NetworkInterfaces()),
		OSName:            view.OSName(),
		IsOCI:             view.IsOCI(),
		OSTemplate:        view.OSTemplate(),
		HasDocker:         view.HasDocker(),
		DockerCheckedAt:   view.DockerCheckedAt(),
	}
}
// guestDisksFromReadStateView converts unified guest disk entries into
// models.Disk values. An empty input yields nil.
func guestDisksFromReadStateView(disks []unifiedresources.DiskInfo) []models.Disk {
	if len(disks) == 0 {
		return nil
	}
	converted := make([]models.Disk, 0, len(disks))
	for i := range disks {
		src := &disks[i]
		converted = append(converted, models.Disk{
			Total:      src.Total,
			Used:       src.Used,
			Free:       src.Free,
			Usage:      src.Usage,
			Mountpoint: src.Mountpoint,
			Type:       src.Filesystem,
			Device:     src.Device,
		})
	}
	return converted
}
// guestNetworkInterfacesFromReadStateView converts unified guest network
// interfaces into model values, copying the address slice and clamping the
// unsigned byte counters to non-negative int64. Empty input yields nil.
func guestNetworkInterfacesFromReadStateView(interfaces []unifiedresources.NetworkInterface) []models.GuestNetworkInterface {
	if len(interfaces) == 0 {
		return nil
	}
	converted := make([]models.GuestNetworkInterface, 0, len(interfaces))
	for i := range interfaces {
		src := &interfaces[i]
		converted = append(converted, models.GuestNetworkInterface{
			Name:      src.Name,
			MAC:       src.MAC,
			Addresses: append([]string(nil), src.Addresses...),
			RXBytes:   maxInt64(0, int64(src.RXBytes)),
			TXBytes:   maxInt64(0, int64(src.TXBytes)),
		})
	}
	return converted
}
// pbsInstanceFromReadStateView converts a unified PBS instance view into
// models.PBSInstance. The ID prefers the instance ID over the unified ID.
func pbsInstanceFromReadStateView(view *unifiedresources.PBSInstanceView) models.PBSInstance {
	if view == nil {
		return models.PBSInstance{}
	}
	return models.PBSInstance{
		ID:               firstNonEmptyString(view.InstanceID(), view.ID()),
		Name:             view.Name(),
		Host:             view.HostURL(),
		GuestURL:         view.GuestURL(),
		Status:           string(view.Status()),
		Version:          view.Version(),
		CPU:              view.CPUPercent(),
		Memory:           view.MemoryPercent(),
		MemoryUsed:       view.MemoryUsed(),
		MemoryTotal:      view.MemoryTotal(),
		Uptime:           view.UptimeSeconds(),
		Datastores:       view.DatastoreDetails(),
		BackupJobs:       view.BackupJobs(),
		SyncJobs:         view.SyncJobs(),
		VerifyJobs:       view.VerifyJobs(),
		PruneJobs:        view.PruneJobs(),
		GarbageJobs:      view.GarbageJobs(),
		ConnectionHealth: view.ConnectionHealth(),
		LastSeen:         view.LastSeen(),
	}
}
// hostMemoryFromReadStateView builds the host memory summary from a unified
// host view, deriving Free as Total-Used clamped at zero. A nil view yields
// the zero value.
func hostMemoryFromReadStateView(view *unifiedresources.HostView) models.Memory {
	if view == nil {
		return models.Memory{}
	}
	mem := models.Memory{
		Total:     view.MemoryTotal(),
		Used:      view.MemoryUsed(),
		Usage:     view.MemoryPercent(),
		SwapUsed:  view.SwapUsed(),
		SwapTotal: view.SwapTotal(),
	}
	mem.Free = maxInt64(0, mem.Total-mem.Used)
	return mem
}
// hostDisksFromReadStateView converts unified host disk entries into
// models.Disk values. An empty input yields nil.
func hostDisksFromReadStateView(disks []unifiedresources.DiskInfo) []models.Disk {
	if len(disks) == 0 {
		return nil
	}
	converted := make([]models.Disk, 0, len(disks))
	for i := range disks {
		src := &disks[i]
		converted = append(converted, models.Disk{
			Total:      src.Total,
			Used:       src.Used,
			Free:       src.Free,
			Usage:      src.Usage,
			Mountpoint: src.Mountpoint,
			Type:       src.Filesystem,
			Device:     src.Device,
		})
	}
	return converted
}
// hostDiskIOFromReadStateView converts unified per-device disk I/O counters
// into models.DiskIO values. An empty input yields nil.
func hostDiskIOFromReadStateView(diskIO []unifiedresources.HostDiskIOMeta) []models.DiskIO {
	if len(diskIO) == 0 {
		return nil
	}
	converted := make([]models.DiskIO, 0, len(diskIO))
	for i := range diskIO {
		src := &diskIO[i]
		converted = append(converted, models.DiskIO{
			Device:     src.Device,
			ReadBytes:  src.ReadBytes,
			WriteBytes: src.WriteBytes,
			ReadOps:    src.ReadOps,
			WriteOps:   src.WriteOps,
			IOTime:     src.IOTimeMs,
		})
	}
	return converted
}
// hostNetworkInterfacesFromReadStateView converts unified host network
// interfaces into model values, copying the address slice and the optional
// link-speed pointer. An empty input yields nil.
func hostNetworkInterfacesFromReadStateView(interfaces []unifiedresources.NetworkInterface) []models.HostNetworkInterface {
	if len(interfaces) == 0 {
		return nil
	}
	converted := make([]models.HostNetworkInterface, 0, len(interfaces))
	for i := range interfaces {
		src := &interfaces[i]
		converted = append(converted, models.HostNetworkInterface{
			Name:      src.Name,
			MAC:       src.MAC,
			Addresses: append([]string(nil), src.Addresses...),
			RXBytes:   src.RXBytes,
			TXBytes:   src.TXBytes,
			SpeedMbps: int64PtrCopy(src.SpeedMbps),
		})
	}
	return converted
}
// hostSensorsFromReadStateView deep-copies host sensor metadata into the
// model summary: temperature/fan/additional readings are copied map-by-map
// (left nil when empty) and SMART records are cloned, including their
// attribute maps. A nil input yields the zero summary.
func hostSensorsFromReadStateView(sensors *unifiedresources.HostSensorMeta) models.HostSensorSummary {
	if sensors == nil {
		return models.HostSensorSummary{}
	}
	summary := models.HostSensorSummary{}
	if n := len(sensors.TemperatureCelsius); n > 0 {
		summary.TemperatureCelsius = make(map[string]float64, n)
		for name, reading := range sensors.TemperatureCelsius {
			summary.TemperatureCelsius[name] = reading
		}
	}
	if n := len(sensors.FanRPM); n > 0 {
		summary.FanRPM = make(map[string]float64, n)
		for name, reading := range sensors.FanRPM {
			summary.FanRPM[name] = reading
		}
	}
	if n := len(sensors.Additional); n > 0 {
		summary.Additional = make(map[string]float64, n)
		for name, reading := range sensors.Additional {
			summary.Additional[name] = reading
		}
	}
	if n := len(sensors.SMART); n > 0 {
		summary.SMART = make([]models.HostDiskSMART, 0, n)
		for _, record := range sensors.SMART {
			summary.SMART = append(summary.SMART, models.HostDiskSMART{
				Device:      record.Device,
				Model:       record.Model,
				Serial:      record.Serial,
				WWN:         record.WWN,
				Type:        record.Type,
				Temperature: record.Temperature,
				Health:      record.Health,
				Standby:     record.Standby,
				Attributes:  smartAttributesCopy(record.Attributes),
			})
		}
	}
	return summary
}
// hostRAIDFromReadStateView converts RAID array metadata from the unified
// read-state into the models representation, including per-array member
// devices. Returns nil when there are no arrays.
func hostRAIDFromReadStateView(raid []unifiedresources.HostRAIDMeta) []models.HostRAIDArray {
	if len(raid) == 0 {
		return nil
	}
	arrays := make([]models.HostRAIDArray, 0, len(raid))
	for i := range raid {
		src := &raid[i]
		members := make([]models.HostRAIDDevice, len(src.Devices))
		for j, member := range src.Devices {
			members[j] = models.HostRAIDDevice{
				Device: member.Device,
				State:  member.State,
				Slot:   member.Slot,
			}
		}
		arrays = append(arrays, models.HostRAIDArray{
			Device:         src.Device,
			Name:           src.Name,
			Level:          src.Level,
			State:          src.State,
			TotalDevices:   src.TotalDevices,
			ActiveDevices:  src.ActiveDevices,
			WorkingDevices: src.WorkingDevices,
			FailedDevices:  src.FailedDevices,
			SpareDevices:   src.SpareDevices,
			UUID:           src.UUID,
			Devices:        members,
			RebuildPercent: src.RebuildPercent,
			RebuildSpeed:   src.RebuildSpeed,
		})
	}
	return arrays
}
// hostUnraidFromReadStateView converts Unraid array metadata from the unified
// read-state into its models representation; nil input maps to nil output.
func hostUnraidFromReadStateView(unraid *unifiedresources.HostUnraidMeta) *models.HostUnraidStorage {
	if unraid == nil {
		return nil
	}
	result := models.HostUnraidStorage{
		ArrayStarted: unraid.ArrayStarted,
		ArrayState:   unraid.ArrayState,
		SyncAction:   unraid.SyncAction,
		SyncProgress: unraid.SyncProgress,
		SyncErrors:   unraid.SyncErrors,
		NumProtected: unraid.NumProtected,
		NumDisabled:  unraid.NumDisabled,
		NumInvalid:   unraid.NumInvalid,
		NumMissing:   unraid.NumMissing,
	}
	if count := len(unraid.Disks); count > 0 {
		result.Disks = make([]models.HostUnraidDisk, count)
		for i, disk := range unraid.Disks {
			result.Disks[i] = models.HostUnraidDisk{
				Name:       disk.Name,
				Device:     disk.Device,
				Role:       disk.Role,
				Status:     disk.Status,
				RawStatus:  disk.RawStatus,
				Serial:     disk.Serial,
				Filesystem: disk.Filesystem,
				SizeBytes:  disk.SizeBytes,
				Slot:       disk.Slot,
			}
		}
	}
	return &result
}
// hostCephFromReadStateView converts Ceph cluster metadata from the unified
// read-state into its models representation. The fixed-size maps (mon/mgr/
// osd/pg) are copied inline; variable-length collections (health summaries,
// health checks, monitors, pools, services) are deep-copied only when
// non-empty so empty slices/maps stay nil on the result. Returns nil for nil
// input.
func hostCephFromReadStateView(ceph *unifiedresources.HostCephMeta) *models.HostCephCluster {
	if ceph == nil {
		return nil
	}
	// Scalar and fixed-shape sections are copied field by field.
	out := &models.HostCephCluster{
		FSID: ceph.FSID,
		Health: models.HostCephHealth{
			Status: ceph.Health.Status,
		},
		MonMap: models.HostCephMonitorMap{
			Epoch:   ceph.MonMap.Epoch,
			NumMons: ceph.MonMap.NumMons,
		},
		MgrMap: models.HostCephManagerMap{
			Available: ceph.MgrMap.Available,
			NumMgrs:   ceph.MgrMap.NumMgrs,
			ActiveMgr: ceph.MgrMap.ActiveMgr,
			Standbys:  ceph.MgrMap.Standbys,
		},
		OSDMap: models.HostCephOSDMap{
			Epoch:   ceph.OSDMap.Epoch,
			NumOSDs: ceph.OSDMap.NumOSDs,
			NumUp:   ceph.OSDMap.NumUp,
			NumIn:   ceph.OSDMap.NumIn,
			NumDown: ceph.OSDMap.NumDown,
			NumOut:  ceph.OSDMap.NumOut,
		},
		PGMap: models.HostCephPGMap{
			NumPGs:           ceph.PGMap.NumPGs,
			BytesTotal:       ceph.PGMap.BytesTotal,
			BytesUsed:        ceph.PGMap.BytesUsed,
			BytesAvailable:   ceph.PGMap.BytesAvailable,
			DataBytes:        ceph.PGMap.DataBytes,
			UsagePercent:     ceph.PGMap.UsagePercent,
			DegradedRatio:    ceph.PGMap.DegradedRatio,
			MisplacedRatio:   ceph.PGMap.MisplacedRatio,
			ReadBytesPerSec:  ceph.PGMap.ReadBytesPerSec,
			WriteBytesPerSec: ceph.PGMap.WriteBytesPerSec,
			ReadOpsPerSec:    ceph.PGMap.ReadOpsPerSec,
			WriteOpsPerSec:   ceph.PGMap.WriteOpsPerSec,
		},
		CollectedAt: ceph.CollectedAt,
	}
	// Health summaries (one line per advisory).
	if len(ceph.Health.Summary) > 0 {
		out.Health.Summary = make([]models.HostCephHealthSummary, 0, len(ceph.Health.Summary))
		for _, summary := range ceph.Health.Summary {
			out.Health.Summary = append(out.Health.Summary, models.HostCephHealthSummary{
				Severity: summary.Severity,
				Message:  summary.Message,
			})
		}
	}
	// Named health checks; detail slices are cloned so the source is not shared.
	if len(ceph.Health.Checks) > 0 {
		out.Health.Checks = make(map[string]models.HostCephCheck, len(ceph.Health.Checks))
		for name, check := range ceph.Health.Checks {
			out.Health.Checks[name] = models.HostCephCheck{
				Severity: check.Severity,
				Message:  check.Message,
				Detail:   append([]string(nil), check.Detail...),
			}
		}
	}
	// Per-monitor entries from the monitor map.
	if len(ceph.MonMap.Monitors) > 0 {
		out.MonMap.Monitors = make([]models.HostCephMonitor, 0, len(ceph.MonMap.Monitors))
		for _, monitor := range ceph.MonMap.Monitors {
			out.MonMap.Monitors = append(out.MonMap.Monitors, models.HostCephMonitor{
				Name:   monitor.Name,
				Rank:   monitor.Rank,
				Addr:   monitor.Addr,
				Status: monitor.Status,
			})
		}
	}
	// Per-pool capacity/usage entries.
	if len(ceph.Pools) > 0 {
		out.Pools = make([]models.HostCephPool, 0, len(ceph.Pools))
		for _, pool := range ceph.Pools {
			out.Pools = append(out.Pools, models.HostCephPool{
				ID:             pool.ID,
				Name:           pool.Name,
				BytesUsed:      pool.BytesUsed,
				BytesAvailable: pool.BytesAvailable,
				Objects:        pool.Objects,
				PercentUsed:    pool.PercentUsed,
			})
		}
	}
	// Per-service daemon counts; daemon name lists are cloned.
	if len(ceph.Services) > 0 {
		out.Services = make([]models.HostCephService, 0, len(ceph.Services))
		for _, service := range ceph.Services {
			out.Services = append(out.Services, models.HostCephService{
				Type:    service.Type,
				Running: service.Running,
				Total:   service.Total,
				Daemons: append([]string(nil), service.Daemons...),
			})
		}
	}
	return out
}
// dockerHostFromReadStateView converts a unified DockerHostView into the
// legacy models.DockerHost payload. A nil view yields the zero value. Memory
// totals fall back from TotalMemoryBytes() to MemoryTotal() when the former is
// unset, and free memory is derived (clamped at zero) rather than read from
// the view.
func dockerHostFromReadStateView(view *unifiedresources.DockerHostView) models.DockerHost {
	if view == nil {
		return models.DockerHost{}
	}
	totalMemory := view.TotalMemoryBytes()
	if totalMemory == 0 {
		// Fall back to the alternate accessor when the primary total is absent.
		totalMemory = view.MemoryTotal()
	}
	usedMemory := view.MemoryUsed()
	// Clamp so a stale/mismatched used value never produces negative free memory.
	freeMemory := maxInt64(0, totalMemory-usedMemory)
	return models.DockerHost{
		// Prefer the host source ID; fall back to the view's own ID when blank.
		ID:                firstNonEmptyString(view.HostSourceID(), view.ID()),
		AgentID:           view.AgentID(),
		Hostname:          view.Hostname(),
		DisplayName:       view.DisplayName(),
		CustomDisplayName: view.CustomDisplayName(),
		MachineID:         view.MachineID(),
		OS:                view.OS(),
		KernelVersion:     view.KernelVersion(),
		Architecture:      view.Architecture(),
		Runtime:           view.Runtime(),
		RuntimeVersion:    view.RuntimeVersion(),
		DockerVersion:     view.DockerVersion(),
		CPUs:              view.CPUs(),
		TotalMemoryBytes:  totalMemory,
		UptimeSeconds:     view.UptimeSeconds(),
		CPUUsage:          view.CPUPercent(),
		LoadAverage:       view.LoadAverage(),
		Memory: models.Memory{
			Total: totalMemory,
			Used:  usedMemory,
			Free:  freeMemory,
			Usage: view.MemoryPercent(),
		},
		Disks:             hostDisksFromReadStateView(view.Disks()),
		NetworkInterfaces: hostNetworkInterfacesFromReadStateView(view.NetworkInterfaces()),
		Status:            string(view.Status()),
		LastSeen:          view.LastSeen(),
		IntervalSeconds:   view.IntervalSeconds(),
		AgentVersion:      view.AgentVersion(),
		Containers:        view.Containers(),
		Services:          view.Services(),
		Tasks:             view.Tasks(),
		Swarm:             dockerSwarmFromReadStateView(view.Swarm()),
		TokenID:           view.TokenID(),
		TokenName:         view.TokenName(),
		TokenHint:         view.TokenHint(),
		TokenLastUsedAt:   view.TokenLastUsedAt(),
		Hidden:            view.Hidden(),
		PendingUninstall:  view.PendingUninstall(),
		Command:           view.Command(),
		IsLegacy:          view.IsLegacy(),
		NetInRate:         view.NetInRate(),
		NetOutRate:        view.NetOutRate(),
		DiskReadRate:      view.DiskReadRate(),
		DiskWriteRate:     view.DiskWriteRate(),
	}
}
// dockerSwarmFromReadStateView converts unified swarm metadata into the
// models representation; nil input maps to nil output.
func dockerSwarmFromReadStateView(in *unifiedresources.DockerSwarmInfo) *models.DockerSwarmInfo {
	if in == nil {
		return nil
	}
	converted := models.DockerSwarmInfo{
		NodeID:           in.NodeID,
		NodeRole:         in.NodeRole,
		LocalState:       in.LocalState,
		ControlAvailable: in.ControlAvailable,
		ClusterID:        in.ClusterID,
		ClusterName:      in.ClusterName,
		Scope:            in.Scope,
		Error:            in.Error,
	}
	return &converted
}
// maxInt64 returns the larger of the two int64 arguments.
func maxInt64(a, b int64) int64 {
	if b > a {
		return b
	}
	return a
}
// int64PtrCopy returns a pointer to a fresh copy of *in, or nil when in is
// nil, so the result never aliases the caller's pointer.
func int64PtrCopy(in *int64) *int64 {
	if in == nil {
		return nil
	}
	dup := *in
	return &dup
}
// smartAttributesCopy returns a deep copy of SMART attribute data so callers
// cannot mutate the source through shared pointer fields. Returns nil for nil
// input. A shallow struct copy is taken first, then every pointer field is
// re-pointed at a freshly allocated value.
func smartAttributesCopy(in *models.SMARTAttributes) *models.SMARTAttributes {
	if in == nil {
		return nil
	}
	out := *in
	out.PowerOnHours = int64PtrCopy(in.PowerOnHours)
	out.PowerCycles = int64PtrCopy(in.PowerCycles)
	out.ReallocatedSectors = int64PtrCopy(in.ReallocatedSectors)
	out.PendingSectors = int64PtrCopy(in.PendingSectors)
	out.OfflineUncorrectable = int64PtrCopy(in.OfflineUncorrectable)
	out.UDMACRCErrors = int64PtrCopy(in.UDMACRCErrors)
	// PercentageUsed/AvailableSpare are copied inline rather than via
	// int64PtrCopy — presumably they are not *int64 (NVMe wear metrics);
	// confirm against models.SMARTAttributes.
	if in.PercentageUsed != nil {
		value := *in.PercentageUsed
		out.PercentageUsed = &value
	}
	if in.AvailableSpare != nil {
		value := *in.AvailableSpare
		out.AvailableSpare = &value
	}
	out.MediaErrors = int64PtrCopy(in.MediaErrors)
	out.UnsafeShutdowns = int64PtrCopy(in.UnsafeShutdowns)
	return &out
}
// firstNonEmptyString returns the first argument that is non-empty after
// whitespace trimming (in its trimmed form), or "" when none qualify.
func firstNonEmptyString(values ...string) string {
	for _, candidate := range values {
		trimmed := strings.TrimSpace(candidate)
		if trimmed != "" {
			return trimmed
		}
	}
	return ""
}
// storageZFSPoolFromReadStateView extracts ZFS pool health from a storage
// pool view. It returns nil when the view is nil or when the pool carries no
// ZFS signal at all (not flagged as ZFS, no state string, no error counters).
func storageZFSPoolFromReadStateView(view *unifiedresources.StoragePoolView) *models.ZFSPool {
	if view == nil {
		return nil
	}
	state := strings.TrimSpace(view.ZFSPoolState())
	hasErrors := view.ZFSReadErrors() != 0 || view.ZFSWriteErrors() != 0 || view.ZFSChecksumErrors() != 0
	if !view.IsZFS() && state == "" && !hasErrors {
		return nil
	}
	return &models.ZFSPool{
		Name:           view.Name(),
		State:          state,
		ReadErrors:     view.ZFSReadErrors(),
		WriteErrors:    view.ZFSWriteErrors(),
		ChecksumErrors: view.ZFSChecksumErrors(),
	}
}
// storageNodeIDsFromReadState builds instance-qualified node IDs
// ("instance-node") from a raw node-name list. Blank names are dropped after
// trimming; when instance is empty the bare node name is used. Returns nil
// when no names survive filtering.
func storageNodeIDsFromReadState(instance string, nodes []string) []string {
	if len(nodes) == 0 {
		return nil
	}
	ids := make([]string, 0, len(nodes))
	for _, raw := range nodes {
		name := strings.TrimSpace(raw)
		switch {
		case name == "":
			// Skip blank entries entirely.
		case instance == "":
			ids = append(ids, name)
		default:
			ids = append(ids, instance+"-"+name)
		}
	}
	if len(ids) == 0 {
		return nil
	}
	return ids
}
// ActiveAlertsSnapshot returns the current active alerts as reported by the
// monitor's alert-tracking internals.
func (m *Monitor) ActiveAlertsSnapshot() []models.Alert {
	return m.activeAlertsSnapshot()
}
// RecentlyResolvedSnapshot returns the recently resolved alerts tracked by
// the monitor.
func (m *Monitor) RecentlyResolvedSnapshot() []models.ResolvedAlert {
	return m.recentlyResolvedAlertsSnapshot()
}
// PVEBackupsSnapshot returns the current PVE backups from the latest state
// snapshot.
func (m *Monitor) PVEBackupsSnapshot() models.PVEBackups {
	return m.GetState().PVEBackups
}
// BuildFrontendState returns the current state converted to frontend format.
// This replaces the GetState().ToFrontend() pattern in consumer code and
// applies the same broadcast enrichment (live alerts, unified resources).
func (m *Monitor) BuildFrontendState() models.StateFrontend {
	return m.buildBroadcastFrontendStateFromSnapshot(m.GetState())
}
// BuildBroadcastFrontendState returns frontend state ready for websocket
// broadcasts, including the unified resource payload when a resource store is
// configured. Currently identical to BuildFrontendState.
func (m *Monitor) BuildBroadcastFrontendState() models.StateFrontend {
	return m.buildBroadcastFrontendStateFromSnapshot(m.GetState())
}
// buildFrontendStateFromSnapshot converts a state snapshot into its frontend
// representation without any broadcast-specific enrichment.
func buildFrontendStateFromSnapshot(snapshot models.StateSnapshot) models.StateFrontend {
	return snapshot.ToFrontend()
}
// buildBroadcastFrontendStateFromSnapshot converts a state snapshot into the
// frontend payload used for websocket broadcasts. It refreshes the unified
// resource store from the snapshot, overlays live active alerts from the
// alert manager, and swaps in the unified resource view (plus its freshness
// marker, when set) before returning.
func (m *Monitor) buildBroadcastFrontendStateFromSnapshot(snapshot models.StateSnapshot) models.StateFrontend {
	frontendState := buildFrontendStateFromSnapshot(snapshot)
	m.updateResourceStore(snapshot)
	// The previous `m != nil &&` guard here was dead code: m has already been
	// dereferenced by updateResourceStore above, so only alertManager matters.
	if m.alertManager != nil {
		// Replace snapshot alerts with live ones, including clearing stale
		// snapshot alerts when the live set is empty but the snapshot is not.
		if liveAlerts := m.activeAlertsSnapshot(); len(liveAlerts) > 0 || len(frontendState.ActiveAlerts) > 0 {
			frontendState.ActiveAlerts = liveAlerts
		}
	}
	unifiedView := m.currentUnifiedStateView()
	frontendState.Resources = convertResourcesForBroadcast(unifiedView.resources)
	frontendState.ConnectedInfrastructure = buildConnectedInfrastructure(unifiedView.resources, snapshot)
	if !unifiedView.freshness.IsZero() {
		frontendState.LastUpdate = unifiedView.freshness.UnixMilli()
	}
	return frontendState
}
// GetLiveStateSnapshot returns the underlying monitor state snapshot without
// applying global mock mode overrides.
//
// This is useful for agent management endpoints that need to reflect actual
// registrations even when mock mode is enabled for the UI/demo experience.
// Returns an empty snapshot when the monitor (or its state) is nil.
func (m *Monitor) GetLiveStateSnapshot() models.StateSnapshot {
	if m == nil || m.state == nil {
		return models.EmptyStateSnapshot()
	}
	return m.state.GetSnapshot()
}
// GetLiveHostsSnapshot returns the underlying registered host agents without
// applying global mock mode overrides. Returns nil when the monitor (or its
// state) is nil.
func (m *Monitor) GetLiveHostsSnapshot() []models.Host {
	if m == nil || m.state == nil {
		return nil
	}
	return m.state.GetSnapshot().Hosts
}
// SetOrgID sets the organization ID for this monitor instance.
// This is used for tenant isolation in multi-tenant deployments.
// The value is whitespace-trimmed before being stored.
func (m *Monitor) SetOrgID(orgID string) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.orgID = strings.TrimSpace(orgID)
}
// GetOrgID returns the organization ID for this monitor instance.
// Returns empty string for default/legacy monitors that were never assigned one.
func (m *Monitor) GetOrgID() string {
	m.mu.RLock()
	defer m.mu.RUnlock()
	return m.orgID
}
// stateBroadcaster abstracts the websocket hub's state-push methods so
// broadcastState can target either a single tenant or all connected clients.
type stateBroadcaster interface {
	BroadcastState(state interface{})
	BroadcastStateToTenant(orgID string, state interface{})
}
// broadcastState broadcasts state to WebSocket clients. Monitors carrying an
// explicit org ID (including "default") are tenant-scoped; legacy monitors
// without one broadcast globally. A nil hub is a no-op.
func (m *Monitor) broadcastState(hub stateBroadcaster, frontendState interface{}) {
	if hub == nil {
		return
	}
	if orgID := strings.TrimSpace(m.GetOrgID()); orgID != "" {
		hub.BroadcastStateToTenant(orgID, frontendState)
		return
	}
	hub.BroadcastState(frontendState)
}
// broadcastEscalatedAlert pushes an escalated alert to this monitor's tenant
// via the websocket hub. No-op when hub or alert is nil.
func (m *Monitor) broadcastEscalatedAlert(hub *websocket.Hub, alert *alerts.Alert) {
	if hub == nil || alert == nil {
		return
	}
	hub.BroadcastAlertToTenant(m.GetOrgID(), alert)
}
// SetMockMode switches between mock data and real infrastructure data at
// runtime. It is a no-op when the requested mode is already active. Both
// directions share the same teardown (stop the mock sampler, flip the mock
// flag, clear active alerts, reset state and metrics history); enabling mock
// mode additionally stops discovery and starts the mock metrics sampler,
// while disabling it kicks off an immediate real poll (and discovery, when
// configured) to repopulate state. The previous implementation duplicated the
// shared teardown in both branches.
func (m *Monitor) SetMockMode(enable bool) error {
	if mock.IsMockEnabled() == enable {
		log.Info().Bool("mockMode", enable).Msg("mock mode already in desired state")
		return nil
	}

	// Shared teardown for both transitions.
	m.stopMockMetricsSampler()
	if err := mock.SetEnabled(enable); err != nil {
		return err
	}
	m.alertManager.ClearActiveAlerts()
	m.mu.Lock()
	m.resetStateLocked()
	m.metricsHistory.Reset()
	m.mu.Unlock()

	if enable {
		m.StopDiscoveryService()
		m.mu.RLock()
		samplerCtx := m.runtimeCtx
		m.mu.RUnlock()
		if samplerCtx != nil {
			m.startMockMetricsSampler(samplerCtx)
		}
		log.Info().Msg("switched monitor to mock mode")
	} else {
		log.Info().Msg("switched monitor to real data mode")
	}

	m.mu.RLock()
	ctx := m.runtimeCtx
	hub := m.wsHub
	m.mu.RUnlock()
	if hub != nil {
		frontendState := m.buildBroadcastFrontendStateFromSnapshot(m.GetState())
		// Use tenant-aware broadcast method
		m.broadcastState(hub, frontendState)
	}
	if enable && ctx != nil && keepRealPollingInMockMode() {
		// Keep real metrics flowing while mock mode is enabled.
		go m.poll(ctx, hub)
	}
	if !enable && ctx != nil {
		// Kick off an immediate poll to repopulate state with live data.
		go m.poll(ctx, hub)
		if hub != nil && m.config.DiscoveryEnabled {
			go m.StartDiscoveryService(ctx, hub, m.config.DiscoverySubnet)
		}
	}
	return nil
}
// resetStateLocked replaces the in-memory state with a fresh one, preserving
// the monitor's original start time and version stamp in Stats.
// Caller must hold m.mu.
func (m *Monitor) resetStateLocked() {
	m.state = models.NewState()
	m.state.Stats = models.Stats{
		StartTime: m.startTime,
		Version:   "2.0.0-go",
	}
}
// GetStartTime returns the time this monitor instance was started.
func (m *Monitor) GetStartTime() time.Time {
	return m.startTime
}
// GetDiscoveryService returns the discovery service, or nil when it is not
// running.
func (m *Monitor) GetDiscoveryService() *discovery.Service {
	return m.discoveryService
}
// StartDiscoveryService starts the discovery service if not already running.
// An empty subnet is treated as "auto". The whole call runs under m.mu;
// cfgProvider takes m.mu.RLock, so this assumes discovery.Service.Start does
// not invoke the provider synchronously (NOTE(review): confirm — a
// synchronous call here would self-deadlock on the RWMutex).
func (m *Monitor) StartDiscoveryService(ctx context.Context, wsHub *websocket.Hub, subnet string) {
	m.mu.Lock()
	defer m.mu.Unlock()
	if m.discoveryService != nil {
		log.Debug().Msg("discovery service already running")
		return
	}
	if subnet == "" {
		subnet = "auto"
	}
	// cfgProvider hands the discovery service a point-in-time copy of the
	// discovery configuration each time it asks for one.
	cfgProvider := func() config.DiscoveryConfig {
		m.mu.RLock()
		defer m.mu.RUnlock()
		if m.config == nil {
			return config.DefaultDiscoveryConfig()
		}
		return config.CloneDiscoveryConfig(m.config.Discovery)
	}
	m.discoveryService = discovery.NewService(wsHub, 5*time.Minute, subnet, cfgProvider)
	if m.discoveryService != nil {
		m.discoveryService.Start(ctx)
		log.Info().Str("subnet", subnet).Msg("discovery service started")
	} else {
		log.Error().Msg("failed to create discovery service")
	}
}
// StopDiscoveryService stops the discovery service if it is running and
// clears the reference so it can be restarted later.
func (m *Monitor) StopDiscoveryService() {
	m.mu.Lock()
	defer m.mu.Unlock()
	if m.discoveryService == nil {
		return
	}
	m.discoveryService.Stop()
	m.discoveryService = nil
	log.Info().Msg("discovery service stopped")
}
// EnableTemperatureMonitoring enables temperature data collection.
// Temperature collection is always enabled when tempCollector is initialized;
// this method exists only for interface compatibility and logs the request.
func (m *Monitor) EnableTemperatureMonitoring() {
	// Temperature collection is always enabled when tempCollector is initialized
	// This method exists for interface compatibility
	log.Info().Msg("temperature monitoring enabled")
}
// DisableTemperatureMonitoring disables temperature data collection.
// Like EnableTemperatureMonitoring, this is a logging no-op kept for
// interface compatibility; it does not actually stop collection.
func (m *Monitor) DisableTemperatureMonitoring() {
	// Temperature collection is always enabled when tempCollector is initialized
	// This method exists for interface compatibility
	log.Info().Msg("temperature monitoring disabled")
}
// SetResourceStore sets the resource store for polling optimization.
// When set, the monitor will check if it should reduce polling frequency
// for nodes that have host agents providing data. It also wires (or clears)
// the incident store's resource-timeline integration depending on whether the
// store implements memory.IncidentTimelineStore, and immediately backfills
// the store from current state.
func (m *Monitor) SetResourceStore(store ResourceStoreInterface) {
	m.mu.Lock()
	m.resourceStore = store
	incidentStore := m.incidentStore
	m.mu.Unlock()
	log.Info().Msg("resource store set for polling optimization")
	if incidentStore != nil {
		if timelineStore, ok := store.(memory.IncidentTimelineStore); ok {
			incidentStore.SetResourceTimelineStore(timelineStore)
		} else {
			incidentStore.SetResourceTimelineStore(nil)
		}
	}
	// Immediately backfill the store from current state so ReadState
	// consumers have data as soon as the store is wired.
	// Guard against minimally initialized monitors (e.g., test fixtures
	// with bare &Monitor{}) where m.state may be nil.
	if store != nil && m.state != nil {
		m.updateResourceStore(m.GetState())
	}
}
// SetSupplementalRecordsProvider registers (or, with a nil provider, removes)
// a source-native resource provider that ingests alongside the legacy state
// snapshot path. Source names are normalized to lowercase; blank sources are
// ignored. The resource store is refreshed immediately after the change.
func (m *Monitor) SetSupplementalRecordsProvider(source unifiedresources.DataSource, provider MonitorSupplementalRecordsProvider) {
	if m == nil {
		return
	}
	key := unifiedresources.DataSource(strings.ToLower(strings.TrimSpace(string(source))))
	if key == "" {
		return
	}
	m.mu.Lock()
	if provider == nil {
		// Deleting from a nil map is a safe no-op, so no lazy init is needed here.
		delete(m.supplementalProviders, key)
	} else {
		if m.supplementalProviders == nil {
			m.supplementalProviders = make(map[unifiedresources.DataSource]MonitorSupplementalRecordsProvider)
		}
		m.supplementalProviders[key] = provider
	}
	m.mu.Unlock()
	m.updateResourceStore(m.GetState())
}
// SetRecoveryManager wires the recovery store manager for best-effort ingestion of
// recovery points derived from polled backup/snapshot data. Cleanup is kicked
// off in a background goroutine so monitors that are already running still get
// the migration once a recovery manager becomes available.
func (m *Monitor) SetRecoveryManager(manager *recoverymanager.Manager) {
	m.mu.Lock()
	m.recoveryManager = manager
	m.mu.Unlock()
	// Try cleanup during wiring so monitors that are already running still get
	// the migration once a recovery manager becomes available.
	go m.purgeStalePVEPBSBackupsBestEffort(context.Background())
}
// GetNotificationManager returns the notification manager used for alert
// delivery.
func (m *Monitor) GetNotificationManager() *notifications.NotificationManager {
	return m.notificationMgr
}
// GetConfigPersistence returns the config persistence manager used to load
// and save monitor configuration.
func (m *Monitor) GetConfigPersistence() *config.ConfigPersistence {
	return m.configPersist
}
// GetMetricsStore returns the persistent metrics store (may be nil when
// persistence is not configured).
func (m *Monitor) GetMetricsStore() *metrics.Store {
	return m.metricsStore
}
// GetMetricsHistory returns the in-memory metrics history for trend analysis.
// This is used by the AI context builder to compute trends and predictions.
func (m *Monitor) GetMetricsHistory() *MetricsHistory {
	return m.metricsHistory
}
// GetUnifiedResources returns the current unified resource view for this
// monitor. Returns nil when the monitor itself is nil or no resource store is
// configured.
func (m *Monitor) GetUnifiedResources() []unifiedresources.Resource {
	if m == nil {
		return nil
	}
	m.mu.RLock()
	store := m.resourceStore
	m.mu.RUnlock()
	if store != nil {
		return store.GetAll()
	}
	return nil
}
// monitorUnifiedStateView bundles a unified-resource listing with a typed
// read-state accessor over the same data and the freshness marker of the
// underlying snapshot or store.
type monitorUnifiedStateView struct {
	resources []unifiedresources.Resource
	readState unifiedresources.ReadState
	freshness time.Time
}
// monitorUnifiedStateViewFromSnapshot builds an ephemeral unified view by
// ingesting a legacy state snapshot into a fresh registry; freshness comes
// from the snapshot's LastUpdate.
func monitorUnifiedStateViewFromSnapshot(snapshot models.StateSnapshot) monitorUnifiedStateView {
	registry := unifiedresources.NewRegistry(nil)
	registry.IngestSnapshot(snapshot)
	adapter := unifiedresources.NewMonitorAdapter(registry)
	return monitorUnifiedStateView{
		resources: registry.List(),
		readState: adapter,
		freshness: snapshot.LastUpdate,
	}
}
// monitorUnifiedStateViewFromResources builds an ephemeral unified view from
// an already-canonical resource list, using the caller-supplied freshness
// marker.
func monitorUnifiedStateViewFromResources(resources []unifiedresources.Resource, freshness time.Time) monitorUnifiedStateView {
	registry := unifiedresources.NewRegistry(nil)
	registry.IngestResources(resources)
	adapter := unifiedresources.NewMonitorAdapter(registry)
	return monitorUnifiedStateView{
		resources: registry.List(),
		readState: adapter,
		freshness: freshness,
	}
}
// currentUnifiedStateView resolves the best available unified view, in order:
// the mock fixture graph (when mock mode is on and non-empty), the wired
// resource store (using its native ReadState when it implements one), and
// finally an ephemeral snapshot-backed adapter built from current state.
func (m *Monitor) currentUnifiedStateView() monitorUnifiedStateView {
	if m == nil {
		return monitorUnifiedStateView{}
	}
	if mock.IsMockEnabled() {
		// Prefer shared mock fixtures; fall back to the live snapshot when
		// the fixtures are empty and carry no freshness marker.
		resources, freshness := mock.UnifiedResourceSnapshot()
		if len(resources) > 0 || !freshness.IsZero() {
			return monitorUnifiedStateViewFromResources(resources, freshness)
		}
		return monitorUnifiedStateViewFromSnapshot(m.GetState())
	}
	m.mu.RLock()
	store := m.resourceStore
	state := m.state
	m.mu.RUnlock()
	if store == nil {
		return monitorUnifiedStateViewFromSnapshot(m.GetState())
	}
	resources := store.GetAll()
	freshness := unifiedResourceFreshness(store, state)
	if readState, ok := store.(unifiedresources.ReadState); ok {
		// The store can serve typed reads directly; no adapter needed.
		return monitorUnifiedStateView{
			resources: resources,
			readState: readState,
			freshness: freshness,
		}
	}
	if len(resources) > 0 || state == nil {
		return monitorUnifiedStateViewFromResources(resources, freshness)
	}
	return monitorUnifiedStateViewFromSnapshot(m.GetState())
}
// currentUnifiedResourceFreshness returns the freshness marker for the
// monitor's unified resources (zero time when the monitor is nil or no
// freshness source is available).
func (m *Monitor) currentUnifiedResourceFreshness() time.Time {
	if m == nil {
		return time.Time{}
	}
	m.mu.RLock()
	store := m.resourceStore
	state := m.state
	m.mu.RUnlock()
	return unifiedResourceFreshness(store, state)
}
// unifiedResourceFreshness resolves the freshness timestamp for unified
// resources: prefer the store's own marker when it exposes one, falling back
// to the state's last update; zero time when neither is available.
func unifiedResourceFreshness(store ResourceStoreInterface, state *models.State) time.Time {
	var freshness time.Time
	if fs, ok := store.(UnifiedResourceFreshnessStore); ok {
		freshness = fs.UnifiedResourceFreshness()
	}
	if !freshness.IsZero() {
		return freshness
	}
	if state != nil {
		return state.GetLastUpdate()
	}
	return freshness
}
// UnifiedResourceSnapshot returns a canonical unified-resource seed plus the
// associated freshness marker. In mock mode it returns the shared mock
// unified-resource fixture graph rather than the live resource store.
func (m *Monitor) UnifiedResourceSnapshot() ([]unifiedresources.Resource, time.Time) {
	view := m.currentUnifiedStateView()
	return view.resources, view.freshness
}
// GetUnifiedReadState returns a typed unified read-state provider when the
// configured resource store implements one; nil otherwise (including when the
// monitor itself is nil or no store is wired).
func (m *Monitor) GetUnifiedReadState() unifiedresources.ReadState {
	if m == nil {
		return nil
	}
	m.mu.RLock()
	store := m.resourceStore
	m.mu.RUnlock()
	if store == nil {
		return nil
	}
	if readState, ok := store.(unifiedresources.ReadState); ok {
		return readState
	}
	return nil
}
// GetUnifiedReadStateOrSnapshot returns unified read-state when available.
// If the monitor has not been wired with a resource store yet, it creates an
// ephemeral snapshot-backed adapter to preserve read access without exposing
// direct state reads to consumer packages.
func (m *Monitor) GetUnifiedReadStateOrSnapshot() unifiedresources.ReadState {
	return m.currentUnifiedStateView().readState
}
// shouldSkipNodeMetrics reports whether detailed metric polling should be
// skipped for the given node because a host agent is providing richer data.
// This helps reduce API load when agents are active. Always false when no
// resource store is configured.
func (m *Monitor) shouldSkipNodeMetrics(nodeName string) bool {
	m.mu.RLock()
	store := m.resourceStore
	m.mu.RUnlock()
	if store == nil {
		return false
	}
	if !store.ShouldSkipAPIPolling(nodeName) {
		return false
	}
	log.Debug().
		Str("node", nodeName).
		Msg("Skipping detailed node metrics - host agent provides data")
	return true
}
// updateResourceStore populates the resource store with data from the given
// state snapshot. This should be called before broadcasting to ensure fresh
// data. Slices owned by supplemental providers are stripped from the legacy
// snapshot first so those providers remain the single writer for their
// sources. Stores implementing AtomicSnapshotResourceStore get snapshot and
// supplemental records in one atomic swap; other stores get the two-step
// fallback. Both paths finish with the same metric/alert sync sequence, which
// was previously duplicated and is now factored into finishResourceStoreSync.
func (m *Monitor) updateResourceStore(state models.StateSnapshot) {
	m.mu.RLock()
	store := m.resourceStore
	m.mu.RUnlock()
	if store == nil {
		log.Debug().Msg("[Resources] No resource store configured, skipping population")
		return
	}
	log.Debug().
		Int("nodes", len(state.Nodes)).
		Int("vms", len(state.VMs)).
		Int("containers", len(state.Containers)).
		Int("hosts", len(state.Hosts)).
		Int("dockerHosts", len(state.DockerHosts)).
		Msg("[Resources] Populating resource store from state snapshot")
	snapshotForStore := state
	ownedSources := m.providerOwnedSnapshotSources()
	if len(ownedSources) > 0 {
		// Provider-owned sources must not be clobbered by the legacy snapshot.
		snapshotForStore = unifiedresources.SnapshotWithoutSources(state, ownedSources)
		sourceNames := make([]string, 0, len(ownedSources))
		for _, source := range ownedSources {
			sourceNames = append(sourceNames, string(source))
		}
		log.Debug().
			Strs("sources", sourceNames).
			Msg("[Resources] Suppressing legacy snapshot slices for provider-owned sources")
	}
	recordsBySource := m.collectSupplementalRecordsBySource()
	supplementalChanges := m.collectSupplementalChanges()
	if atomicStore, ok := store.(AtomicSnapshotResourceStore); ok {
		// Preferred path: snapshot and supplemental records land in one swap.
		atomicStore.PopulateSnapshotAndSupplemental(snapshotForStore, recordsBySource)
		recordSupplementalResourceChanges(store, supplementalChanges)
		for source, records := range recordsBySource {
			if len(records) == 0 {
				continue
			}
			log.Debug().
				Str("source", string(source)).
				Int("records", len(records)).
				Msg("[Resources] Atomically ingested supplemental records")
		}
		m.finishResourceStoreSync(store)
		return
	}
	// Fallback path: snapshot first, then supplemental records one source at a time.
	store.PopulateFromSnapshot(snapshotForStore)
	if supplementalStore, ok := store.(SupplementalRecordStore); ok {
		for source, records := range recordsBySource {
			if len(records) == 0 {
				continue
			}
			supplementalStore.PopulateSupplementalRecords(source, records)
			log.Debug().
				Str("source", string(source)).
				Int("records", len(records)).
				Msg("[Resources] Ingested supplemental records")
		}
	}
	recordSupplementalResourceChanges(store, supplementalChanges)
	m.finishResourceStoreSync(store)
}

// finishResourceStoreSync runs the shared post-population bookkeeping: mirror
// unified metrics into the history/persistent stores and sync unified
// resource alerts back into monitor state.
func (m *Monitor) finishResourceStoreSync(store ResourceStoreInterface) {
	m.syncUnifiedAgentMetrics(store)
	m.syncUnifiedVMMetrics(store)
	m.syncUnifiedStorageMetrics(store)
	m.syncUnifiedPhysicalDiskMetrics(store)
	m.syncUnifiedAppContainerMetrics(store)
	m.syncUnifiedResourceAlertsToState(store.GetAll())
}
// recordSupplementalResourceChanges forwards supplemental canonical resource
// changes to the store when it can record them. Failures are logged and
// skipped so a single bad change cannot block the rest of the batch.
func recordSupplementalResourceChanges(store ResourceStoreInterface, changes []unifiedresources.ResourceChange) {
	if store == nil || len(changes) == 0 {
		return
	}
	// Guard both a store that lacks the capability and a typed-nil recorder.
	recorder, ok := store.(canonicalResourceChangeRecorder)
	if !ok || recorder == nil {
		return
	}
	for i := range changes {
		change := changes[i]
		err := recorder.RecordChange(change)
		if err == nil {
			continue
		}
		log.Warn().
			Err(err).
			Str("resource_id", change.ResourceID).
			Str("change_id", change.ID).
			Str("kind", string(change.Kind)).
			Msg("failed to record supplemental canonical resource change")
	}
}
// syncUnifiedAgentMetrics mirrors agent-resource metrics from the unified
// store into the in-memory metrics history and (when configured) the
// persistent metrics store. Only agent resources whose metrics are NOT
// already written natively (sources agent/proxmox/docker) are considered, and
// each metrics target is written at most once per pass. Writes to the
// persistent store are batched and flushed once at the end.
func (m *Monitor) syncUnifiedAgentMetrics(store ResourceStoreInterface) {
	// Nothing to do without a store or without any metric sink.
	if store == nil || (m.metricsHistory == nil && m.metricsStore == nil) {
		return
	}
	resolver, ok := store.(MetricsTargetResourceStore)
	if !ok {
		return
	}
	now := time.Now()
	storeWrites := make([]metrics.WriteMetric, 0)
	// appendStoreWrite queues a raw-tier sample for the persistent store;
	// no-op when persistence is not configured.
	appendStoreWrite := func(resourceType, resourceID, metricType string, value float64) {
		if m.metricsStore == nil {
			return
		}
		storeWrites = append(storeWrites, metrics.WriteMetric{
			ResourceType: resourceType,
			ResourceID:   resourceID,
			MetricType:   metricType,
			Value:        value,
			Timestamp:    now,
			Tier:         metrics.TierRaw,
		})
	}
	seenTargets := make(map[string]struct{})
	for _, resource := range store.GetAll() {
		if resource.Type != unifiedresources.ResourceTypeAgent || resource.Metrics == nil {
			continue
		}
		if shouldSkipMockOwnedUnifiedMetricSync(resource) {
			continue
		}
		// Skip resources whose metrics already have a native writer.
		if monitorHasSource(resource.Sources, unifiedresources.SourceAgent) ||
			monitorHasSource(resource.Sources, unifiedresources.SourceProxmox) ||
			monitorHasSource(resource.Sources, unifiedresources.SourceDocker) {
			continue
		}
		target := resolver.MetricsTargetForResource(resource.ID)
		if target == nil || target.ResourceType != "agent" || strings.TrimSpace(target.ResourceID) == "" {
			continue
		}
		targetID := strings.TrimSpace(target.ResourceID)
		// Deduplicate: first resource mapping to a target wins this pass.
		if _, ok := seenTargets[targetID]; ok {
			continue
		}
		seenTargets[targetID] = struct{}{}
		metricKey := fmt.Sprintf("agent:%s", targetID)
		if metric := resource.Metrics.CPU; metric != nil {
			// Prefer Percent; fall back to raw Value when Percent is zero.
			value := metric.Percent
			if value == 0 {
				value = metric.Value
			}
			if m.metricsHistory != nil {
				m.metricsHistory.AddGuestMetric(metricKey, "cpu", value, now)
			}
			appendStoreWrite("agent", targetID, "cpu", value)
		}
		// Memory/disk are only recorded when the metric carries real data
		// (a total, a used value, or a non-zero percent).
		if metric := resource.Metrics.Memory; metric != nil && (metric.Total != nil || metric.Percent > 0 || metric.Used != nil) {
			value := metric.Percent
			if m.metricsHistory != nil {
				m.metricsHistory.AddGuestMetric(metricKey, "memory", value, now)
			}
			appendStoreWrite("agent", targetID, "memory", value)
		}
		if metric := resource.Metrics.Disk; metric != nil && (metric.Total != nil || metric.Percent > 0 || metric.Used != nil) {
			value := metric.Percent
			if m.metricsHistory != nil {
				m.metricsHistory.AddGuestMetric(metricKey, "disk", value, now)
			}
			appendStoreWrite("agent", targetID, "disk", value)
		}
		// Rate metrics are recorded verbatim whenever present.
		if metric := resource.Metrics.NetIn; metric != nil {
			if m.metricsHistory != nil {
				m.metricsHistory.AddGuestMetric(metricKey, "netin", metric.Value, now)
			}
			appendStoreWrite("agent", targetID, "netin", metric.Value)
		}
		if metric := resource.Metrics.NetOut; metric != nil {
			if m.metricsHistory != nil {
				m.metricsHistory.AddGuestMetric(metricKey, "netout", metric.Value, now)
			}
			appendStoreWrite("agent", targetID, "netout", metric.Value)
		}
		if metric := resource.Metrics.DiskRead; metric != nil {
			if m.metricsHistory != nil {
				m.metricsHistory.AddGuestMetric(metricKey, "diskread", metric.Value, now)
			}
			appendStoreWrite("agent", targetID, "diskread", metric.Value)
		}
		if metric := resource.Metrics.DiskWrite; metric != nil {
			if m.metricsHistory != nil {
				m.metricsHistory.AddGuestMetric(metricKey, "diskwrite", metric.Value, now)
			}
			appendStoreWrite("agent", targetID, "diskwrite", metric.Value)
		}
	}
	if len(storeWrites) > 0 {
		m.metricsStore.WriteBatchSync(storeWrites)
	}
}
// syncUnifiedVMMetrics mirrors VM-resource metrics from the unified store
// into the in-memory metrics history and (when configured) the persistent
// metrics store. VMs backed by a native Proxmox source are skipped because
// the Proxmox poller already writes their metrics; each metrics target is
// written at most once per pass, and persistent writes are batched.
func (m *Monitor) syncUnifiedVMMetrics(store ResourceStoreInterface) {
	// Nothing to do without a store or without any metric sink.
	if store == nil || (m.metricsHistory == nil && m.metricsStore == nil) {
		return
	}
	resolver, ok := store.(MetricsTargetResourceStore)
	if !ok {
		return
	}
	now := time.Now()
	storeWrites := make([]metrics.WriteMetric, 0)
	// appendStoreWrite queues a raw-tier sample for the persistent store;
	// no-op when persistence is not configured.
	appendStoreWrite := func(resourceType, resourceID, metricType string, value float64) {
		if m.metricsStore == nil {
			return
		}
		storeWrites = append(storeWrites, metrics.WriteMetric{
			ResourceType: resourceType,
			ResourceID:   resourceID,
			MetricType:   metricType,
			Value:        value,
			Timestamp:    now,
			Tier:         metrics.TierRaw,
		})
	}
	seenTargets := make(map[string]struct{})
	for _, resource := range store.GetAll() {
		if resource.Type != unifiedresources.ResourceTypeVM || resource.Metrics == nil {
			continue
		}
		if shouldSkipMockOwnedUnifiedMetricSync(resource) {
			continue
		}
		// A Proxmox-sourced VM already has a native metrics writer.
		hasNativeVMWriter := false
		for _, source := range resource.Sources {
			if source == unifiedresources.SourceProxmox {
				hasNativeVMWriter = true
				break
			}
		}
		if hasNativeVMWriter {
			continue
		}
		target := resolver.MetricsTargetForResource(resource.ID)
		if target == nil || target.ResourceType != "vm" || strings.TrimSpace(target.ResourceID) == "" {
			continue
		}
		targetID := strings.TrimSpace(target.ResourceID)
		// Deduplicate: first resource mapping to a target wins this pass.
		if _, ok := seenTargets[targetID]; ok {
			continue
		}
		seenTargets[targetID] = struct{}{}
		if metric := resource.Metrics.CPU; metric != nil {
			// Prefer Percent; fall back to raw Value when Percent is zero.
			value := metric.Percent
			if value == 0 {
				value = metric.Value
			}
			if m.metricsHistory != nil {
				m.metricsHistory.AddGuestMetric(targetID, "cpu", value, now)
			}
			appendStoreWrite("vm", targetID, "cpu", value)
		}
		// Memory/disk are only recorded when the metric carries real data
		// (a total, a used value, or a non-zero percent).
		if metric := resource.Metrics.Memory; metric != nil && (metric.Total != nil || metric.Percent > 0 || metric.Used != nil) {
			value := metric.Percent
			if m.metricsHistory != nil {
				m.metricsHistory.AddGuestMetric(targetID, "memory", value, now)
			}
			appendStoreWrite("vm", targetID, "memory", value)
		}
		if metric := resource.Metrics.Disk; metric != nil && (metric.Total != nil || metric.Percent > 0 || metric.Used != nil) {
			value := metric.Percent
			if m.metricsHistory != nil {
				m.metricsHistory.AddGuestMetric(targetID, "disk", value, now)
			}
			appendStoreWrite("vm", targetID, "disk", value)
		}
		// Rate metrics are recorded verbatim whenever present.
		if metric := resource.Metrics.NetIn; metric != nil {
			if m.metricsHistory != nil {
				m.metricsHistory.AddGuestMetric(targetID, "netin", metric.Value, now)
			}
			appendStoreWrite("vm", targetID, "netin", metric.Value)
		}
		if metric := resource.Metrics.NetOut; metric != nil {
			if m.metricsHistory != nil {
				m.metricsHistory.AddGuestMetric(targetID, "netout", metric.Value, now)
			}
			appendStoreWrite("vm", targetID, "netout", metric.Value)
		}
		if metric := resource.Metrics.DiskRead; metric != nil {
			if m.metricsHistory != nil {
				m.metricsHistory.AddGuestMetric(targetID, "diskread", metric.Value, now)
			}
			appendStoreWrite("vm", targetID, "diskread", metric.Value)
		}
		if metric := resource.Metrics.DiskWrite; metric != nil {
			if m.metricsHistory != nil {
				m.metricsHistory.AddGuestMetric(targetID, "diskwrite", metric.Value, now)
			}
			appendStoreWrite("vm", targetID, "diskwrite", metric.Value)
		}
	}
	if len(storeWrites) > 0 {
		m.metricsStore.WriteBatchSync(storeWrites)
	}
}
// syncUnifiedStorageMetrics mirrors storage metrics from the unified resource
// store into the in-memory metrics history and the persistent metrics store.
// Native Proxmox storage is skipped (its own poller already records history),
// as are mock-owned resources while mock mode is active.
func (m *Monitor) syncUnifiedStorageMetrics(store ResourceStoreInterface) {
	// Nothing to do without a store or at least one metrics sink.
	if store == nil || (m.metricsHistory == nil && m.metricsStore == nil) {
		return
	}
	// The store must be able to resolve resources to metrics target IDs.
	resolver, ok := store.(MetricsTargetResourceStore)
	if !ok {
		return
	}
	now := time.Now()
	storeWrites := make([]metrics.WriteMetric, 0)
	// appendStoreWrite queues a raw-tier sample for the persistent store;
	// it is a no-op when no persistent store is configured.
	appendStoreWrite := func(resourceType, resourceID, metricType string, value float64) {
		if m.metricsStore == nil {
			return
		}
		storeWrites = append(storeWrites, metrics.WriteMetric{
			ResourceType: resourceType,
			ResourceID:   resourceID,
			MetricType:   metricType,
			Value:        value,
			Timestamp:    now,
			Tier:         metrics.TierRaw,
		})
	}
	// Deduplicate: multiple unified resources may resolve to one target.
	seenTargets := make(map[string]struct{})
	for _, resource := range store.GetAll() {
		if resource.Type != unifiedresources.ResourceTypeStorage || resource.Metrics == nil || resource.Metrics.Disk == nil {
			continue
		}
		if shouldSkipMockOwnedUnifiedMetricSync(resource) {
			continue
		}
		// Native Proxmox storage already writes to history during the storage poller.
		if resource.Storage != nil && resource.Storage.Platform == "" {
			hasProxmoxSource := false
			for _, source := range resource.Sources {
				if source == unifiedresources.SourceProxmox {
					hasProxmoxSource = true
					break
				}
			}
			if hasProxmoxSource {
				continue
			}
		}
		target := resolver.MetricsTargetForResource(resource.ID)
		if target == nil || target.ResourceType != "storage" || strings.TrimSpace(target.ResourceID) == "" {
			continue
		}
		targetID := strings.TrimSpace(target.ResourceID)
		if _, ok := seenTargets[targetID]; ok {
			continue
		}
		seenTargets[targetID] = struct{}{}
		disk := resource.Metrics.Disk
		usage := disk.Percent
		used := int64(0)
		total := int64(0)
		free := int64(0)
		if disk.Used != nil {
			used = *disk.Used
		}
		if disk.Total != nil {
			total = *disk.Total
		}
		if total > 0 {
			free = total - used
			// Derive the usage percentage when only absolute byte counts
			// were reported.
			if usage == 0 && used > 0 {
				usage = (float64(used) / float64(total)) * 100
			}
		}
		if m.metricsHistory != nil {
			m.metricsHistory.AddStorageMetric(targetID, "usage", usage, now)
			// Absolute byte series are only meaningful with a known total.
			if total > 0 {
				m.metricsHistory.AddStorageMetric(targetID, "used", float64(used), now)
				m.metricsHistory.AddStorageMetric(targetID, "total", float64(total), now)
				m.metricsHistory.AddStorageMetric(targetID, "avail", float64(free), now)
			}
		}
		appendStoreWrite("storage", targetID, "usage", usage)
		if total > 0 {
			appendStoreWrite("storage", targetID, "used", float64(used))
			appendStoreWrite("storage", targetID, "total", float64(total))
			appendStoreWrite("storage", targetID, "avail", float64(free))
		}
	}
	// Flush all queued samples in one synchronous batch.
	if len(storeWrites) > 0 {
		m.metricsStore.WriteBatchSync(storeWrites)
	}
}
// syncUnifiedPhysicalDiskMetrics writes SMART/health metrics for physical
// disks that exist only in the unified resource store (i.e. not already
// written by the Proxmox or agent pollers). Requires the persistent metrics
// store; there is no in-memory history path for physical disks here.
func (m *Monitor) syncUnifiedPhysicalDiskMetrics(store ResourceStoreInterface) {
	if store == nil || m.metricsStore == nil {
		return
	}
	// The store must be able to resolve resources to metrics target IDs.
	resolver, ok := store.(MetricsTargetResourceStore)
	if !ok {
		return
	}
	now := time.Now()
	// Deduplicate: multiple unified resources may resolve to one disk target.
	seenTargets := make(map[string]struct{})
	for _, resource := range store.GetAll() {
		if resource.Type != unifiedresources.ResourceTypePhysicalDisk || resource.PhysicalDisk == nil {
			continue
		}
		if shouldSkipMockOwnedUnifiedMetricSync(resource) {
			continue
		}
		// Disks sourced from Proxmox or an agent already have a native metric
		// writer; skip them to avoid duplicate timelines.
		hasNativeWriter := false
		for _, source := range resource.Sources {
			if source == unifiedresources.SourceProxmox || source == unifiedresources.SourceAgent {
				hasNativeWriter = true
				break
			}
		}
		if hasNativeWriter {
			continue
		}
		target := resolver.MetricsTargetForResource(resource.ID)
		if target == nil || target.ResourceType != "disk" || strings.TrimSpace(target.ResourceID) == "" {
			continue
		}
		targetID := strings.TrimSpace(target.ResourceID)
		if _, ok := seenTargets[targetID]; ok {
			continue
		}
		seenTargets[targetID] = struct{}{}
		// Best-effort node attribution: first non-empty identity hostname,
		// then parent name, then the resource's own name.
		nodeName := ""
		for _, hostname := range resource.Identity.Hostnames {
			if hostname = strings.TrimSpace(hostname); hostname != "" {
				nodeName = hostname
				break
			}
		}
		if nodeName == "" {
			nodeName = firstNonEmptyString(strings.TrimSpace(resource.ParentName), strings.TrimSpace(resource.Name))
		}
		// Re-shape the unified record into the models.PhysicalDisk the SMART
		// metric writer expects.
		disk := models.PhysicalDisk{
			ID:              resource.ID,
			Node:            nodeName,
			DevPath:         resource.PhysicalDisk.DevPath,
			Model:           resource.PhysicalDisk.Model,
			Serial:          resource.PhysicalDisk.Serial,
			WWN:             resource.PhysicalDisk.WWN,
			Type:            resource.PhysicalDisk.DiskType,
			Size:            resource.PhysicalDisk.SizeBytes,
			Health:          resource.PhysicalDisk.Health,
			Wearout:         resource.PhysicalDisk.Wearout,
			Temperature:     resource.PhysicalDisk.Temperature,
			RPM:             resource.PhysicalDisk.RPM,
			Used:            resource.PhysicalDisk.Used,
			SmartAttributes: smartAttributesFromUnifiedMeta(resource.PhysicalDisk.SMART),
			LastChecked:     resource.LastSeen,
		}
		// Without a serial, fall back to the resolved metrics target ID so the
		// metric series keys stay stable. NOTE(review): presumably resource.ID
		// is serial-derived when a serial exists — confirm against the store.
		if disk.Serial == "" {
			disk.ID = targetID
		}
		m.writeSMARTMetrics(disk, now)
	}
}
// syncUnifiedAppContainerMetrics mirrors metrics for app containers that are
// NOT Docker-sourced (the Docker agent path already records those) into the
// in-memory history and the persistent metrics store.
func (m *Monitor) syncUnifiedAppContainerMetrics(store ResourceStoreInterface) {
	// Nothing to do without a store or at least one metrics sink.
	if store == nil || (m.metricsHistory == nil && m.metricsStore == nil) {
		return
	}
	// The store must be able to resolve resources to metrics target IDs.
	resolver, ok := store.(MetricsTargetResourceStore)
	if !ok {
		return
	}
	now := time.Now()
	storeWrites := make([]metrics.WriteMetric, 0)
	// appendStoreWrite queues a raw-tier sample for the persistent store;
	// it is a no-op when no persistent store is configured.
	appendStoreWrite := func(resourceType, resourceID, metricType string, value float64) {
		if m.metricsStore == nil {
			return
		}
		storeWrites = append(storeWrites, metrics.WriteMetric{
			ResourceType: resourceType,
			ResourceID:   resourceID,
			MetricType:   metricType,
			Value:        value,
			Timestamp:    now,
			Tier:         metrics.TierRaw,
		})
	}
	// Deduplicate: multiple unified resources may resolve to one target.
	seenTargets := make(map[string]struct{})
	for _, resource := range store.GetAll() {
		if resource.Type != unifiedresources.ResourceTypeAppContainer || resource.Metrics == nil {
			continue
		}
		if shouldSkipMockOwnedUnifiedMetricSync(resource) {
			continue
		}
		// Docker-sourced containers already have a native metrics writer.
		hasDockerSource := false
		for _, source := range resource.Sources {
			if source == unifiedresources.SourceDocker {
				hasDockerSource = true
				break
			}
		}
		if hasDockerSource {
			continue
		}
		target := resolver.MetricsTargetForResource(resource.ID)
		if target == nil || target.ResourceType != "app-container" || strings.TrimSpace(target.ResourceID) == "" {
			continue
		}
		targetID := strings.TrimSpace(target.ResourceID)
		if _, ok := seenTargets[targetID]; ok {
			continue
		}
		seenTargets[targetID] = struct{}{}
		// In-memory history keys app containers under a "docker:" prefix while
		// the persistent store uses the "dockerContainer" resource type.
		metricKey := fmt.Sprintf("docker:%s", targetID)
		if metric := resource.Metrics.CPU; metric != nil {
			// Prefer the percent reading; fall back to the raw value.
			value := metric.Percent
			if value == 0 {
				value = metric.Value
			}
			if m.metricsHistory != nil {
				m.metricsHistory.AddGuestMetric(metricKey, "cpu", value, now)
			}
			appendStoreWrite("dockerContainer", targetID, "cpu", value)
		}
		// Memory/disk are recorded only when the metric carries real data
		// (a total or a non-zero percent).
		if metric := resource.Metrics.Memory; metric != nil && (metric.Total != nil || metric.Percent > 0) {
			value := metric.Percent
			if m.metricsHistory != nil {
				m.metricsHistory.AddGuestMetric(metricKey, "memory", value, now)
			}
			appendStoreWrite("dockerContainer", targetID, "memory", value)
		}
		if metric := resource.Metrics.Disk; metric != nil && (metric.Total != nil || metric.Percent > 0) {
			value := metric.Percent
			if m.metricsHistory != nil {
				m.metricsHistory.AddGuestMetric(metricKey, "disk", value, now)
			}
			appendStoreWrite("dockerContainer", targetID, "disk", value)
		}
		// Rate/counter metrics are written verbatim whenever present.
		if metric := resource.Metrics.NetIn; metric != nil {
			if m.metricsHistory != nil {
				m.metricsHistory.AddGuestMetric(metricKey, "netin", metric.Value, now)
			}
			appendStoreWrite("dockerContainer", targetID, "netin", metric.Value)
		}
		if metric := resource.Metrics.NetOut; metric != nil {
			if m.metricsHistory != nil {
				m.metricsHistory.AddGuestMetric(metricKey, "netout", metric.Value, now)
			}
			appendStoreWrite("dockerContainer", targetID, "netout", metric.Value)
		}
		if metric := resource.Metrics.DiskRead; metric != nil {
			if m.metricsHistory != nil {
				m.metricsHistory.AddGuestMetric(metricKey, "diskread", metric.Value, now)
			}
			appendStoreWrite("dockerContainer", targetID, "diskread", metric.Value)
		}
		if metric := resource.Metrics.DiskWrite; metric != nil {
			if m.metricsHistory != nil {
				m.metricsHistory.AddGuestMetric(metricKey, "diskwrite", metric.Value, now)
			}
			appendStoreWrite("dockerContainer", targetID, "diskwrite", metric.Value)
		}
	}
	// Flush all queued samples in one synchronous batch.
	if len(storeWrites) > 0 {
		m.metricsStore.WriteBatchSync(storeWrites)
	}
}
// shouldSkipMockOwnedUnifiedMetricSync reports whether unified-resource metric
// sync must be suppressed for this resource. In mock mode the canonical mock
// sampler owns chart/history continuity for the entire demo estate, so
// unified-resource sync must not append a second live timeline on top of
// seeded mock history for any resource class.
func shouldSkipMockOwnedUnifiedMetricSync(resource unifiedresources.Resource) bool {
	// The specific resource is irrelevant today; the decision is global.
	_ = resource
	return mock.IsMockEnabled()
}
// shouldSkipNativeMockStateMetricWrites reports whether native pollers should
// skip writing state metrics; while mock mode is active, the mock sampler owns
// metric history.
func shouldSkipNativeMockStateMetricWrites() bool {
	return mock.IsMockEnabled()
}
// getUnifiedResourcesForBroadcast snapshots the resource store under the read
// lock and returns every resource in it. Returns nil when no store is
// configured or the store is empty.
func (m *Monitor) getUnifiedResourcesForBroadcast() []unifiedresources.Resource {
	m.mu.RLock()
	store := m.resourceStore
	m.mu.RUnlock()
	if store == nil {
		log.Debug().Msg("[Resources] No store for broadcast")
		return nil
	}
	resources := store.GetAll()
	log.Debug().Int("count", len(resources)).Msg("[Resources] Got resources for broadcast")
	if len(resources) == 0 {
		return nil
	}
	return resources
}
// getResourcesForBroadcast retrieves all resources from the store and converts
// them to the frontend payload format.
func (m *Monitor) getResourcesForBroadcast() []models.ResourceFrontend {
	unified := m.getUnifiedResourcesForBroadcast()
	return convertResourcesForBroadcast(unified)
}
// convertResourcesForBroadcast converts unified resources into the frontend
// payload shape, sorted case-insensitively by display name (falling back to
// name) with the resource ID as a deterministic tie-breaker.
func convertResourcesForBroadcast(allResources []unifiedresources.Resource) []models.ResourceFrontend {
	if len(allResources) == 0 {
		return []models.ResourceFrontend{}
	}
	type sortableInput struct {
		input models.ResourceConvertInput
		key   string
		id    string
	}
	items := make([]sortableInput, 0, len(allResources))
	for _, resource := range allResources {
		input := monitorResourceToConvertInput(resource)
		key := strings.ToLower(input.DisplayName)
		if key == "" {
			key = strings.ToLower(input.Name)
		}
		items = append(items, sortableInput{input: input, key: key, id: input.ID})
	}
	sort.Slice(items, func(i, j int) bool {
		if items[i].key != items[j].key {
			return items[i].key < items[j].key
		}
		return items[i].id < items[j].id
	})
	out := make([]models.ResourceFrontend, 0, len(items))
	for _, item := range items {
		out = append(out, models.ConvertResourceToFrontend(item.input))
	}
	return out
}
// monitorResourceToConvertInput flattens a unified resource into the
// ResourceConvertInput consumed by models.ConvertResourceToFrontend.
// Canonical metadata is refreshed first so derived fields (names, cluster)
// reflect the latest identity data; note the resource is passed by value so
// the refresh does not mutate the caller's copy.
func monitorResourceToConvertInput(resource unifiedresources.Resource) models.ResourceConvertInput {
	unifiedresources.RefreshCanonicalMetadata(&resource)
	resourceType := monitorFrontendResourceType(resource)
	name, displayName := monitorFrontendNames(resource, resourceType)
	platformID := monitorPlatformID(resource, resourceType)
	input := models.ResourceConvertInput{
		ID:           resource.ID,
		Type:         resourceType,
		Name:         name,
		DisplayName:  displayName,
		PlatformID:   platformID,
		PlatformType: monitorPlatformType(resource, resourceType),
		SourceType:   monitorSourceType(resource.Sources),
		ParentID:     monitorStringValue(resource.ParentID),
		ClusterID:    monitorClusterID(resource),
		Status:       monitorFrontendStatus(resource, resourceType),
		CPU:          monitorMetricInput(monitorMetricValue(resource.Metrics, func(metrics *unifiedresources.ResourceMetrics) *unifiedresources.MetricValue { return metrics.CPU })),
		Memory:       monitorMetricInput(monitorMetricValue(resource.Metrics, func(metrics *unifiedresources.ResourceMetrics) *unifiedresources.MetricValue { return metrics.Memory })),
		Disk:         monitorMetricInput(monitorMetricValue(resource.Metrics, func(metrics *unifiedresources.ResourceMetrics) *unifiedresources.MetricValue { return metrics.Disk })),
		Temperature:  monitorTemperature(resource),
		Uptime:       monitorUptime(resource),
		// Copy tags so the frontend payload does not alias store-owned slices.
		Tags:         append([]string(nil), resource.Tags...),
		Labels:       monitorLabels(resource),
		LastSeenUnix: monitorLastSeenUnix(resource.LastSeen),
		Identity:     monitorIdentity(resource, name),
		PlatformData: monitorPlatformData(resource, resourceType, platformID),
	}
	// Network metrics travel as separate flag/rate fields, not a metric input.
	hasNetwork, rx, tx := monitorNetworkMetricInput(resource.Metrics)
	input.HasNetwork = hasNetwork
	input.NetworkRX = rx
	input.NetworkTX = tx
	return input
}
// monitorFrontendResourceType maps a unified resource onto the contract
// resource-type string used in the frontend payload.
func monitorFrontendResourceType(resource unifiedresources.Resource) string {
	return string(unifiedresources.ContractResourceType(resource))
}
// monitorFrontendNames resolves the (name, displayName) pair for a resource,
// falling back to the resource ID when no display name is available. Both
// return values are currently identical; resourceType is accepted for
// interface stability but unused.
func monitorFrontendNames(resource unifiedresources.Resource, resourceType string) (string, string) {
	_ = resourceType // reserved for type-specific naming rules
	display := strings.TrimSpace(unifiedresources.ResourceDisplayName(resource))
	if display == "" {
		display = resource.ID
	}
	return display, display
}
// monitorPlatformType derives the platform identifier for a resource:
// explicit platform payloads win, then well-known resource types, then the
// attached data sources (most specific first), then the first non-empty raw
// source label, and finally "unknown".
func monitorPlatformType(resource unifiedresources.Resource, resourceType string) string {
	// Explicit platform payloads take absolute precedence.
	switch {
	case resource.Proxmox != nil:
		return "proxmox-pve"
	case resource.VMware != nil:
		return "vmware-vsphere"
	case resource.TrueNAS != nil:
		return "truenas"
	}
	// Well-known resource types map directly onto a platform.
	switch resourceType {
	case "vm", "system-container", "storage", "pool":
		return "proxmox-pve"
	case "docker-host", "app-container":
		return "docker"
	case "k8s-cluster", "k8s-node", "pod", "k8s-deployment":
		return "kubernetes"
	case "pbs":
		return "proxmox-pbs"
	case "pmg":
		return "proxmox-pmg"
	case "agent":
		return "agent"
	}
	// Otherwise infer from the attached data sources, in fixed priority order.
	ordered := []struct {
		source   unifiedresources.DataSource
		platform string
	}{
		{unifiedresources.SourceK8s, "kubernetes"},
		{unifiedresources.SourceDocker, "docker"},
		{unifiedresources.SourcePBS, "proxmox-pbs"},
		{unifiedresources.SourcePMG, "proxmox-pmg"},
		{unifiedresources.SourceAgent, "agent"},
		{unifiedresources.SourceProxmox, "proxmox-pve"},
	}
	for _, entry := range ordered {
		if monitorHasSource(resource.Sources, entry.source) {
			return entry.platform
		}
	}
	// Last resort: the first non-empty raw source label.
	for _, source := range resource.Sources {
		if candidate := strings.TrimSpace(string(source)); candidate != "" {
			return candidate
		}
	}
	return "unknown"
}
// monitorPlatformID resolves the platform-scoped identifier the frontend
// groups resources under (Proxmox instance, agent ID, docker hostname, etc.).
// Falls back to the resource's own ID when no non-empty platform-specific
// identifier is available.
func monitorPlatformID(resource unifiedresources.Resource, resourceType string) string {
	switch resourceType {
	case "node", "vm", "system-container":
		if resource.Proxmox != nil && strings.TrimSpace(resource.Proxmox.Instance) != "" {
			return strings.TrimSpace(resource.Proxmox.Instance)
		}
	case "agent":
		if resource.Agent != nil && strings.TrimSpace(resource.Agent.AgentID) != "" {
			return strings.TrimSpace(resource.Agent.AgentID)
		}
	case "docker-host":
		if resource.Docker != nil && strings.TrimSpace(resource.Docker.Hostname) != "" {
			return strings.TrimSpace(resource.Docker.Hostname)
		}
	case "app-container":
		if resource.Docker != nil && strings.TrimSpace(resource.Docker.Hostname) != "" {
			return strings.TrimSpace(resource.Docker.Hostname)
		}
		// Bug fix: only use the parent ID when it is non-empty after trimming,
		// matching the non-empty guard in every other branch. Previously a
		// present-but-blank ParentID yielded an empty platform ID instead of
		// falling back to resource.ID.
		if resource.ParentID != nil {
			if parent := strings.TrimSpace(*resource.ParentID); parent != "" {
				return parent
			}
		}
	case "k8s-cluster", "k8s-node", "pod", "k8s-deployment":
		if resource.Kubernetes != nil && strings.TrimSpace(resource.Kubernetes.AgentID) != "" {
			return strings.TrimSpace(resource.Kubernetes.AgentID)
		}
	case "pbs":
		if resource.PBS != nil && strings.TrimSpace(resource.PBS.Hostname) != "" {
			return strings.TrimSpace(resource.PBS.Hostname)
		}
	case "pmg":
		if resource.PMG != nil && strings.TrimSpace(resource.PMG.Hostname) != "" {
			return strings.TrimSpace(resource.PMG.Hostname)
		}
	}
	return resource.ID
}
// monitorFrontendStatus maps a unified status onto the frontend status string.
// App containers and pods get workload-specific mappings first; anything not
// handled there falls through to the generic status mapping, where workload
// types report running/stopped instead of online/offline.
func monitorFrontendStatus(resource unifiedresources.Resource, resourceType string) string {
	if resourceType == "app-container" {
		switch resource.Status {
		case unifiedresources.StatusOnline:
			return "running"
		case unifiedresources.StatusOffline:
			return "stopped"
		case unifiedresources.StatusWarning:
			return "degraded"
		}
	}
	if resourceType == "pod" && resource.Kubernetes != nil {
		// Map the Kubernetes pod phase; unrecognized phases fall through.
		switch strings.ToLower(strings.TrimSpace(resource.Kubernetes.PodPhase)) {
		case "running":
			return "running"
		case "pending", "unknown":
			return "degraded"
		case "succeeded", "failed":
			return "stopped"
		}
	}
	isWorkload := monitorIsWorkloadType(resourceType) || resourceType == "pod"
	switch resource.Status {
	case unifiedresources.StatusOnline:
		if isWorkload {
			return "running"
		}
		return "online"
	case unifiedresources.StatusOffline:
		if isWorkload {
			return "stopped"
		}
		return "offline"
	case unifiedresources.StatusWarning:
		return "degraded"
	}
	return "unknown"
}
// monitorIsWorkloadType reports whether the resource type represents a guest
// workload (VM or container) rather than infrastructure.
func monitorIsWorkloadType(resourceType string) bool {
	return resourceType == "app-container" ||
		resourceType == "system-container" ||
		resourceType == "vm" ||
		resourceType == "oci-container"
}
// monitorClusterID resolves the cluster grouping key: the canonical cluster
// name when known, otherwise the Docker Swarm cluster name or, failing that,
// the Swarm cluster ID. Empty string when no cluster affiliation exists.
func monitorClusterID(resource unifiedresources.Resource) string {
	if name := strings.TrimSpace(unifiedresources.ResourceClusterName(resource)); name != "" {
		return name
	}
	docker := resource.Docker
	if docker == nil || docker.Swarm == nil {
		return ""
	}
	if name := strings.TrimSpace(docker.Swarm.ClusterName); name != "" {
		return name
	}
	return strings.TrimSpace(docker.Swarm.ClusterID)
}
// monitorMetricInput converts a unified metric value into the frontend metric
// input shape. The current reading prefers the percent figure, falls back to
// the raw value, and takes the larger of the two when both are non-zero.
// Used/Total are deep-copied; Free is derived when both are present.
func monitorMetricInput(metric *unifiedresources.MetricValue) *models.ResourceMetricInput {
	if metric == nil {
		return nil
	}
	var current float64
	switch {
	case metric.Percent != 0 && metric.Value != 0:
		current = math.Max(metric.Percent, metric.Value)
	case metric.Percent != 0:
		current = metric.Percent
	default:
		current = metric.Value
	}
	out := &models.ResourceMetricInput{Current: current}
	if metric.Total != nil {
		total := *metric.Total
		out.Total = &total
	}
	if metric.Used != nil {
		used := *metric.Used
		out.Used = &used
	}
	if out.Total != nil && out.Used != nil {
		free := *out.Total - *out.Used
		out.Free = &free
	}
	return out
}
// monitorNetworkMetricInput extracts rounded RX/TX readings from a resource's
// metrics. The boolean reports whether any network metric was present at all.
func monitorNetworkMetricInput(metrics *unifiedresources.ResourceMetrics) (bool, int64, int64) {
	if metrics == nil {
		return false, 0, 0
	}
	in, out := metrics.NetIn, metrics.NetOut
	if in == nil && out == nil {
		return false, 0, 0
	}
	var rx, tx int64
	if in != nil {
		rx = int64(math.Round(in.Value))
	}
	if out != nil {
		tx = int64(math.Round(out.Value))
	}
	return true, rx, tx
}
// monitorTemperature returns a copy of the first reported temperature from any
// attached platform payload, checked in fixed precedence order
// (agent, Proxmox, Docker, Kubernetes), or nil when none is set.
func monitorTemperature(resource unifiedresources.Resource) *float64 {
	var readings []*float64
	if resource.Agent != nil {
		readings = append(readings, resource.Agent.Temperature)
	}
	if resource.Proxmox != nil {
		readings = append(readings, resource.Proxmox.Temperature)
	}
	if resource.Docker != nil {
		readings = append(readings, resource.Docker.Temperature)
	}
	if resource.Kubernetes != nil {
		readings = append(readings, resource.Kubernetes.Temperature)
	}
	for _, reading := range readings {
		if reading != nil {
			value := *reading
			return &value
		}
	}
	return nil
}
// monitorUptime returns a copy of the first positive uptime reported by any
// attached platform payload, checked in fixed precedence order
// (agent, Proxmox, Docker, Kubernetes, PBS, PMG, TrueNAS), or nil when none.
func monitorUptime(resource unifiedresources.Resource) *int64 {
	var candidates []int64
	if resource.Agent != nil {
		candidates = append(candidates, resource.Agent.UptimeSeconds)
	}
	if resource.Proxmox != nil {
		candidates = append(candidates, resource.Proxmox.Uptime)
	}
	if resource.Docker != nil {
		candidates = append(candidates, resource.Docker.UptimeSeconds)
	}
	if resource.Kubernetes != nil {
		candidates = append(candidates, resource.Kubernetes.UptimeSeconds)
	}
	if resource.PBS != nil {
		candidates = append(candidates, resource.PBS.UptimeSeconds)
	}
	if resource.PMG != nil {
		candidates = append(candidates, resource.PMG.UptimeSeconds)
	}
	if resource.TrueNAS != nil {
		candidates = append(candidates, resource.TrueNAS.UptimeSeconds)
	}
	for _, uptime := range candidates {
		if uptime > 0 {
			value := uptime
			return &value
		}
	}
	return nil
}
// monitorLabels returns a defensive copy of the Kubernetes labels, or nil when
// the resource has none. Copying prevents the frontend payload from aliasing
// store-owned maps.
func monitorLabels(resource unifiedresources.Resource) map[string]string {
	k8s := resource.Kubernetes
	if k8s == nil || len(k8s.Labels) == 0 {
		return nil
	}
	out := make(map[string]string, len(k8s.Labels))
	for key, value := range k8s.Labels {
		out[key] = value
	}
	return out
}
// monitorIdentity builds the identity payload (hostname, machine ID, IPs) for
// a resource. Hostname preference: agent, then Docker, then Proxmox node,
// then any recorded identity hostname, then the caller-provided fallback.
// Returns nil when no identity information at all is available.
func monitorIdentity(resource unifiedresources.Resource, fallbackName string) *models.ResourceIdentityInput {
	var hostname string
	switch {
	case resource.Agent != nil && strings.TrimSpace(resource.Agent.Hostname) != "":
		hostname = strings.TrimSpace(resource.Agent.Hostname)
	case resource.Docker != nil && strings.TrimSpace(resource.Docker.Hostname) != "":
		hostname = strings.TrimSpace(resource.Docker.Hostname)
	case resource.Proxmox != nil && strings.TrimSpace(resource.Proxmox.NodeName) != "":
		hostname = strings.TrimSpace(resource.Proxmox.NodeName)
	}
	if hostname == "" {
		for _, candidate := range resource.Identity.Hostnames {
			if trimmed := strings.TrimSpace(candidate); trimmed != "" {
				hostname = trimmed
				break
			}
		}
	}
	if hostname == "" {
		hostname = fallbackName
	}
	// Keep the slice non-nil (even when empty) to match the serialized shape.
	ips := make([]string, 0, len(resource.Identity.IPAddresses))
	for _, ip := range resource.Identity.IPAddresses {
		if trimmed := strings.TrimSpace(ip); trimmed != "" {
			ips = append(ips, trimmed)
		}
	}
	machineID := strings.TrimSpace(resource.Identity.MachineID)
	if hostname == "" && machineID == "" && len(ips) == 0 {
		return nil
	}
	return &models.ResourceIdentityInput{
		Hostname:  hostname,
		MachineID: machineID,
		IPs:       ips,
	}
}
// monitorPlatformData builds the type-specific "platformData" JSON blob the
// frontend attaches to each resource. The exact keys of every payload map are
// part of the frontend contract. Returns nil when the resource type carries no
// payload or when marshalling fails; errors are intentionally swallowed
// (best-effort enrichment).
func monitorPlatformData(resource unifiedresources.Resource, resourceType string, platformID string) json.RawMessage {
	var payload interface{}
	switch resourceType {
	case "node":
		// Proxmox node summary. host/guestURL/loadAverage are emitted as
		// empty placeholders here by design.
		if resource.Proxmox != nil {
			payload = map[string]interface{}{
				"instance":         resource.Proxmox.Instance,
				"host":             "",
				"guestURL":         "",
				"pveVersion":       resource.Proxmox.PVEVersion,
				"kernelVersion":    resource.Proxmox.KernelVersion,
				"cpuInfo":          resource.Proxmox.CPUInfo,
				"loadAverage":      []float64{},
				"isClusterMember":  resource.Proxmox.ClusterName != "",
				"clusterName":      resource.Proxmox.ClusterName,
				"connectionHealth": monitorSourceStatus(resource.SourceStatus, unifiedresources.SourceProxmox),
			}
		}
	case "vm":
		payload = buildProxmoxVMPayload(resource)
	case "system-container", "oci-container":
		// Containers reuse the VM payload shape (vmid/node/io counters).
		payload = buildProxmoxVMPayload(resource)
	case "agent":
		// NOTE(review): an agent resource that also has a Proxmox payload
		// emits the node-shaped payload instead of agent details — confirm
		// this precedence is intended.
		if resource.Proxmox != nil {
			payload = map[string]interface{}{
				"instance":         resource.Proxmox.Instance,
				"host":             "",
				"guestURL":         "",
				"pveVersion":       resource.Proxmox.PVEVersion,
				"kernelVersion":    resource.Proxmox.KernelVersion,
				"cpuInfo":          resource.Proxmox.CPUInfo,
				"loadAverage":      []float64{},
				"isClusterMember":  resource.Proxmox.ClusterName != "",
				"clusterName":      resource.Proxmox.ClusterName,
				"connectionHealth": monitorSourceStatus(resource.SourceStatus, unifiedresources.SourceProxmox),
			}
		} else if resource.Agent != nil {
			payload = map[string]interface{}{
				"platform":      resource.Agent.Platform,
				"osName":        resource.Agent.OSName,
				"osVersion":     resource.Agent.OSVersion,
				"kernelVersion": resource.Agent.KernelVersion,
				"architecture":  resource.Agent.Architecture,
				"agentVersion":  resource.Agent.AgentVersion,
				"interfaces":    resource.Agent.NetworkInterfaces,
				"disks":         resource.Agent.Disks,
				"memory":        resource.Agent.Memory,
			}
		}
	case "docker-host":
		if resource.Docker != nil {
			payload = map[string]interface{}{
				"agentId":        platformID,
				"runtime":        resource.Docker.Runtime,
				"runtimeVersion": resource.Docker.RuntimeVersion,
				"dockerVersion":  resource.Docker.DockerVersion,
				"os":             resource.Docker.OS,
				"kernelVersion":  resource.Docker.KernelVersion,
				"architecture":   resource.Docker.Architecture,
				"agentVersion":   resource.Docker.AgentVersion,
				"swarm":          resource.Docker.Swarm,
				"interfaces":     resource.Docker.NetworkInterfaces,
				"disks":          resource.Docker.Disks,
			}
		}
	case "app-container":
		// health/createdAt are emitted as zero-value placeholders here.
		if resource.Docker != nil {
			payload = map[string]interface{}{
				"hostId":    monitorStringValue(resource.ParentID),
				"hostName":  resource.Docker.Hostname,
				"image":     resource.Docker.Image,
				"state":     strings.ToLower(string(resource.Status)),
				"status":    strings.ToLower(string(resource.Status)),
				"health":    "",
				"createdAt": time.Time{},
			}
		}
	case "k8s-cluster":
		if resource.Kubernetes != nil {
			payload = map[string]interface{}{
				"agentId":           resource.Kubernetes.AgentID,
				"server":            resource.Kubernetes.Server,
				"context":           resource.Kubernetes.Context,
				"version":           resource.Kubernetes.Version,
				"customDisplayName": "",
				"hidden":            false,
				"pendingUninstall":  resource.Kubernetes.PendingUninstall,
				"nodeCount":         resource.ChildCount,
			}
		}
	case "k8s-node":
		if resource.Kubernetes != nil {
			payload = map[string]interface{}{
				"clusterId":               resource.Kubernetes.ClusterID,
				"ready":                   resource.Kubernetes.Ready,
				"unschedulable":           resource.Kubernetes.Unschedulable,
				"kubeletVersion":          resource.Kubernetes.KubeletVersion,
				"containerRuntimeVersion": resource.Kubernetes.ContainerRuntimeVersion,
				"osImage":                 resource.Kubernetes.OSImage,
				"kernelVersion":           resource.Kubernetes.KernelVersion,
				"architecture":            resource.Kubernetes.Architecture,
				"capacityCpuCores":        resource.Kubernetes.CapacityCPU,
				"capacityMemoryBytes":     resource.Kubernetes.CapacityMemoryBytes,
				"capacityPods":            resource.Kubernetes.CapacityPods,
				"allocatableCpuCores":     resource.Kubernetes.AllocCPU,
				"allocatableMemoryBytes":  resource.Kubernetes.AllocMemoryBytes,
				"allocatablePods":         resource.Kubernetes.AllocPods,
				// Copy roles so the payload does not alias store-owned slices.
				"roles": append([]string(nil), resource.Kubernetes.Roles...),
			}
		}
	case "pod":
		if resource.Kubernetes != nil {
			payload = map[string]interface{}{
				"clusterId": resource.Kubernetes.ClusterID,
				"namespace": resource.Kubernetes.Namespace,
				"nodeName":  resource.Kubernetes.NodeName,
				"phase":     resource.Kubernetes.PodPhase,
				"restarts":  resource.Kubernetes.Restarts,
				"ownerKind": resource.Kubernetes.OwnerKind,
				"ownerName": resource.Kubernetes.OwnerName,
			}
		}
	case "k8s-deployment":
		if resource.Kubernetes != nil {
			payload = map[string]interface{}{
				"clusterId":         resource.Kubernetes.ClusterID,
				"namespace":         resource.Kubernetes.Namespace,
				"desiredReplicas":   resource.Kubernetes.DesiredReplicas,
				"updatedReplicas":   resource.Kubernetes.UpdatedReplicas,
				"readyReplicas":     resource.Kubernetes.ReadyReplicas,
				"availableReplicas": resource.Kubernetes.AvailableReplicas,
			}
		}
	case "pbs":
		if resource.PBS != nil {
			payload = map[string]interface{}{
				"host":             resource.PBS.Hostname,
				"version":          resource.PBS.Version,
				"connectionHealth": resource.PBS.ConnectionHealth,
				"memoryUsed":       monitorMetricUsed(monitorMetricValue(resource.Metrics, func(metrics *unifiedresources.ResourceMetrics) *unifiedresources.MetricValue { return metrics.Memory })),
				"memoryTotal":      monitorMetricTotal(monitorMetricValue(resource.Metrics, func(metrics *unifiedresources.ResourceMetrics) *unifiedresources.MetricValue { return metrics.Memory })),
				"numDatastores":    resource.PBS.DatastoreCount,
			}
		}
	case "pmg":
		if resource.PMG != nil {
			payload = map[string]interface{}{
				"host":             resource.PMG.Hostname,
				"version":          resource.PMG.Version,
				"connectionHealth": resource.PMG.ConnectionHealth,
				"nodeCount":        resource.PMG.NodeCount,
				"queueActive":      resource.PMG.QueueActive,
				"queueDeferred":    resource.PMG.QueueDeferred,
				"queueHold":        resource.PMG.QueueHold,
				"queueIncoming":    resource.PMG.QueueIncoming,
				"queueTotal":       resource.PMG.QueueTotal,
				"mailCountTotal":   resource.PMG.MailCountTotal,
				"spamIn":           resource.PMG.SpamIn,
				"virusIn":          resource.PMG.VirusIn,
				"lastUpdated":      resource.PMG.LastUpdated,
			}
		}
	case "storage", "pool":
		// type/content/shared/enabled are fixed defaults in this payload;
		// only node attribution and active state are real data here.
		nodeLabel := resource.ParentName
		if nodeLabel == "" {
			nodeLabel = monitorStringValue(resource.ParentID)
		}
		payload = map[string]interface{}{
			"instance": platformID,
			"node":     nodeLabel,
			"type":     "",
			"content":  "",
			"shared":   false,
			"enabled":  true,
			"active":   resource.Status == unifiedresources.StatusOnline,
		}
	}
	if payload == nil {
		return nil
	}
	// Best-effort: a marshal failure simply drops the platform data.
	encoded, err := json.Marshal(payload)
	if err != nil {
		return nil
	}
	return encoded
}
// convertProxmoxDisks flattens per-guest disk info into the generic map shape
// the frontend expects, deriving a usage percentage when the total is known.
// Returns nil for an empty input.
func convertProxmoxDisks(disks []unifiedresources.DiskInfo) []map[string]interface{} {
	if len(disks) == 0 {
		return nil
	}
	converted := make([]map[string]interface{}, 0, len(disks))
	for _, disk := range disks {
		var usage float64
		if disk.Total > 0 {
			usage = float64(disk.Used) / float64(disk.Total) * 100
		}
		converted = append(converted, map[string]interface{}{
			"total":      disk.Total,
			"used":       disk.Used,
			"free":       disk.Free,
			"usage":      usage,
			"mountpoint": disk.Mountpoint,
			"type":       disk.Filesystem,
			"device":     disk.Device,
		})
	}
	return converted
}
// buildProxmoxVMPayload assembles the Proxmox guest (VM/container) platform
// payload: identity, rounded IO counters, disk breakdown, memory ballooning
// and backup info. Returns nil when the resource has no Proxmox data.
// The map keys are part of the frontend contract.
func buildProxmoxVMPayload(resource unifiedresources.Resource) map[string]interface{} {
	if resource.Proxmox == nil {
		return nil
	}
	return map[string]interface{}{
		"vmid":       resource.Proxmox.VMID,
		"node":       resource.Proxmox.NodeName,
		"instance":   resource.Proxmox.Instance,
		"cpus":       resource.Proxmox.CPUs,
		"template":   resource.Proxmox.Template,
		"networkIn":  monitorMetricInt64(resource.Metrics, func(metrics *unifiedresources.ResourceMetrics) *unifiedresources.MetricValue { return metrics.NetIn }),
		"networkOut": monitorMetricInt64(resource.Metrics, func(metrics *unifiedresources.ResourceMetrics) *unifiedresources.MetricValue { return metrics.NetOut }),
		"diskRead":   monitorMetricInt64(resource.Metrics, func(metrics *unifiedresources.ResourceMetrics) *unifiedresources.MetricValue { return metrics.DiskRead }),
		"diskWrite": monitorMetricInt64(resource.Metrics, func(metrics *unifiedresources.ResourceMetrics) *unifiedresources.MetricValue {
			return metrics.DiskWrite
		}),
		"disks":     convertProxmoxDisks(resource.Proxmox.Disks),
		"swapUsed":  resource.Proxmox.SwapUsed,
		"swapTotal": resource.Proxmox.SwapTotal,
		"balloon":   resource.Proxmox.Balloon,
		"lastBackup": resource.Proxmox.LastBackup,
		// Copy IPs so the payload does not alias store-owned slices.
		"ipAddresses": append([]string(nil), resource.Identity.IPAddresses...),
	}
}
// monitorMetricValue safely extracts one metric pointer from an optional
// metrics container using the supplied selector; nil container yields nil.
func monitorMetricValue(metrics *unifiedresources.ResourceMetrics, pick func(*unifiedresources.ResourceMetrics) *unifiedresources.MetricValue) *unifiedresources.MetricValue {
	if metrics != nil {
		return pick(metrics)
	}
	return nil
}
// monitorMetricInt64 selects a metric and returns its raw value rounded to the
// nearest integer, or 0 when the metric is absent.
func monitorMetricInt64(metrics *unifiedresources.ResourceMetrics, pick func(*unifiedresources.ResourceMetrics) *unifiedresources.MetricValue) int64 {
	if metric := monitorMetricValue(metrics, pick); metric != nil {
		return int64(math.Round(metric.Value))
	}
	return 0
}
// monitorMetricUsed returns the metric's used-bytes figure, or 0 when absent.
func monitorMetricUsed(metric *unifiedresources.MetricValue) int64 {
	if metric != nil && metric.Used != nil {
		return *metric.Used
	}
	return 0
}
// monitorMetricTotal returns the metric's total-bytes figure, or 0 when absent.
func monitorMetricTotal(metric *unifiedresources.MetricValue) int64 {
	if metric != nil && metric.Total != nil {
		return *metric.Total
	}
	return 0
}
// monitorSourceStatus returns the recorded status string for the given data
// source, or "" when none is recorded. The explicit nil-map check in the
// original was redundant: indexing a nil map in Go is defined and returns the
// zero value with ok=false, so a single comma-ok lookup covers both cases.
func monitorSourceStatus(statuses map[unifiedresources.DataSource]unifiedresources.SourceStatus, source unifiedresources.DataSource) string {
	if status, ok := statuses[source]; ok {
		return status.Status
	}
	return ""
}
// monitorHasSource reports whether the given data source appears in sources.
func monitorHasSource(sources []unifiedresources.DataSource, source unifiedresources.DataSource) bool {
	for i := range sources {
		if sources[i] == source {
			return true
		}
	}
	return false
}
// monitorSourceType classifies how a resource's data arrives: "hybrid" for
// multiple sources, "agent" for a single push-based source (agent, Docker,
// Kubernetes), and "api" otherwise (including no sources at all).
func monitorSourceType(sources []unifiedresources.DataSource) string {
	switch len(sources) {
	case 0:
		return "api"
	case 1:
		switch sources[0] {
		case unifiedresources.SourceAgent, unifiedresources.SourceDocker, unifiedresources.SourceK8s:
			return "agent"
		}
		return "api"
	default:
		return "hybrid"
	}
}
// monitorStringValue dereferences an optional string, returning the trimmed
// value or "" for nil.
func monitorStringValue(value *string) string {
	if value != nil {
		return strings.TrimSpace(*value)
	}
	return ""
}
func monitorLastSeenUnix(value time.Time) int64 {
if value.IsZero() {
return time.Now().UTC().UnixMilli()
}
return value.UnixMilli()
}
// pollStorageBackupsWithNodes polls backups using a provided nodes list to avoid duplicate GetNodes calls
// Stop gracefully stops the monitor: it persists alert history, halts
// notification delivery, and flushes the persistent metrics store.
func (m *Monitor) Stop() {
	log.Info().Msg("stopping monitor")
	// Stopping the alert manager saves alert history.
	if am := m.alertManager; am != nil {
		am.Stop()
	}
	// Halt pending notification delivery.
	if nm := m.notificationMgr; nm != nil {
		nm.Stop()
	}
	// Closing the persistent metrics store flushes buffered samples to disk.
	if ms := m.metricsStore; ms != nil {
		if err := ms.Close(); err != nil {
			log.Error().Err(err).Msg("failed to close metrics store")
		} else {
			log.Info().Msg("metrics store closed successfully")
		}
	}
	log.Info().Msg("monitor stopped")
}
// recordAuthFailure records an authentication failure for a node