// Mirror of https://github.com/rcourtman/Pulse.git
// (synced 2026-05-07 00:37:36 +00:00; 5582 lines, 170 KiB, Go).

package monitoring
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
stderrors "errors"
|
|
"fmt"
|
|
"math"
|
|
"math/rand"
|
|
"os"
|
|
"path/filepath"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/ai/memory"
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/alerts"
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/config"
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/discovery"
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/logging"
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/mock"
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/models"
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/monitoring/errors"
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/notifications"
|
|
recoverymanager "github.com/rcourtman/pulse-go-rewrite/internal/recovery/manager"
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/system"
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/unifiedresources"
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/websocket"
|
|
"github.com/rcourtman/pulse-go-rewrite/pkg/metrics"
|
|
"github.com/rcourtman/pulse-go-rewrite/pkg/pbs"
|
|
"github.com/rcourtman/pulse-go-rewrite/pkg/pmg"
|
|
"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
|
|
"github.com/rs/zerolog"
|
|
"github.com/rs/zerolog/log"
|
|
)
|
|
|
|
// Poll task timeout values. The names suggest a default that is clamped to
// the [minTaskTimeout, maxTaskTimeout] range; the clamping itself is done by
// code outside this declaration (NOTE(review): confirm at the call site).
const (
	// defaultTaskTimeout is the default task timeout (90 seconds).
	defaultTaskTimeout = 90 * time.Second
	// minTaskTimeout is the smallest task timeout (30 seconds).
	minTaskTimeout = 30 * time.Second
	// maxTaskTimeout is the largest task timeout (3 minutes).
	maxTaskTimeout = 3 * time.Minute
)
|
|
|
|
// mockKeepRealPollingEnv names the environment variable read by
// keepRealPollingInMockMode.
const mockKeepRealPollingEnv = "PULSE_MOCK_KEEP_REAL_POLLING"

// keepRealPollingInMockMode reports whether the PULSE_MOCK_KEEP_REAL_POLLING
// environment variable holds an affirmative value: "1", "true", "yes" or "on",
// compared case-insensitively after trimming surrounding whitespace.
//
// Everything else yields false: an unset or empty variable, the explicit
// negatives "0", "false", "no" and "off", and any unrecognized value.
func keepRealPollingInMockMode() bool {
	switch strings.TrimSpace(strings.ToLower(os.Getenv(mockKeepRealPollingEnv))) {
	case "1", "true", "yes", "on":
		return true
	default:
		// Negative and unrecognized values are deliberately treated alike.
		return false
	}
}
|
|
|
|
// newProxmoxClientFunc is a variable that holds the function to create a new Proxmox client.
|
|
// It is used to allow mocking the client creation in tests.
|
|
var newProxmoxClientFunc = func(cfg proxmox.ClientConfig) (PVEClientInterface, error) {
|
|
return proxmox.NewClient(cfg)
|
|
}
|
|
|
|
// PVEClientInterface defines the interface for PVE clients (both regular and cluster)
|
|
type PVEClientInterface interface {
|
|
GetNodes(ctx context.Context) ([]proxmox.Node, error)
|
|
GetNodeStatus(ctx context.Context, node string) (*proxmox.NodeStatus, error)
|
|
GetNodeRRDData(ctx context.Context, node string, timeframe string, cf string, ds []string) ([]proxmox.NodeRRDPoint, error)
|
|
GetLXCRRDData(ctx context.Context, node string, vmid int, timeframe string, cf string, ds []string) ([]proxmox.GuestRRDPoint, error)
|
|
GetVMRRDData(ctx context.Context, node string, vmid int, timeframe string, cf string, ds []string) ([]proxmox.GuestRRDPoint, error)
|
|
GetVMs(ctx context.Context, node string) ([]proxmox.VM, error)
|
|
GetContainers(ctx context.Context, node string) ([]proxmox.Container, error)
|
|
GetStorage(ctx context.Context, node string) ([]proxmox.Storage, error)
|
|
GetAllStorage(ctx context.Context) ([]proxmox.Storage, error)
|
|
GetBackupTasks(ctx context.Context) ([]proxmox.Task, error)
|
|
GetReplicationStatus(ctx context.Context) ([]proxmox.ReplicationJob, error)
|
|
GetStorageContent(ctx context.Context, node, storage string) ([]proxmox.StorageContent, error)
|
|
GetVMSnapshots(ctx context.Context, node string, vmid int) ([]proxmox.Snapshot, error)
|
|
GetContainerSnapshots(ctx context.Context, node string, vmid int) ([]proxmox.Snapshot, error)
|
|
GetVMStatus(ctx context.Context, node string, vmid int) (*proxmox.VMStatus, error)
|
|
GetContainerStatus(ctx context.Context, node string, vmid int) (*proxmox.Container, error)
|
|
GetContainerConfig(ctx context.Context, node string, vmid int) (map[string]interface{}, error)
|
|
GetContainerInterfaces(ctx context.Context, node string, vmid int) ([]proxmox.ContainerInterface, error)
|
|
GetClusterResources(ctx context.Context, resourceType string) ([]proxmox.ClusterResource, error)
|
|
IsClusterMember(ctx context.Context) (bool, error)
|
|
GetVMFSInfo(ctx context.Context, node string, vmid int) ([]proxmox.VMFileSystem, error)
|
|
GetVMNetworkInterfaces(ctx context.Context, node string, vmid int) ([]proxmox.VMNetworkInterface, error)
|
|
GetVMAgentInfo(ctx context.Context, node string, vmid int) (map[string]interface{}, error)
|
|
GetVMAgentVersion(ctx context.Context, node string, vmid int) (string, error)
|
|
GetZFSPoolStatus(ctx context.Context, node string) ([]proxmox.ZFSPoolStatus, error)
|
|
GetZFSPoolsWithDetails(ctx context.Context, node string) ([]proxmox.ZFSPoolInfo, error)
|
|
GetDisks(ctx context.Context, node string) ([]proxmox.Disk, error)
|
|
GetNodePendingUpdates(ctx context.Context, node string) ([]proxmox.AptPackage, error)
|
|
GetCephStatus(ctx context.Context) (*proxmox.CephStatus, error)
|
|
GetCephDF(ctx context.Context) (*proxmox.CephDF, error)
|
|
}
|
|
|
|
// ResourceStoreInterface provides methods for polling optimization and resource access.
|
|
// When an agent is monitoring a node, we can reduce API polling for that node.
|
|
type ResourceStoreInterface interface {
|
|
// ShouldSkipAPIPolling returns true if API polling should be skipped for the hostname
|
|
// because an agent is providing richer data.
|
|
ShouldSkipAPIPolling(hostname string) bool
|
|
// GetPollingRecommendations returns a map of hostname -> polling multiplier.
|
|
// 0 = skip entirely, 0.5 = half frequency, 1 = normal
|
|
GetPollingRecommendations() map[string]float64
|
|
// GetAll returns all resources in the store (for WebSocket broadcasts)
|
|
GetAll() []unifiedresources.Resource
|
|
// PopulateFromSnapshot updates the store with data from a StateSnapshot
|
|
PopulateFromSnapshot(snapshot models.StateSnapshot)
|
|
}
|
|
|
|
// SupplementalRecordStore is an optional extension for resource stores that can
|
|
// ingest source-native unified records in addition to legacy snapshots.
|
|
type SupplementalRecordStore interface {
|
|
PopulateSupplementalRecords(source unifiedresources.DataSource, records []unifiedresources.IngestRecord)
|
|
}
|
|
|
|
// AtomicSnapshotResourceStore is an optional extension for stores that can
|
|
// atomically replace their canonical registry from a snapshot plus
|
|
// supplemental records in a single swap.
|
|
type AtomicSnapshotResourceStore interface {
|
|
PopulateSnapshotAndSupplemental(snapshot models.StateSnapshot, recordsBySource map[unifiedresources.DataSource][]unifiedresources.IngestRecord)
|
|
}
|
|
|
|
// MetricsTargetResourceStore optionally resolves the history/metrics target for
|
|
// a canonical resource in the live unified store.
|
|
type MetricsTargetResourceStore interface {
|
|
MetricsTargetForResource(resourceID string) *unifiedresources.MetricsTarget
|
|
}
|
|
|
|
// UnifiedResourceFreshnessStore is an optional extension for stores that track
// their own canonical-resource freshness independent of state.LastUpdate.
type UnifiedResourceFreshnessStore interface {
	// UnifiedResourceFreshness returns the store's freshness timestamp.
	UnifiedResourceFreshness() time.Time
}
|
|
|
|
// MonitorSupplementalRecordsProvider emits source-native records outside the
|
|
// poll-provider scheduling path (for example, dedicated background pollers).
|
|
type MonitorSupplementalRecordsProvider interface {
|
|
SupplementalRecords(m *Monitor, orgID string) []unifiedresources.IngestRecord
|
|
}
|
|
|
|
// MonitorSupplementalChangesProvider optionally emits canonical resource
|
|
// timeline changes alongside supplemental records.
|
|
type MonitorSupplementalChangesProvider interface {
|
|
SupplementalChanges(m *Monitor, orgID string) []unifiedresources.ResourceChange
|
|
}
|
|
|
|
// MonitorSupplementalInventoryReadinessProvider optionally reports when a
|
|
// supplemental provider's current org-scoped inventory is settled enough to be
|
|
// consumed by billing and monitored-system admission boundaries.
|
|
//
|
|
// Providers that suppress snapshot-owned sources must implement this contract
|
|
// so the monitor can fail closed until the canonical store has been rebuilt
|
|
// from a settled provider baseline.
|
|
type MonitorSupplementalInventoryReadinessProvider interface {
|
|
SupplementalInventoryReadyAt(m *Monitor, orgID string) (time.Time, bool)
|
|
}
|
|
|
|
// MonitorPhysicalDiskTemperatureHistoryProvider optionally exposes source-native
|
|
// physical-disk temperature history through the canonical monitoring chart
|
|
// boundary when Pulse's own stored history is shallow.
|
|
type MonitorPhysicalDiskTemperatureHistoryProvider interface {
|
|
PhysicalDiskTemperatureHistory(m *Monitor, orgID string, duration time.Duration) map[string][]MetricPoint
|
|
}
|
|
|
|
func getNodeDisplayName(instance *config.PVEInstance, nodeName string) string {
|
|
baseName := strings.TrimSpace(nodeName)
|
|
if baseName == "" {
|
|
baseName = "unknown-node"
|
|
}
|
|
|
|
if instance == nil {
|
|
return baseName
|
|
}
|
|
|
|
friendly := strings.TrimSpace(instance.Name)
|
|
|
|
if instance.IsCluster {
|
|
if endpointLabel := lookupClusterEndpointLabel(instance, nodeName); endpointLabel != "" {
|
|
return endpointLabel
|
|
}
|
|
|
|
if baseName != "" && baseName != "unknown-node" {
|
|
return baseName
|
|
}
|
|
|
|
if friendly != "" {
|
|
return friendly
|
|
}
|
|
|
|
return baseName
|
|
}
|
|
|
|
if friendly != "" {
|
|
return friendly
|
|
}
|
|
|
|
if baseName != "" && baseName != "unknown-node" {
|
|
return baseName
|
|
}
|
|
|
|
if label := normalizeEndpointHost(instance.Host); label != "" && !isLikelyIPAddress(label) {
|
|
return label
|
|
}
|
|
|
|
return baseName
|
|
}
|
|
|
|
func (m *Monitor) getInstanceConfig(instanceName string) *config.PVEInstance {
|
|
if m == nil {
|
|
return nil
|
|
}
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
if m.config == nil {
|
|
return nil
|
|
}
|
|
|
|
for i := range m.config.PVEInstances {
|
|
if strings.EqualFold(m.config.PVEInstances[i].Name, instanceName) {
|
|
instanceCopy := m.config.PVEInstances[i]
|
|
return &instanceCopy
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (m *Monitor) totalClientCount() int {
|
|
if m == nil {
|
|
return 0
|
|
}
|
|
|
|
total := 0
|
|
for _, provider := range m.pollProviderSnapshotWithBuiltins() {
|
|
if provider == nil {
|
|
continue
|
|
}
|
|
total += len(provider.ListInstances(m))
|
|
}
|
|
return total
|
|
}
|
|
|
|
func (m *Monitor) getPVEClient(name string) (PVEClientInterface, bool) {
|
|
if m == nil {
|
|
return nil, false
|
|
}
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
client, ok := m.pveClients[name]
|
|
return client, ok
|
|
}
|
|
|
|
func (m *Monitor) getPBSClient(name string) (*pbs.Client, bool) {
|
|
if m == nil {
|
|
return nil, false
|
|
}
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
client, ok := m.pbsClients[name]
|
|
return client, ok
|
|
}
|
|
|
|
func (m *Monitor) getPMGClient(name string) (*pmg.Client, bool) {
|
|
if m == nil {
|
|
return nil, false
|
|
}
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
client, ok := m.pmgClients[name]
|
|
return client, ok
|
|
}
|
|
|
|
func mergeNVMeTempsIntoDisks(disks []models.PhysicalDisk, nodes []models.Node) []models.PhysicalDisk {
|
|
if len(disks) == 0 || len(nodes) == 0 {
|
|
return disks
|
|
}
|
|
|
|
// Build temperature maps by node for both SMART and legacy NVMe data
|
|
smartTempsByNode := make(map[string][]models.DiskTemp)
|
|
nvmeTempsByNode := make(map[string][]models.NVMeTemp)
|
|
|
|
for _, node := range nodes {
|
|
log.Debug().
|
|
Str("nodeName", node.Name).
|
|
Bool("hasTemp", node.Temperature != nil).
|
|
Bool("tempAvailable", node.Temperature != nil && node.Temperature.Available).
|
|
Int("smartCount", func() int {
|
|
if node.Temperature != nil {
|
|
return len(node.Temperature.SMART)
|
|
}
|
|
return 0
|
|
}()).
|
|
Msg("mergeNVMeTempsIntoDisks: checking node temperature")
|
|
|
|
if node.Temperature == nil || !node.Temperature.Available {
|
|
continue
|
|
}
|
|
|
|
// Collect SMART temps (preferred source)
|
|
if len(node.Temperature.SMART) > 0 {
|
|
temps := make([]models.DiskTemp, len(node.Temperature.SMART))
|
|
copy(temps, node.Temperature.SMART)
|
|
smartTempsByNode[node.Name] = temps
|
|
log.Debug().
|
|
Str("nodeName", node.Name).
|
|
Int("smartTempCount", len(temps)).
|
|
Msg("mergeNVMeTempsIntoDisks: collected SMART temps for node")
|
|
}
|
|
|
|
// Collect legacy NVMe temps as fallback
|
|
if len(node.Temperature.NVMe) > 0 {
|
|
temps := make([]models.NVMeTemp, len(node.Temperature.NVMe))
|
|
copy(temps, node.Temperature.NVMe)
|
|
sort.Slice(temps, func(i, j int) bool {
|
|
return temps[i].Device < temps[j].Device
|
|
})
|
|
nvmeTempsByNode[node.Name] = temps
|
|
}
|
|
}
|
|
|
|
if len(smartTempsByNode) == 0 && len(nvmeTempsByNode) == 0 {
|
|
log.Debug().
|
|
Int("diskCount", len(disks)).
|
|
Msg("mergeNVMeTempsIntoDisks: no SMART or NVMe temperature data available")
|
|
return disks
|
|
}
|
|
|
|
log.Debug().
|
|
Int("smartNodeCount", len(smartTempsByNode)).
|
|
Int("nvmeNodeCount", len(nvmeTempsByNode)).
|
|
Int("diskCount", len(disks)).
|
|
Msg("mergeNVMeTempsIntoDisks: starting disk temperature merge")
|
|
|
|
updated := make([]models.PhysicalDisk, len(disks))
|
|
copy(updated, disks)
|
|
|
|
// Process SMART temperatures first (preferred method)
|
|
for i := range updated {
|
|
smartTemps, ok := smartTempsByNode[updated[i].Node]
|
|
log.Debug().
|
|
Str("diskDevPath", updated[i].DevPath).
|
|
Str("diskNode", updated[i].Node).
|
|
Bool("hasSMARTData", ok).
|
|
Int("smartTempCount", len(smartTemps)).
|
|
Msg("mergeNVMeTempsIntoDisks: checking disk for SMART temp match")
|
|
if !ok || len(smartTemps) == 0 {
|
|
continue
|
|
}
|
|
|
|
// Try to match by WWN (most reliable)
|
|
if updated[i].WWN != "" {
|
|
for _, temp := range smartTemps {
|
|
if temp.WWN != "" && strings.EqualFold(temp.WWN, updated[i].WWN) {
|
|
if temp.Temperature > 0 && !temp.StandbySkipped {
|
|
updated[i].Temperature = temp.Temperature
|
|
log.Debug().
|
|
Str("disk", updated[i].DevPath).
|
|
Str("wwn", updated[i].WWN).
|
|
Int("temp", temp.Temperature).
|
|
Msg("Matched SMART temperature by WWN")
|
|
}
|
|
continue
|
|
}
|
|
}
|
|
}
|
|
|
|
// Fall back to serial number match (case-insensitive)
|
|
if updated[i].Serial != "" && updated[i].Temperature == 0 {
|
|
for _, temp := range smartTemps {
|
|
if temp.Serial != "" && strings.EqualFold(temp.Serial, updated[i].Serial) {
|
|
if temp.Temperature > 0 && !temp.StandbySkipped {
|
|
updated[i].Temperature = temp.Temperature
|
|
log.Debug().
|
|
Str("disk", updated[i].DevPath).
|
|
Str("serial", updated[i].Serial).
|
|
Int("temp", temp.Temperature).
|
|
Msg("Matched SMART temperature by serial")
|
|
}
|
|
continue
|
|
}
|
|
}
|
|
}
|
|
|
|
// Last resort: match by device path (normalized)
|
|
if updated[i].Temperature == 0 {
|
|
normalizedDevPath := strings.TrimPrefix(updated[i].DevPath, "/dev/")
|
|
for _, temp := range smartTemps {
|
|
normalizedTempDev := strings.TrimPrefix(temp.Device, "/dev/")
|
|
if normalizedTempDev == normalizedDevPath {
|
|
if temp.Temperature > 0 && !temp.StandbySkipped {
|
|
updated[i].Temperature = temp.Temperature
|
|
log.Debug().
|
|
Str("disk", updated[i].DevPath).
|
|
Int("temp", temp.Temperature).
|
|
Msg("Matched SMART temperature by device path")
|
|
}
|
|
break
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Process legacy NVMe temperatures for disks that didn't get SMART data
|
|
disksByNode := make(map[string][]int)
|
|
for i := range updated {
|
|
if strings.EqualFold(updated[i].Type, "nvme") && updated[i].Temperature == 0 {
|
|
disksByNode[updated[i].Node] = append(disksByNode[updated[i].Node], i)
|
|
}
|
|
}
|
|
|
|
for nodeName, diskIndexes := range disksByNode {
|
|
temps, ok := nvmeTempsByNode[nodeName]
|
|
if !ok || len(temps) == 0 {
|
|
continue
|
|
}
|
|
|
|
sort.Slice(diskIndexes, func(i, j int) bool {
|
|
return updated[diskIndexes[i]].DevPath < updated[diskIndexes[j]].DevPath
|
|
})
|
|
|
|
for idx, diskIdx := range diskIndexes {
|
|
if idx >= len(temps) {
|
|
break
|
|
}
|
|
|
|
tempVal := temps[idx].Temp
|
|
if tempVal <= 0 || math.IsNaN(tempVal) {
|
|
continue
|
|
}
|
|
|
|
updated[diskIdx].Temperature = int(math.Round(tempVal))
|
|
log.Debug().
|
|
Str("disk", updated[diskIdx].DevPath).
|
|
Int("temp", updated[diskIdx].Temperature).
|
|
Msg("Matched legacy NVMe temperature by index")
|
|
}
|
|
}
|
|
|
|
return updated
|
|
}
|
|
|
|
// mergeHostAgentSMARTIntoDisks merges SMART temperature data from linked host agents
|
|
// into physical disks for Proxmox nodes. This allows disk temps collected by the
|
|
// pulse-agent running on a PVE node to populate the Physical Disks view.
|
|
func mergeHostAgentSMARTIntoDisks(disks []models.PhysicalDisk, nodes []models.Node, hosts []models.Host) []models.PhysicalDisk {
|
|
if len(disks) == 0 || len(nodes) == 0 || len(hosts) == 0 {
|
|
return disks
|
|
}
|
|
|
|
// Build a map of host ID to host for quick lookup
|
|
hostByID := make(map[string]*models.Host, len(hosts))
|
|
for i := range hosts {
|
|
hostByID[hosts[i].ID] = &hosts[i]
|
|
}
|
|
|
|
// Build a map of node name to linked host's SMART data
|
|
smartByNodeName := make(map[string][]models.HostDiskSMART)
|
|
for _, node := range nodes {
|
|
if node.LinkedAgentID == "" {
|
|
continue
|
|
}
|
|
host, ok := hostByID[node.LinkedAgentID]
|
|
if !ok || len(host.Sensors.SMART) == 0 {
|
|
continue
|
|
}
|
|
smartByNodeName[node.Name] = host.Sensors.SMART
|
|
log.Debug().
|
|
Str("nodeName", node.Name).
|
|
Str("hostAgentID", node.LinkedAgentID).
|
|
Int("smartDiskCount", len(host.Sensors.SMART)).
|
|
Msg("mergeHostAgentSMARTIntoDisks: found linked host agent with SMART data")
|
|
}
|
|
|
|
if len(smartByNodeName) == 0 {
|
|
return disks
|
|
}
|
|
|
|
updated := make([]models.PhysicalDisk, len(disks))
|
|
copy(updated, disks)
|
|
|
|
for i := range updated {
|
|
smartData, ok := smartByNodeName[updated[i].Node]
|
|
if !ok || len(smartData) == 0 {
|
|
continue
|
|
}
|
|
|
|
// Find matching SMART entry by WWN, serial, or device path
|
|
var matched *models.HostDiskSMART
|
|
|
|
// Try to match by WWN (most reliable)
|
|
if updated[i].WWN != "" {
|
|
for j := range smartData {
|
|
if smartData[j].WWN != "" && strings.EqualFold(smartData[j].WWN, updated[i].WWN) {
|
|
matched = &smartData[j]
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
// Fall back to serial number match
|
|
if matched == nil && updated[i].Serial != "" {
|
|
for j := range smartData {
|
|
if smartData[j].Serial != "" && strings.EqualFold(smartData[j].Serial, updated[i].Serial) {
|
|
matched = &smartData[j]
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
// Last resort: match by device path
|
|
if matched == nil {
|
|
normalizedDevPath := strings.TrimPrefix(updated[i].DevPath, "/dev/")
|
|
for j := range smartData {
|
|
normalizedDiskDev := strings.TrimPrefix(smartData[j].Device, "/dev/")
|
|
if normalizedDiskDev == normalizedDevPath {
|
|
matched = &smartData[j]
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
if matched == nil || matched.Standby {
|
|
continue
|
|
}
|
|
|
|
// Merge temperature if not already set
|
|
if updated[i].Temperature == 0 && matched.Temperature > 0 {
|
|
updated[i].Temperature = matched.Temperature
|
|
log.Debug().
|
|
Str("device", updated[i].DevPath).
|
|
Int("temp", matched.Temperature).
|
|
Msg("Matched host agent SMART temperature")
|
|
}
|
|
|
|
// Always merge SMART attributes from host agent
|
|
if matched.Attributes != nil {
|
|
updated[i].SmartAttributes = smartAttributesCopy(matched.Attributes)
|
|
if updated[i].Wearout < 0 {
|
|
if derivedWearout := deriveWearoutFromSMARTAttributes(matched.Attributes); derivedWearout >= 0 {
|
|
updated[i].Wearout = derivedWearout
|
|
}
|
|
}
|
|
}
|
|
|
|
if (strings.TrimSpace(updated[i].Health) == "" || strings.EqualFold(updated[i].Health, "unknown")) && strings.TrimSpace(matched.Health) != "" {
|
|
updated[i].Health = matched.Health
|
|
}
|
|
}
|
|
|
|
return updated
|
|
}
|
|
|
|
func deriveWearoutFromSMARTAttributes(attrs *models.SMARTAttributes) int {
|
|
if attrs == nil || attrs.PercentageUsed == nil {
|
|
return -1
|
|
}
|
|
|
|
used := *attrs.PercentageUsed
|
|
if used < 0 {
|
|
used = 0
|
|
}
|
|
if used > 100 {
|
|
used = 100
|
|
}
|
|
return 100 - used
|
|
}
|
|
|
|
func physicalDiskFromReadStateView(view *unifiedresources.PhysicalDiskView) models.PhysicalDisk {
|
|
if view == nil {
|
|
return models.PhysicalDisk{}
|
|
}
|
|
|
|
return models.PhysicalDisk{
|
|
ID: view.ID(),
|
|
Node: view.Node(),
|
|
Instance: view.Instance(),
|
|
DevPath: view.DevPath(),
|
|
Model: view.Model(),
|
|
Serial: view.Serial(),
|
|
WWN: view.WWN(),
|
|
Type: view.DiskType(),
|
|
Size: view.SizeBytes(),
|
|
Health: view.Health(),
|
|
Wearout: view.Wearout(),
|
|
Temperature: view.Temperature(),
|
|
RPM: view.RPM(),
|
|
Used: view.Used(),
|
|
SmartAttributes: smartAttributesFromUnifiedMeta(view.SMART()),
|
|
LastChecked: view.LastSeen(),
|
|
}
|
|
}
|
|
|
|
func smartAttributesFromUnifiedMeta(in *unifiedresources.SMARTMeta) *models.SMARTAttributes {
|
|
if in == nil {
|
|
return nil
|
|
}
|
|
|
|
out := &models.SMARTAttributes{}
|
|
if in.PowerOnHours != 0 {
|
|
value := in.PowerOnHours
|
|
out.PowerOnHours = &value
|
|
}
|
|
if in.PowerCycles != 0 {
|
|
value := in.PowerCycles
|
|
out.PowerCycles = &value
|
|
}
|
|
if in.ReallocatedSectors != 0 {
|
|
value := in.ReallocatedSectors
|
|
out.ReallocatedSectors = &value
|
|
}
|
|
if in.PendingSectors != 0 {
|
|
value := in.PendingSectors
|
|
out.PendingSectors = &value
|
|
}
|
|
if in.OfflineUncorrectable != 0 {
|
|
value := in.OfflineUncorrectable
|
|
out.OfflineUncorrectable = &value
|
|
}
|
|
if in.UDMACRCErrors != 0 {
|
|
value := in.UDMACRCErrors
|
|
out.UDMACRCErrors = &value
|
|
}
|
|
if in.PercentageUsed != 0 {
|
|
value := in.PercentageUsed
|
|
out.PercentageUsed = &value
|
|
}
|
|
if in.AvailableSpare != 0 {
|
|
value := in.AvailableSpare
|
|
out.AvailableSpare = &value
|
|
}
|
|
if in.MediaErrors != 0 {
|
|
value := in.MediaErrors
|
|
out.MediaErrors = &value
|
|
}
|
|
if in.UnsafeShutdowns != 0 {
|
|
value := in.UnsafeShutdowns
|
|
out.UnsafeShutdowns = &value
|
|
}
|
|
if out.PowerOnHours == nil &&
|
|
out.PowerCycles == nil &&
|
|
out.ReallocatedSectors == nil &&
|
|
out.PendingSectors == nil &&
|
|
out.OfflineUncorrectable == nil &&
|
|
out.UDMACRCErrors == nil &&
|
|
out.PercentageUsed == nil &&
|
|
out.AvailableSpare == nil &&
|
|
out.MediaErrors == nil &&
|
|
out.UnsafeShutdowns == nil {
|
|
return nil
|
|
}
|
|
return out
|
|
}
|
|
|
|
func physicalDisksForInstanceFromReadState(readState unifiedresources.ReadState, instance string) []models.PhysicalDisk {
|
|
if readState == nil {
|
|
return nil
|
|
}
|
|
|
|
out := make([]models.PhysicalDisk, 0)
|
|
for _, disk := range readState.PhysicalDisks() {
|
|
if disk == nil || disk.Instance() != instance {
|
|
continue
|
|
}
|
|
out = append(out, physicalDiskFromReadStateView(disk))
|
|
}
|
|
return out
|
|
}
|
|
|
|
func nodesForInstanceFromReadState(readState unifiedresources.ReadState, instance string) []models.Node {
|
|
if readState == nil {
|
|
return nil
|
|
}
|
|
|
|
out := make([]models.Node, 0)
|
|
for _, node := range readState.Nodes() {
|
|
if node == nil || node.Instance() != instance {
|
|
continue
|
|
}
|
|
out = append(out, nodeFromReadStateView(node))
|
|
}
|
|
return out
|
|
}
|
|
|
|
func hostsFromReadState(readState unifiedresources.ReadState) []models.Host {
|
|
if readState == nil {
|
|
return nil
|
|
}
|
|
|
|
out := make([]models.Host, 0)
|
|
for _, host := range readState.Hosts() {
|
|
if host == nil {
|
|
continue
|
|
}
|
|
out = append(out, hostFromReadStateView(host))
|
|
}
|
|
return out
|
|
}
|
|
|
|
// writeSMARTMetrics writes SMART temperature history to the in-memory chart
|
|
// buffer and persists SMART attributes when the metrics store is enabled.
|
|
func (m *Monitor) writeSMARTMetrics(disk models.PhysicalDisk, now time.Time) {
|
|
if shouldSkipNativeMockStateMetricWrites() {
|
|
return
|
|
}
|
|
|
|
resourceID := unifiedresources.PhysicalDiskMetricID(disk)
|
|
if resourceID == "" {
|
|
return
|
|
}
|
|
|
|
// Temperature (always write if > 0)
|
|
if disk.Temperature > 0 {
|
|
if m.metricsHistory != nil {
|
|
m.metricsHistory.AddDiskMetric(resourceID, "smart_temp", float64(disk.Temperature), now)
|
|
}
|
|
if m.metricsStore != nil {
|
|
m.metricsStore.Write("disk", resourceID, "smart_temp", float64(disk.Temperature), now)
|
|
}
|
|
}
|
|
|
|
attrs := disk.SmartAttributes
|
|
if attrs == nil || m.metricsStore == nil {
|
|
return
|
|
}
|
|
|
|
// Common
|
|
if attrs.PowerOnHours != nil {
|
|
m.metricsStore.Write("disk", resourceID, "smart_power_on_hours", float64(*attrs.PowerOnHours), now)
|
|
}
|
|
if attrs.PowerCycles != nil {
|
|
m.metricsStore.Write("disk", resourceID, "smart_power_cycles", float64(*attrs.PowerCycles), now)
|
|
}
|
|
|
|
// SATA-specific
|
|
if attrs.ReallocatedSectors != nil {
|
|
m.metricsStore.Write("disk", resourceID, "smart_reallocated_sectors", float64(*attrs.ReallocatedSectors), now)
|
|
}
|
|
if attrs.PendingSectors != nil {
|
|
m.metricsStore.Write("disk", resourceID, "smart_pending_sectors", float64(*attrs.PendingSectors), now)
|
|
}
|
|
if attrs.OfflineUncorrectable != nil {
|
|
m.metricsStore.Write("disk", resourceID, "smart_offline_uncorrectable", float64(*attrs.OfflineUncorrectable), now)
|
|
}
|
|
if attrs.UDMACRCErrors != nil {
|
|
m.metricsStore.Write("disk", resourceID, "smart_crc_errors", float64(*attrs.UDMACRCErrors), now)
|
|
}
|
|
|
|
// NVMe-specific
|
|
if attrs.PercentageUsed != nil {
|
|
m.metricsStore.Write("disk", resourceID, "smart_percentage_used", float64(*attrs.PercentageUsed), now)
|
|
}
|
|
if attrs.AvailableSpare != nil {
|
|
m.metricsStore.Write("disk", resourceID, "smart_available_spare", float64(*attrs.AvailableSpare), now)
|
|
}
|
|
if attrs.MediaErrors != nil {
|
|
m.metricsStore.Write("disk", resourceID, "smart_media_errors", float64(*attrs.MediaErrors), now)
|
|
}
|
|
if attrs.UnsafeShutdowns != nil {
|
|
m.metricsStore.Write("disk", resourceID, "smart_unsafe_shutdowns", float64(*attrs.UnsafeShutdowns), now)
|
|
}
|
|
}
|
|
|
|
// PollExecutor defines the contract for executing polling tasks.
|
|
type PollExecutor interface {
|
|
Execute(ctx context.Context, task PollTask)
|
|
}
|
|
|
|
type realExecutor struct {
|
|
monitor *Monitor
|
|
}
|
|
|
|
func newRealExecutor(m *Monitor) PollExecutor {
|
|
return &realExecutor{monitor: m}
|
|
}
|
|
|
|
func (r *realExecutor) Execute(ctx context.Context, task PollTask) {
|
|
if r == nil || r.monitor == nil {
|
|
return
|
|
}
|
|
|
|
if task.Run != nil {
|
|
task.Run(ctx)
|
|
return
|
|
}
|
|
|
|
switch strings.ToLower(task.InstanceType) {
|
|
case "pve":
|
|
if task.PVEClient == nil {
|
|
log.Warn().
|
|
Str("instance", task.InstanceName).
|
|
Msg("PollExecutor received nil PVE client")
|
|
return
|
|
}
|
|
r.monitor.pollPVEInstance(ctx, task.InstanceName, task.PVEClient)
|
|
case "pbs":
|
|
if task.PBSClient == nil {
|
|
log.Warn().
|
|
Str("instance", task.InstanceName).
|
|
Msg("PollExecutor received nil PBS client")
|
|
return
|
|
}
|
|
r.monitor.pollPBSInstance(ctx, task.InstanceName, task.PBSClient)
|
|
case "pmg":
|
|
if task.PMGClient == nil {
|
|
log.Warn().
|
|
Str("instance", task.InstanceName).
|
|
Msg("PollExecutor received nil PMG client")
|
|
return
|
|
}
|
|
r.monitor.pollPMGInstance(ctx, task.InstanceName, task.PMGClient)
|
|
default:
|
|
if logging.IsLevelEnabled(zerolog.DebugLevel) {
|
|
log.Debug().
|
|
Str("instance", task.InstanceName).
|
|
Str("type", task.InstanceType).
|
|
Msg("PollExecutor received unsupported task type")
|
|
}
|
|
}
|
|
}
|
|
|
|
type instanceInfo struct {
|
|
Key string
|
|
Type InstanceType
|
|
DisplayName string
|
|
Connection string
|
|
Metadata map[string]string
|
|
}
|
|
|
|
// pollStatus is the internal bookkeeping record for poll outcomes: the last
// success, the last error with its message and category, and the current
// failure streak. Entries are stored in Monitor.pollStatusMap.
type pollStatus struct {
	LastSuccess         time.Time
	LastErrorAt         time.Time
	LastErrorMessage    string
	LastErrorCategory   string
	ConsecutiveFailures int
	FirstFailureAt      time.Time
}
|
|
|
|
// dlqInsight is the internal record of a dead-letter entry: the reason, the
// first and last attempt times, the retry count and the next retry time.
// Entries are stored in Monitor.dlqInsightMap.
type dlqInsight struct {
	Reason       string
	FirstAttempt time.Time
	LastAttempt  time.Time
	RetryCount   int
	NextRetry    time.Time
}
|
|
|
|
// ErrorDetail is the JSON representation of a single error: when it happened,
// its message and its category.
type ErrorDetail struct {
	At       time.Time `json:"at"`
	Message  string    `json:"message"`
	Category string    `json:"category"`
}
|
|
|
|
type InstancePollStatus struct {
|
|
LastSuccess *time.Time `json:"lastSuccess,omitempty"`
|
|
LastError *ErrorDetail `json:"lastError,omitempty"`
|
|
ConsecutiveFailures int `json:"consecutiveFailures"`
|
|
FirstFailureAt *time.Time `json:"firstFailureAt,omitempty"`
|
|
}
|
|
|
|
// InstanceBreaker is the JSON representation of an instance's circuit breaker:
// its state, the related timestamps (omitted from the output when nil) and the
// failure count.
type InstanceBreaker struct {
	State          string     `json:"state"`
	Since          *time.Time `json:"since,omitempty"`
	LastTransition *time.Time `json:"lastTransition,omitempty"`
	RetryAt        *time.Time `json:"retryAt,omitempty"`
	FailureCount   int        `json:"failureCount"`
}
|
|
|
|
// InstanceDLQ is the JSON representation of an instance's dead-letter state.
// Present is always emitted; every other field is omitted from the output when
// it holds its zero value.
type InstanceDLQ struct {
	Present      bool       `json:"present"`
	Reason       string     `json:"reason,omitempty"`
	FirstAttempt *time.Time `json:"firstAttempt,omitempty"`
	LastAttempt  *time.Time `json:"lastAttempt,omitempty"`
	RetryCount   int        `json:"retryCount,omitempty"`
	NextRetry    *time.Time `json:"nextRetry,omitempty"`
}
|
|
|
|
type InstanceHealth struct {
|
|
Key string `json:"key"`
|
|
Type string `json:"type"`
|
|
DisplayName string `json:"displayName"`
|
|
Instance string `json:"instance"`
|
|
Connection string `json:"connection"`
|
|
PollStatus InstancePollStatus `json:"pollStatus"`
|
|
Breaker InstanceBreaker `json:"breaker"`
|
|
DeadLetter InstanceDLQ `json:"deadLetter"`
|
|
Warnings []string `json:"warnings"`
|
|
}
|
|
|
|
func (h InstanceHealth) NormalizeCollections() InstanceHealth {
|
|
if h.Warnings == nil {
|
|
h.Warnings = []string{}
|
|
}
|
|
return h
|
|
}
|
|
|
|
// Monitor handles all monitoring operations
|
|
type Monitor struct {
|
|
config *config.Config
|
|
state *models.State
|
|
orgID string // Organization ID for tenant isolation (empty = default/legacy)
|
|
pveClients map[string]PVEClientInterface
|
|
pbsClients map[string]*pbs.Client
|
|
pmgClients map[string]*pmg.Client
|
|
pollProviders map[InstanceType]PollProvider
|
|
pollMetrics *PollMetrics
|
|
scheduler *AdaptiveScheduler
|
|
stalenessTracker *StalenessTracker
|
|
taskQueue *TaskQueue
|
|
pollTimeout time.Duration
|
|
circuitBreakers map[string]*circuitBreaker
|
|
deadLetterQueue *TaskQueue
|
|
failureCounts map[string]int
|
|
lastOutcome map[string]taskOutcome
|
|
backoffCfg backoffConfig
|
|
rng *rand.Rand
|
|
maxRetryAttempts int
|
|
tempCollector *TemperatureCollector // SSH-based temperature collector
|
|
guestMetadataStore *config.GuestMetadataStore
|
|
dockerMetadataStore *config.DockerMetadataStore
|
|
hostMetadataStore *config.HostMetadataStore
|
|
mu sync.RWMutex
|
|
startTime time.Time
|
|
rateTracker *RateTracker
|
|
metricsHistory *MetricsHistory
|
|
metricsStore *metrics.Store // Persistent SQLite metrics storage
|
|
alertManager *alerts.Manager
|
|
alertResolvedAICallback func(*alerts.Alert)
|
|
alertTriggeredAICallback func(*alerts.Alert)
|
|
incidentStore *memory.IncidentStore
|
|
notificationMgr *notifications.NotificationManager
|
|
configPersist *config.ConfigPersistence
|
|
discoveryService *discovery.Service // Background discovery service
|
|
activePollCount int32 // Number of active polling operations
|
|
pollCounter int64 // Counter for polling cycles
|
|
authFailures map[string]int // Track consecutive auth failures per node
|
|
lastAuthAttempt map[string]time.Time // Track last auth attempt time
|
|
lastClusterCheck map[string]time.Time // Track last cluster check for standalone nodes
|
|
lastPhysicalDiskPoll map[string]time.Time // Track last physical disk poll time per instance
|
|
lastPVEBackupPoll map[string]time.Time // Track last PVE backup poll per instance
|
|
lastPBSBackupPoll map[string]time.Time // Track last PBS backup poll per instance
|
|
backupPermissionWarnings map[string]string // Track backup permission issues per instance (instance -> warning message)
|
|
persistence *config.ConfigPersistence // Add persistence for saving updated configs
|
|
pbsBackupPollers map[string]bool // Track PBS backup polling goroutines per instance
|
|
pbsBackupCacheTime map[string]map[pbsBackupGroupKey]time.Time // Track when each PBS backup group was last fetched
|
|
runtimeCtx context.Context // Context used while monitor is running
|
|
wsHub *websocket.Hub // Hub used for broadcasting state
|
|
diagMu sync.RWMutex // Protects diagnostic snapshot maps
|
|
nodeSnapshots map[string]NodeMemorySnapshot
|
|
guestSnapshots map[string]GuestMemorySnapshot
|
|
rrdCacheMu sync.RWMutex // Protects short-lived guest memory caches.
|
|
nodeRRDMemCache map[string]rrdMemCacheEntry
|
|
vmRRDMemCache map[string]rrdMemCacheEntry
|
|
vmAgentMemCache map[string]agentMemCacheEntry
|
|
removedDockerHosts map[string]time.Time // Track deliberately removed Docker hosts (ID -> removal time)
|
|
dockerTokenBindings map[string]string // Track token ID -> agent ID bindings to enforce uniqueness
|
|
removedKubernetesClusters map[string]time.Time // Track deliberately removed Kubernetes clusters (ID -> removal time)
|
|
kubernetesTokenBindings map[string]string // Track token ID -> agent ID bindings to enforce uniqueness
|
|
removedHostAgents map[string]time.Time // Track deliberately removed host agents (ID -> removal time)
|
|
hostTokenBindings map[string]string // Track tokenID:hostname -> host identity bindings
|
|
dockerCommands map[string]*dockerHostCommand
|
|
dockerCommandIndex map[string]string
|
|
guestMetadataMu sync.RWMutex
|
|
guestMetadataCache map[string]guestMetadataCacheEntry
|
|
guestMetadataLimiterMu sync.Mutex
|
|
guestMetadataLimiter map[string]time.Time
|
|
guestMetadataSlots chan struct{}
|
|
guestMetadataMinRefresh time.Duration
|
|
guestMetadataRefreshJitter time.Duration
|
|
guestMetadataRetryBackoff time.Duration
|
|
guestMetadataHoldDuration time.Duration
|
|
// Configurable guest agent timeouts (refs #592)
|
|
guestAgentFSInfoTimeout time.Duration
|
|
guestAgentNetworkTimeout time.Duration
|
|
guestAgentOSInfoTimeout time.Duration
|
|
guestAgentVersionTimeout time.Duration
|
|
guestAgentRetries int
|
|
executor PollExecutor
|
|
breakerBaseRetry time.Duration
|
|
breakerMaxDelay time.Duration
|
|
breakerHalfOpenWindow time.Duration
|
|
instanceInfoCache map[string]*instanceInfo
|
|
pollStatusMap map[string]*pollStatus
|
|
dlqInsightMap map[string]*dlqInsight
|
|
nodeLastOnline map[string]time.Time // Track last time each node was seen online (for grace period)
|
|
nodePendingUpdatesCache map[string]pendingUpdatesCache // Cache pending updates per node (checked every 30 min)
|
|
resourceStore ResourceStoreInterface // Optional unified resource store for polling optimization
|
|
supplementalProviders map[unifiedresources.DataSource]MonitorSupplementalRecordsProvider
|
|
recoveryManager *recoverymanager.Manager // Optional recovery store manager for backup rollups
|
|
mockMetricsCancel context.CancelFunc
|
|
mockMetricsWg sync.WaitGroup
|
|
dockerChecker DockerChecker // Optional Docker checker for LXC containers
|
|
// Agent profile cache to avoid disk I/O on every report (refs #1094)
|
|
agentProfileCacheMu sync.RWMutex
|
|
agentProfileCache *agentProfileCacheEntry
|
|
// Cluster sensor cache: temperature data collected by an agent on one Proxmox
|
|
// cluster node via SSH to its siblings. Keyed by lowercase node name.
|
|
clusterSensorsMu sync.RWMutex
|
|
clusterSensorsCache map[string]clusterSensorsCacheEntry
|
|
mockChartCacheMu sync.RWMutex
|
|
mockChartMapCache map[mockChartMetricMapCacheKey]map[string][]MetricPoint
|
|
}
|
|
|
|
// clusterSensorsCacheEntry stores temperature data collected by a sibling agent via SSH.
|
|
type clusterSensorsCacheEntry struct {
|
|
sensors models.HostSensorSummary
|
|
updatedAt time.Time
|
|
}
|
|
|
|
// rrdMemCacheEntry is one cached result of an RRD lookup. It carries the
// memory figures, the optional network counters, and the time the lookup
// that produced it was started.
type rrdMemCacheEntry struct {
	// Memory values taken from the RRD series; zero means "not found".
	available, used, total uint64

	// Network values taken from the RRD series. They are only meaningful when
	// the matching has* flag is set.
	netIn, netOut       float64
	hasNetIn, hasNetOut bool

	// fetchedAt is compared against the cache TTL to decide freshness.
	fetchedAt time.Time
}
|
|
|
|
// pendingUpdatesCache holds the cached number of pending apt updates for one
// node together with a timestamp for the check.
type pendingUpdatesCache struct {
	// count is the cached number of pending updates.
	count int
	// checkedAt is the timestamp stored with the count (presumably when the
	// node was last queried — confirm against the code that fills the cache).
	checkedAt time.Time
}
|
|
|
|
// pendingUpdatesCacheTTL is how long a pending-updates result stays valid:
// half an hour, trading freshness against API load.
const pendingUpdatesCacheTTL = time.Hour / 2
|
|
|
|
// agentProfileCacheEntry caches agent profiles and assignments to avoid disk I/O on every agent report.
|
|
// TTL is 60 seconds to balance freshness with performance.
|
|
type agentProfileCacheEntry struct {
|
|
profiles []models.AgentProfile
|
|
assignments []models.AgentProfileAssignment
|
|
loadedAt time.Time
|
|
}
|
|
|
|
// agentProfileCacheTTL is the lifetime of the agent profile cache: one minute.
const agentProfileCacheTTL = time.Minute
|
|
|
|
// shouldRunBackupPoll determines whether a backup polling cycle should execute.
|
|
// Returns whether polling should run, a human-readable skip reason, and the timestamp to record.
|
|
func (m *Monitor) shouldRunBackupPoll(last time.Time, now time.Time) (bool, string, time.Time) {
|
|
if m == nil || m.config == nil {
|
|
return false, "configuration unavailable", last
|
|
}
|
|
|
|
if !m.config.EnableBackupPolling {
|
|
return false, "backup polling globally disabled", last
|
|
}
|
|
|
|
interval := m.config.BackupPollingInterval
|
|
if interval > 0 {
|
|
if !last.IsZero() && now.Sub(last) < interval {
|
|
next := last.Add(interval)
|
|
return false, fmt.Sprintf("next run scheduled for %s", next.Format(time.RFC3339)), last
|
|
}
|
|
return true, "", now
|
|
}
|
|
|
|
backupCycles := m.config.BackupPollingCycles
|
|
if backupCycles <= 0 {
|
|
backupCycles = 10
|
|
}
|
|
|
|
if m.pollCounter%int64(backupCycles) == 0 || m.pollCounter == 1 {
|
|
return true, "", now
|
|
}
|
|
|
|
remaining := int64(backupCycles) - (m.pollCounter % int64(backupCycles))
|
|
return false, fmt.Sprintf("next run in %d polling cycles", remaining), last
|
|
}
|
|
|
|
// Connection-key prefixes, agent health-window tuning and RRD lookup limits.
const (
	// Prefixes placed in front of an agent ID to form its connection-status key.
	dockerConnectionPrefix     = "docker-"
	kubernetesConnectionPrefix = "kubernetes-"
	hostConnectionPrefix       = "host-"

	// Docker agent health window tuning.
	dockerOfflineGraceMultiplier = 4
	dockerMinimumHealthWindow    = 30 * time.Second
	dockerMaximumHealthWindow    = 10 * time.Minute

	// Kubernetes agent health window tuning.
	kubernetesOfflineGraceMultiplier = 4
	kubernetesMinimumHealthWindow    = 30 * time.Second
	kubernetesMaximumHealthWindow    = 10 * time.Minute

	// Host agent health window tuning.
	hostOfflineGraceMultiplier = 6
	hostMinimumHealthWindow    = 60 * time.Second
	hostMaximumHealthWindow    = 10 * time.Minute

	// nodeOfflineGracePeriod is the grace period before marking Proxmox nodes offline.
	nodeOfflineGracePeriod = 60 * time.Second

	// nodeRRDCacheTTL is how long a cached RRD lookup is served before a new
	// request is made; nodeRRDRequestTimeout bounds each such request.
	nodeRRDCacheTTL       = 30 * time.Second
	nodeRRDRequestTimeout = 2 * time.Second
)
|
|
|
|
// taskOutcome is the value type of the Monitor's lastOutcome map.
// NOTE(review): field meanings below are read off the names — confirm against
// the code that records outcomes.
type taskOutcome struct {
	// success and transient flag the result; transient presumably marks a
	// failure that is worth retrying.
	success, transient bool
	// err is the error stored with the outcome, if any.
	err error
	// recordedAt is the timestamp stored with the outcome.
	recordedAt time.Time
}
|
|
|
|
func (m *Monitor) getNodeRRDMetrics(ctx context.Context, client PVEClientInterface, nodeName string) (rrdMemCacheEntry, error) {
|
|
if client == nil || nodeName == "" {
|
|
return rrdMemCacheEntry{}, fmt.Errorf("invalid arguments for RRD lookup")
|
|
}
|
|
|
|
now := time.Now()
|
|
|
|
m.rrdCacheMu.RLock()
|
|
if entry, ok := m.nodeRRDMemCache[nodeName]; ok && now.Sub(entry.fetchedAt) < nodeRRDCacheTTL {
|
|
m.rrdCacheMu.RUnlock()
|
|
return entry, nil
|
|
}
|
|
m.rrdCacheMu.RUnlock()
|
|
|
|
requestCtx, cancel := context.WithTimeout(ctx, nodeRRDRequestTimeout)
|
|
defer cancel()
|
|
|
|
points, err := client.GetNodeRRDData(requestCtx, nodeName, "hour", "AVERAGE", []string{"memavailable", "memused", "memtotal", "netin", "netout"})
|
|
if err != nil {
|
|
return rrdMemCacheEntry{}, err
|
|
}
|
|
|
|
var memAvailable uint64
|
|
var memUsed uint64
|
|
var memTotal uint64
|
|
var netIn float64
|
|
var netOut float64
|
|
var hasNetIn bool
|
|
var hasNetOut bool
|
|
|
|
for i := len(points) - 1; i >= 0; i-- {
|
|
point := points[i]
|
|
|
|
if memTotal == 0 && point.MemTotal != nil && !math.IsNaN(*point.MemTotal) && *point.MemTotal > 0 {
|
|
memTotal = uint64(math.Round(*point.MemTotal))
|
|
}
|
|
|
|
if memAvailable == 0 && point.MemAvailable != nil && !math.IsNaN(*point.MemAvailable) && *point.MemAvailable > 0 {
|
|
memAvailable = uint64(math.Round(*point.MemAvailable))
|
|
}
|
|
|
|
if memUsed == 0 && point.MemUsed != nil && !math.IsNaN(*point.MemUsed) && *point.MemUsed > 0 {
|
|
memUsed = uint64(math.Round(*point.MemUsed))
|
|
}
|
|
|
|
if !hasNetIn && point.NetIn != nil && !math.IsNaN(*point.NetIn) {
|
|
netIn = *point.NetIn
|
|
hasNetIn = true
|
|
}
|
|
if !hasNetOut && point.NetOut != nil && !math.IsNaN(*point.NetOut) {
|
|
netOut = *point.NetOut
|
|
hasNetOut = true
|
|
}
|
|
}
|
|
|
|
if memTotal > 0 {
|
|
if memAvailable > memTotal {
|
|
memAvailable = memTotal
|
|
}
|
|
if memUsed > memTotal {
|
|
memUsed = memTotal
|
|
}
|
|
}
|
|
|
|
if memAvailable == 0 && memUsed == 0 && !hasNetIn && !hasNetOut {
|
|
return rrdMemCacheEntry{}, fmt.Errorf("rrd node metrics not present")
|
|
}
|
|
|
|
entry := rrdMemCacheEntry{
|
|
available: memAvailable,
|
|
used: memUsed,
|
|
total: memTotal,
|
|
netIn: netIn,
|
|
netOut: netOut,
|
|
hasNetIn: hasNetIn,
|
|
hasNetOut: hasNetOut,
|
|
fetchedAt: now,
|
|
}
|
|
|
|
m.rrdCacheMu.Lock()
|
|
m.nodeRRDMemCache[nodeName] = entry
|
|
m.rrdCacheMu.Unlock()
|
|
|
|
return entry, nil
|
|
}
|
|
|
|
// getVMRRDMetrics fetches Proxmox RRD memavailable for a single VM with a
|
|
// short-lived cache to avoid a live API call on every poll for VMs that
|
|
// consistently lack guest-agent memory data (e.g. Windows VMs).
|
|
func (m *Monitor) getVMRRDMetrics(ctx context.Context, client PVEClientInterface, instanceName, node string, vmid int) (uint64, error) {
|
|
if client == nil || node == "" || vmid <= 0 {
|
|
return 0, fmt.Errorf("invalid arguments for VM RRD lookup")
|
|
}
|
|
|
|
cacheKey := guestMemoryCacheKey(instanceName, node, vmid)
|
|
now := time.Now()
|
|
|
|
m.rrdCacheMu.RLock()
|
|
if entry, ok := m.vmRRDMemCache[cacheKey]; ok && now.Sub(entry.fetchedAt) < nodeRRDCacheTTL {
|
|
m.rrdCacheMu.RUnlock()
|
|
return entry.available, nil
|
|
}
|
|
m.rrdCacheMu.RUnlock()
|
|
|
|
requestCtx, cancel := context.WithTimeout(ctx, nodeRRDRequestTimeout)
|
|
defer cancel()
|
|
|
|
points, err := client.GetVMRRDData(requestCtx, node, vmid, "hour", "AVERAGE", []string{"memavailable"})
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
if len(points) == 0 {
|
|
return 0, fmt.Errorf("no RRD points for VM %s/%d", node, vmid)
|
|
}
|
|
|
|
var memAvailable uint64
|
|
for i := len(points) - 1; i >= 0; i-- {
|
|
p := points[i]
|
|
if p.MemAvailable != nil && !math.IsNaN(*p.MemAvailable) && *p.MemAvailable > 0 {
|
|
memAvailable = uint64(math.Round(*p.MemAvailable))
|
|
break
|
|
}
|
|
}
|
|
if memAvailable == 0 {
|
|
return 0, fmt.Errorf("rrd memavailable not present for VM %s/%d", node, vmid)
|
|
}
|
|
|
|
entry := rrdMemCacheEntry{available: memAvailable, fetchedAt: now}
|
|
m.rrdCacheMu.Lock()
|
|
if m.vmRRDMemCache == nil {
|
|
m.vmRRDMemCache = make(map[string]rrdMemCacheEntry)
|
|
}
|
|
m.vmRRDMemCache[cacheKey] = entry
|
|
m.rrdCacheMu.Unlock()
|
|
|
|
return memAvailable, nil
|
|
}
|
|
|
|
// RemoveDockerHost removes a docker host from the shared state and clears related alerts.
|
|
func (m *Monitor) GetConnectionStatuses() map[string]bool {
|
|
if m == nil {
|
|
return map[string]bool{}
|
|
}
|
|
|
|
if mock.IsMockEnabled() {
|
|
statuses := make(map[string]bool)
|
|
state := mock.CurrentFixtureGraph().State
|
|
for _, node := range state.Nodes {
|
|
key := "pve-" + node.Name
|
|
statuses[key] = strings.ToLower(node.Status) == "online"
|
|
if node.Host != "" {
|
|
statuses[node.Host] = strings.ToLower(node.Status) == "online"
|
|
}
|
|
}
|
|
for _, pbsInst := range state.PBSInstances {
|
|
key := "pbs-" + pbsInst.Name
|
|
statuses[key] = strings.ToLower(pbsInst.Status) != "offline"
|
|
if pbsInst.Host != "" {
|
|
statuses[pbsInst.Host] = strings.ToLower(pbsInst.Status) != "offline"
|
|
}
|
|
}
|
|
for _, pmgInst := range state.PMGInstances {
|
|
key := "pmg-" + pmgInst.Name
|
|
statuses[key] = strings.ToLower(pmgInst.Status) != "offline"
|
|
if pmgInst.Host != "" {
|
|
statuses[pmgInst.Host] = strings.ToLower(pmgInst.Status) != "offline"
|
|
}
|
|
}
|
|
|
|
for _, dockerHost := range state.DockerHosts {
|
|
key := dockerConnectionPrefix + dockerHost.ID
|
|
statuses[key] = strings.ToLower(dockerHost.Status) == "online"
|
|
}
|
|
return statuses
|
|
}
|
|
|
|
statuses := make(map[string]bool)
|
|
for _, provider := range m.pollProviderSnapshotWithBuiltins() {
|
|
for key, connected := range m.providerConnectionStatuses(provider) {
|
|
if strings.TrimSpace(key) == "" {
|
|
continue
|
|
}
|
|
statuses[key] = connected
|
|
}
|
|
}
|
|
return statuses
|
|
}
|
|
|
|
// checkContainerizedTempMonitoring logs a security warning if Pulse is running
|
|
// in a container with SSH-based temperature monitoring enabled
|
|
func checkContainerizedTempMonitoring() {
|
|
// Check if running in container
|
|
isContainer := os.Getenv("PULSE_DOCKER") == "true" || system.InContainer()
|
|
if !isContainer {
|
|
return
|
|
}
|
|
|
|
// Check if SSH keys exist (indicates temperature monitoring is configured)
|
|
homeDir := os.Getenv("HOME")
|
|
if homeDir == "" {
|
|
homeDir = "/home/pulse"
|
|
}
|
|
sshKeyPath := homeDir + "/.ssh/id_ed25519"
|
|
if _, err := os.Stat(sshKeyPath); err != nil {
|
|
// No SSH key found, temperature monitoring not configured
|
|
return
|
|
}
|
|
|
|
// Log warning
|
|
log.Warn().
|
|
Msg("SECURITY NOTICE: Pulse is running in a container with SSH-based temperature monitoring enabled. " +
|
|
"SSH private keys are stored inside the container, which could be a security risk if the container is compromised. " +
|
|
"Future versions will use agent-based architecture for better security. " +
|
|
"See documentation for hardening recommendations.")
|
|
}
|
|
|
|
// New creates a new Monitor instance
|
|
func New(cfg *config.Config) (*Monitor, error) {
|
|
if cfg == nil {
|
|
return nil, fmt.Errorf("config cannot be nil")
|
|
}
|
|
|
|
// Initialize temperature collector with sensors SSH key
|
|
// Will use root user for now - can be made configurable later
|
|
homeDir := os.Getenv("HOME")
|
|
if homeDir == "" {
|
|
homeDir = "/home/pulse"
|
|
}
|
|
sshKeyPath := filepath.Join(homeDir, ".ssh/id_ed25519_sensors")
|
|
tempCollector := NewTemperatureCollectorWithPort("root", sshKeyPath, cfg.SSHPort)
|
|
|
|
// Security warning if running in container with SSH temperature monitoring
|
|
checkContainerizedTempMonitoring()
|
|
|
|
stalenessTracker := NewStalenessTracker(getPollMetrics())
|
|
stalenessTracker.SetBounds(cfg.AdaptivePollingBaseInterval, cfg.AdaptivePollingMaxInterval)
|
|
taskQueue := NewTaskQueue()
|
|
deadLetterQueue := NewTaskQueue()
|
|
breakers := make(map[string]*circuitBreaker)
|
|
failureCounts := make(map[string]int)
|
|
lastOutcome := make(map[string]taskOutcome)
|
|
backoff := backoffConfig{
|
|
Initial: 5 * time.Second,
|
|
Multiplier: 2,
|
|
Jitter: 0.2,
|
|
Max: 5 * time.Minute,
|
|
}
|
|
|
|
if cfg.AdaptivePollingEnabled && cfg.AdaptivePollingMaxInterval > 0 && cfg.AdaptivePollingMaxInterval <= 15*time.Second {
|
|
backoff.Initial = 750 * time.Millisecond
|
|
backoff.Max = 6 * time.Second
|
|
}
|
|
|
|
var scheduler *AdaptiveScheduler
|
|
if cfg.AdaptivePollingEnabled {
|
|
scheduler = NewAdaptiveScheduler(SchedulerConfig{
|
|
BaseInterval: cfg.AdaptivePollingBaseInterval,
|
|
MinInterval: cfg.AdaptivePollingMinInterval,
|
|
MaxInterval: cfg.AdaptivePollingMaxInterval,
|
|
}, stalenessTracker, nil, nil)
|
|
}
|
|
|
|
minRefresh := cfg.GuestMetadataMinRefreshInterval
|
|
if minRefresh <= 0 {
|
|
minRefresh = config.DefaultGuestMetadataMinRefresh
|
|
}
|
|
jitter := cfg.GuestMetadataRefreshJitter
|
|
if jitter < 0 {
|
|
jitter = 0
|
|
}
|
|
retryBackoff := cfg.GuestMetadataRetryBackoff
|
|
if retryBackoff <= 0 {
|
|
retryBackoff = config.DefaultGuestMetadataRetryBackoff
|
|
}
|
|
concurrency := cfg.GuestMetadataMaxConcurrent
|
|
if concurrency <= 0 {
|
|
concurrency = config.DefaultGuestMetadataMaxConcurrent
|
|
}
|
|
holdDuration := defaultGuestMetadataHold
|
|
|
|
// Load guest agent timeout configuration from environment variables (refs #592)
|
|
guestAgentFSInfoTimeout := parsePositiveDurationEnv("GUEST_AGENT_FSINFO_TIMEOUT", defaultGuestAgentFSInfoTimeout)
|
|
guestAgentNetworkTimeout := parsePositiveDurationEnv("GUEST_AGENT_NETWORK_TIMEOUT", defaultGuestAgentNetworkTimeout)
|
|
guestAgentOSInfoTimeout := parsePositiveDurationEnv("GUEST_AGENT_OSINFO_TIMEOUT", defaultGuestAgentOSInfoTimeout)
|
|
guestAgentVersionTimeout := parsePositiveDurationEnv("GUEST_AGENT_VERSION_TIMEOUT", defaultGuestAgentVersionTimeout)
|
|
guestAgentRetries := parseNonNegativeIntEnv("GUEST_AGENT_RETRIES", defaultGuestAgentRetries)
|
|
|
|
// Initialize persistent metrics store (SQLite) with configurable retention
|
|
var metricsStore *metrics.Store
|
|
metricsStoreConfig := metrics.DefaultConfig(cfg.DataPath)
|
|
// Override retention settings from config (allows tier-based pricing in future)
|
|
if cfg.MetricsRetentionRawHours > 0 {
|
|
metricsStoreConfig.RetentionRaw = time.Duration(cfg.MetricsRetentionRawHours) * time.Hour
|
|
}
|
|
if cfg.MetricsRetentionMinuteHours > 0 {
|
|
metricsStoreConfig.RetentionMinute = time.Duration(cfg.MetricsRetentionMinuteHours) * time.Hour
|
|
}
|
|
if cfg.MetricsRetentionHourlyDays > 0 {
|
|
metricsStoreConfig.RetentionHourly = time.Duration(cfg.MetricsRetentionHourlyDays) * 24 * time.Hour
|
|
}
|
|
if cfg.MetricsRetentionDailyDays > 0 {
|
|
metricsStoreConfig.RetentionDaily = time.Duration(cfg.MetricsRetentionDailyDays) * 24 * time.Hour
|
|
}
|
|
|
|
// In mock mode, extend hourly/daily retention to 90 days to match the
|
|
// seeded data range (seeds write directly to hourly+daily tiers).
|
|
// Raw and minute tiers keep production defaults — seeded data doesn't
|
|
// use them, and live mock ticks at 2s intervals would bloat the DB
|
|
// (the old 90-day raw retention caused metrics.db to grow to ~2 GB).
|
|
if mock.IsMockEnabled() {
|
|
metricsStoreConfig.WriteBufferSize = 2000
|
|
metricsStoreConfig.RetentionHourly = 90 * 24 * time.Hour
|
|
metricsStoreConfig.RetentionDaily = 90 * 24 * time.Hour
|
|
}
|
|
ms, err := metrics.NewStore(metricsStoreConfig)
|
|
if err != nil {
|
|
// Do not automatically delete the DB on error, as it causes data loss on transient errors (e.g. locks).
|
|
// If the DB is truly corrupted, the user should manually remove it.
|
|
log.Error().Err(err).Msg("failed to initialize persistent metrics store - continuing without metrics persistence")
|
|
} else {
|
|
if mock.IsMockEnabled() {
|
|
ms.SetMaxOpenConns(10)
|
|
}
|
|
metricsStore = ms
|
|
log.Info().
|
|
Str("path", metricsStoreConfig.DBPath).
|
|
Dur("retentionRaw", metricsStoreConfig.RetentionRaw).
|
|
Dur("retentionMinute", metricsStoreConfig.RetentionMinute).
|
|
Dur("retentionHourly", metricsStoreConfig.RetentionHourly).
|
|
Dur("retentionDaily", metricsStoreConfig.RetentionDaily).
|
|
Msg("Persistent metrics store initialized with configurable retention")
|
|
}
|
|
|
|
incidentStore := memory.NewIncidentStore(memory.IncidentStoreConfig{
|
|
DataDir: cfg.DataPath,
|
|
})
|
|
|
|
m := &Monitor{
|
|
config: cfg,
|
|
state: models.NewState(),
|
|
pveClients: make(map[string]PVEClientInterface),
|
|
pbsClients: make(map[string]*pbs.Client),
|
|
pmgClients: make(map[string]*pmg.Client),
|
|
pollProviders: make(map[InstanceType]PollProvider),
|
|
pollMetrics: getPollMetrics(),
|
|
scheduler: scheduler,
|
|
stalenessTracker: stalenessTracker,
|
|
taskQueue: taskQueue,
|
|
pollTimeout: derivePollTimeout(cfg),
|
|
deadLetterQueue: deadLetterQueue,
|
|
circuitBreakers: breakers,
|
|
failureCounts: failureCounts,
|
|
lastOutcome: lastOutcome,
|
|
backoffCfg: backoff,
|
|
rng: rand.New(rand.NewSource(time.Now().UnixNano())),
|
|
maxRetryAttempts: 5,
|
|
tempCollector: tempCollector,
|
|
guestMetadataStore: config.NewGuestMetadataStore(cfg.DataPath, nil),
|
|
dockerMetadataStore: config.NewDockerMetadataStore(cfg.DataPath, nil),
|
|
hostMetadataStore: config.NewHostMetadataStore(cfg.DataPath, nil),
|
|
startTime: time.Now(),
|
|
rateTracker: NewRateTracker(),
|
|
metricsHistory: NewMetricsHistory(1000, 24*time.Hour), // Keep up to 1000 points (~8h @ 30s)
|
|
metricsStore: metricsStore, // Persistent SQLite storage
|
|
alertManager: alerts.NewManagerWithDataDir(cfg.DataPath),
|
|
incidentStore: incidentStore,
|
|
notificationMgr: notifications.NewNotificationManagerWithDataDir(cfg.PublicURL, cfg.DataPath),
|
|
configPersist: config.NewConfigPersistence(cfg.DataPath),
|
|
discoveryService: nil, // Will be initialized in Start()
|
|
authFailures: make(map[string]int),
|
|
lastAuthAttempt: make(map[string]time.Time),
|
|
lastClusterCheck: make(map[string]time.Time),
|
|
lastPhysicalDiskPoll: make(map[string]time.Time),
|
|
lastPVEBackupPoll: make(map[string]time.Time),
|
|
lastPBSBackupPoll: make(map[string]time.Time),
|
|
backupPermissionWarnings: make(map[string]string),
|
|
persistence: config.NewConfigPersistence(cfg.DataPath),
|
|
pbsBackupPollers: make(map[string]bool),
|
|
pbsBackupCacheTime: make(map[string]map[pbsBackupGroupKey]time.Time),
|
|
nodeSnapshots: make(map[string]NodeMemorySnapshot),
|
|
guestSnapshots: make(map[string]GuestMemorySnapshot),
|
|
nodeRRDMemCache: make(map[string]rrdMemCacheEntry),
|
|
vmRRDMemCache: make(map[string]rrdMemCacheEntry),
|
|
vmAgentMemCache: make(map[string]agentMemCacheEntry),
|
|
removedDockerHosts: make(map[string]time.Time),
|
|
dockerTokenBindings: make(map[string]string),
|
|
removedKubernetesClusters: make(map[string]time.Time),
|
|
kubernetesTokenBindings: make(map[string]string),
|
|
removedHostAgents: make(map[string]time.Time),
|
|
hostTokenBindings: make(map[string]string),
|
|
clusterSensorsCache: make(map[string]clusterSensorsCacheEntry),
|
|
dockerCommands: make(map[string]*dockerHostCommand),
|
|
dockerCommandIndex: make(map[string]string),
|
|
guestMetadataCache: make(map[string]guestMetadataCacheEntry),
|
|
guestMetadataLimiter: make(map[string]time.Time),
|
|
guestMetadataMinRefresh: minRefresh,
|
|
guestMetadataRefreshJitter: jitter,
|
|
guestMetadataRetryBackoff: retryBackoff,
|
|
guestMetadataHoldDuration: holdDuration,
|
|
guestAgentFSInfoTimeout: guestAgentFSInfoTimeout,
|
|
guestAgentNetworkTimeout: guestAgentNetworkTimeout,
|
|
guestAgentOSInfoTimeout: guestAgentOSInfoTimeout,
|
|
guestAgentVersionTimeout: guestAgentVersionTimeout,
|
|
guestAgentRetries: guestAgentRetries,
|
|
instanceInfoCache: make(map[string]*instanceInfo),
|
|
pollStatusMap: make(map[string]*pollStatus),
|
|
dlqInsightMap: make(map[string]*dlqInsight),
|
|
nodeLastOnline: make(map[string]time.Time),
|
|
nodePendingUpdatesCache: make(map[string]pendingUpdatesCache),
|
|
supplementalProviders: make(map[unifiedresources.DataSource]MonitorSupplementalRecordsProvider),
|
|
}
|
|
|
|
m.breakerBaseRetry = 5 * time.Second
|
|
m.breakerMaxDelay = 5 * time.Minute
|
|
m.breakerHalfOpenWindow = 30 * time.Second
|
|
|
|
if cfg.AdaptivePollingEnabled && cfg.AdaptivePollingMaxInterval > 0 && cfg.AdaptivePollingMaxInterval <= 15*time.Second {
|
|
m.breakerBaseRetry = 2 * time.Second
|
|
m.breakerMaxDelay = 10 * time.Second
|
|
m.breakerHalfOpenWindow = 2 * time.Second
|
|
}
|
|
|
|
m.executor = newRealExecutor(m)
|
|
m.registerBuiltInPollProviders()
|
|
m.buildInstanceInfoCache(cfg)
|
|
|
|
// Initialize state with config values
|
|
m.state.TemperatureMonitoringEnabled = cfg.TemperatureMonitoringEnabled
|
|
|
|
if m.pollMetrics != nil {
|
|
m.pollMetrics.ResetQueueDepth(0)
|
|
}
|
|
|
|
// Load saved configurations
|
|
if alertConfig, err := m.configPersist.LoadAlertConfig(); err == nil {
|
|
m.alertManager.UpdateConfig(*alertConfig)
|
|
// Apply schedule settings to notification manager
|
|
m.notificationMgr.SetEnabled(alertConfig.Enabled && alertConfig.ActivationState == alerts.ActivationActive)
|
|
m.notificationMgr.SetCooldown(alertConfig.Schedule.Cooldown)
|
|
m.notificationMgr.SetGroupingWindow(alertConfig.Schedule.Grouping.Window)
|
|
m.notificationMgr.SetGroupingOptions(
|
|
alertConfig.Schedule.Grouping.ByNode,
|
|
alertConfig.Schedule.Grouping.ByGuest,
|
|
)
|
|
m.notificationMgr.SetNotifyOnResolve(alertConfig.Schedule.NotifyOnResolve)
|
|
} else {
|
|
log.Warn().Err(err).Msg("failed to load alert configuration")
|
|
}
|
|
|
|
if emailConfig, err := m.configPersist.LoadEmailConfig(); err == nil {
|
|
m.notificationMgr.SetEmailConfig(*emailConfig)
|
|
} else {
|
|
log.Warn().Err(err).Msg("failed to load email configuration")
|
|
}
|
|
|
|
if concurrency > 0 {
|
|
m.guestMetadataSlots = make(chan struct{}, concurrency)
|
|
}
|
|
|
|
if appriseConfig, err := m.configPersist.LoadAppriseConfig(); err == nil {
|
|
m.notificationMgr.SetAppriseConfig(*appriseConfig)
|
|
} else {
|
|
log.Warn().Err(err).Msg("failed to load Apprise configuration")
|
|
}
|
|
|
|
// Migrate webhooks if needed (from unencrypted to encrypted)
|
|
if err := m.configPersist.MigrateWebhooksIfNeeded(); err != nil {
|
|
log.Warn().Err(err).Msg("failed to migrate webhooks")
|
|
}
|
|
|
|
if webhooks, err := m.configPersist.LoadWebhooks(); err == nil {
|
|
for _, webhook := range webhooks {
|
|
m.notificationMgr.AddWebhook(webhook)
|
|
}
|
|
} else {
|
|
log.Warn().Err(err).Msg("failed to load webhook configuration")
|
|
}
|
|
|
|
// In mock mode the canonical sampler owns demo chart history by default.
|
|
// Support-only hybrid runs can opt back into real client initialization.
|
|
mockEnabled := mock.IsMockEnabled()
|
|
if mockEnabled && !keepRealPollingInMockMode() {
|
|
log.Info().Msg("mock mode enabled - real client initialization disabled")
|
|
} else {
|
|
m.initPVEClients(cfg)
|
|
m.initPBSClients(cfg)
|
|
m.initPMGClients(cfg)
|
|
}
|
|
|
|
// Initialize state stats
|
|
m.state.Stats = models.Stats{
|
|
StartTime: m.startTime,
|
|
Version: "2.0.0-go",
|
|
}
|
|
|
|
return m, nil
|
|
}
|
|
|
|
// SetExecutor allows tests to override the poll executor; passing nil restores the default executor.
|
|
func (m *Monitor) SetExecutor(exec PollExecutor) {
|
|
if m == nil {
|
|
return
|
|
}
|
|
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
if exec == nil {
|
|
m.executor = newRealExecutor(m)
|
|
return
|
|
}
|
|
|
|
m.executor = exec
|
|
}
|
|
|
|
func (m *Monitor) buildInstanceInfoCache(cfg *config.Config) {
|
|
_ = cfg
|
|
m.refreshInstanceInfoCacheFromProviders()
|
|
}
|
|
|
|
func (m *Monitor) getExecutor() PollExecutor {
|
|
m.mu.RLock()
|
|
exec := m.executor
|
|
m.mu.RUnlock()
|
|
return exec
|
|
}
|
|
|
|
// clampInterval limits value to the range [lower, upper].
//
// A non-positive value always yields lower. A bound that is itself
// non-positive is treated as "no bound" on that side.
func clampInterval(value, lower, upper time.Duration) time.Duration {
	switch {
	case value <= 0:
		return lower
	case lower > 0 && value < lower:
		return lower
	case upper > 0 && value > upper:
		return upper
	default:
		return value
	}
}
|
|
|
|
func (m *Monitor) effectivePVEPollingInterval() time.Duration {
|
|
const minInterval = 10 * time.Second
|
|
const maxInterval = time.Hour
|
|
|
|
interval := minInterval
|
|
if m != nil && m.config != nil && m.config.PVEPollingInterval > 0 {
|
|
interval = m.config.PVEPollingInterval
|
|
}
|
|
if interval < minInterval {
|
|
interval = minInterval
|
|
}
|
|
if interval > maxInterval {
|
|
interval = maxInterval
|
|
}
|
|
return interval
|
|
}
|
|
|
|
func (m *Monitor) baseIntervalForInstanceType(instanceType InstanceType) time.Duration {
|
|
if provider := m.getPollProvider(instanceType); provider != nil {
|
|
if interval := provider.BaseInterval(m); interval > 0 {
|
|
return interval
|
|
}
|
|
}
|
|
|
|
if m == nil || m.config == nil {
|
|
return DefaultSchedulerConfig().BaseInterval
|
|
}
|
|
|
|
switch instanceType {
|
|
case InstanceTypePVE:
|
|
return m.effectivePVEPollingInterval()
|
|
case InstanceTypePBS:
|
|
return clampInterval(m.config.PBSPollingInterval, 10*time.Second, time.Hour)
|
|
case InstanceTypePMG:
|
|
return clampInterval(m.config.PMGPollingInterval, 10*time.Second, time.Hour)
|
|
default:
|
|
base := m.config.AdaptivePollingBaseInterval
|
|
if base <= 0 {
|
|
base = DefaultSchedulerConfig().BaseInterval
|
|
}
|
|
return clampInterval(base, time.Second, 0)
|
|
}
|
|
}
|
|
|
|
// Start begins the monitoring loop
|
|
func (m *Monitor) Start(ctx context.Context, wsHub *websocket.Hub) {
|
|
// Consolidate any duplicate cluster instances before starting
|
|
// This fixes the case where multiple agents registered from the same cluster
|
|
m.consolidateDuplicateClusters()
|
|
|
|
pollingInterval := m.effectivePVEPollingInterval()
|
|
log.Info().
|
|
Dur("pollingInterval", pollingInterval).
|
|
Msg("Starting monitoring loop")
|
|
|
|
m.mu.Lock()
|
|
m.runtimeCtx = ctx
|
|
m.wsHub = wsHub
|
|
m.mu.Unlock()
|
|
defer m.stopMockMetricsSampler()
|
|
|
|
// Best-effort startup cleanup: when direct PBS is configured, remove legacy
|
|
// PVE-proxied PBS backup points to prevent duplicate recovery entries.
|
|
m.purgeStalePVEPBSBackupsBestEffort(ctx)
|
|
|
|
if mock.IsMockEnabled() {
|
|
m.startMockMetricsSampler(ctx)
|
|
}
|
|
|
|
// Initialize and start discovery service if enabled
|
|
if mock.IsMockEnabled() {
|
|
log.Info().Msg("mock mode enabled - skipping discovery service")
|
|
m.discoveryService = nil
|
|
} else if m.config.DiscoveryEnabled {
|
|
discoverySubnet := m.config.DiscoverySubnet
|
|
if discoverySubnet == "" {
|
|
discoverySubnet = "auto"
|
|
}
|
|
cfgProvider := func() config.DiscoveryConfig {
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
if m.config == nil {
|
|
return config.DefaultDiscoveryConfig()
|
|
}
|
|
cfg := config.CloneDiscoveryConfig(m.config.Discovery)
|
|
// Auto-populate IPBlocklist with configured Proxmox host IPs to avoid
|
|
// probing hosts we already know about (reduces PBS auth failure log spam)
|
|
cfg.IPBlocklist = m.getConfiguredHostIPs()
|
|
return cfg
|
|
}
|
|
m.discoveryService = discovery.NewService(wsHub, 5*time.Minute, discoverySubnet, cfgProvider)
|
|
if m.discoveryService != nil {
|
|
m.discoveryService.Start(ctx)
|
|
log.Info().Msg("discovery service initialized and started")
|
|
} else {
|
|
log.Error().Msg("failed to initialize discovery service")
|
|
}
|
|
} else {
|
|
log.Info().Msg("discovery service disabled by configuration")
|
|
m.discoveryService = nil
|
|
}
|
|
|
|
// Set up alert callbacks
|
|
m.alertManager.SetAlertCallback(func(alert *alerts.Alert) {
|
|
m.handleAlertFired(alert)
|
|
})
|
|
// Set up AI analysis callback - this bypasses activation state and other notification suppression
|
|
// so AI can analyze alerts even during pending_review setup phase
|
|
m.alertManager.SetAlertForAICallback(func(alert *alerts.Alert) {
|
|
log.Debug().Str("alertID", alert.ID).Msg("AI alert callback invoked (bypassing notification suppression)")
|
|
if m.alertTriggeredAICallback != nil {
|
|
m.alertTriggeredAICallback(alert)
|
|
}
|
|
})
|
|
m.alertManager.SetResolvedCallback(func(alertID string) {
|
|
m.handleAlertResolved(alertID)
|
|
// Don't broadcast full state here - it causes a cascade with many guests.
|
|
// The frontend will get the updated alerts through the regular broadcast ticker.
|
|
})
|
|
m.alertManager.SetAcknowledgedCallback(func(alert *alerts.Alert, user string) {
|
|
m.handleAlertAcknowledged(alert, user)
|
|
})
|
|
m.alertManager.SetUnacknowledgedCallback(func(alert *alerts.Alert, user string) {
|
|
m.handleAlertUnacknowledged(alert, user)
|
|
})
|
|
m.alertManager.SetEscalateCallback(func(alert *alerts.Alert, level int) {
|
|
log.Info().
|
|
Str("alertID", alert.ID).
|
|
Int("level", level).
|
|
Msg("Alert escalated - sending notifications")
|
|
|
|
// Get escalation config
|
|
config := m.alertManager.GetConfig()
|
|
if level <= 0 || level > len(config.Schedule.Escalation.Levels) {
|
|
return
|
|
}
|
|
|
|
escalationLevel := config.Schedule.Escalation.Levels[level-1]
|
|
|
|
// Send notifications based on escalation level
|
|
switch escalationLevel.Notify {
|
|
case "email":
|
|
// Only send email
|
|
if emailConfig := m.notificationMgr.GetEmailConfig(); emailConfig.Enabled {
|
|
m.notificationMgr.SendAlert(alert)
|
|
}
|
|
case "webhook":
|
|
// Only send webhooks
|
|
for _, webhook := range m.notificationMgr.GetWebhooks() {
|
|
if webhook.Enabled {
|
|
m.notificationMgr.SendAlert(alert)
|
|
break
|
|
}
|
|
}
|
|
case "all":
|
|
// Send all notifications
|
|
m.notificationMgr.SendAlert(alert)
|
|
}
|
|
|
|
// Update WebSocket with escalation
|
|
m.broadcastEscalatedAlert(wsHub, alert)
|
|
})
|
|
|
|
// Create separate tickers for polling and broadcasting using the configured cadence
|
|
|
|
workerCount := m.totalClientCount()
|
|
m.startTaskWorkers(ctx, workerCount)
|
|
|
|
pollTicker := time.NewTicker(pollingInterval)
|
|
defer pollTicker.Stop()
|
|
|
|
broadcastTicker := time.NewTicker(pollingInterval)
|
|
defer broadcastTicker.Stop()
|
|
|
|
keepRealPolling := keepRealPollingInMockMode()
|
|
|
|
// Start connection retry mechanism for failed clients
|
|
// This handles cases where network/Proxmox isn't ready on initial startup
|
|
if !mock.IsMockEnabled() || keepRealPolling {
|
|
go m.retryFailedConnections(ctx)
|
|
}
|
|
|
|
// Do an immediate poll on start.
|
|
if mock.IsMockEnabled() {
|
|
if keepRealPolling {
|
|
log.Info().Msg("mock mode enabled - running mock alerts and real metric polling")
|
|
go m.checkMockAlerts()
|
|
go m.poll(ctx, wsHub)
|
|
} else {
|
|
log.Info().Msg("mock mode enabled - skipping real node polling")
|
|
go m.checkMockAlerts()
|
|
}
|
|
} else {
|
|
go m.poll(ctx, wsHub)
|
|
}
|
|
|
|
for {
|
|
select {
|
|
case <-pollTicker.C:
|
|
now := time.Now()
|
|
m.evaluateDockerAgents(now)
|
|
m.evaluateKubernetesAgents(now)
|
|
m.evaluateHostAgents(now)
|
|
m.cleanupRemovedDockerHosts(now)
|
|
m.cleanupRemovedKubernetesClusters(now)
|
|
m.cleanupRemovedHostAgents(now)
|
|
m.cleanupGuestMetadataCache(now)
|
|
m.cleanupDiagnosticSnapshots(now)
|
|
m.cleanupRRDCache(now)
|
|
m.cleanupTrackingMaps(now)
|
|
m.cleanupMetricsHistory()
|
|
m.cleanupRateTracker(now)
|
|
if mock.IsMockEnabled() {
|
|
// In mock mode, keep synthetic alerts fresh
|
|
go m.checkMockAlerts()
|
|
if keepRealPolling {
|
|
// Keep real metrics flowing while mock UI mode is active.
|
|
go m.poll(ctx, wsHub)
|
|
}
|
|
} else {
|
|
// Poll real infrastructure
|
|
go m.poll(ctx, wsHub)
|
|
}
|
|
|
|
case <-broadcastTicker.C:
|
|
// Broadcast current state regardless of polling status
|
|
// Use GetState() instead of m.state.GetSnapshot() to respect mock mode
|
|
state := m.GetState()
|
|
log.Info().
|
|
Int("nodes", len(state.Nodes)).
|
|
Int("vms", len(state.VMs)).
|
|
Int("containers", len(state.Containers)).
|
|
Int("hosts", len(state.Hosts)).
|
|
Int("pbs", len(state.PBSInstances)).
|
|
Int("pbsBackups", len(state.Backups.PBS)).
|
|
Int("physicalDisks", len(state.PhysicalDisks)).
|
|
Msg("Broadcasting state update (ticker)")
|
|
frontendState := m.buildBroadcastFrontendStateFromSnapshot(state)
|
|
// Use tenant-aware broadcast method
|
|
m.broadcastState(wsHub, frontendState)
|
|
|
|
case <-ctx.Done():
|
|
log.Info().Msg("monitoring loop stopped")
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// poll fetches data from all configured instances
|
|
func (m *Monitor) poll(_ context.Context, wsHub *websocket.Hub) {
|
|
defer recoverFromPanic("poll")
|
|
|
|
// Limit concurrent polls to 2 to prevent resource exhaustion
|
|
currentCount := atomic.AddInt32(&m.activePollCount, 1)
|
|
if currentCount > 2 {
|
|
atomic.AddInt32(&m.activePollCount, -1)
|
|
if logging.IsLevelEnabled(zerolog.DebugLevel) {
|
|
log.Debug().Int32("activePolls", currentCount-1).Msg("too many concurrent polls, skipping")
|
|
}
|
|
return
|
|
}
|
|
defer atomic.AddInt32(&m.activePollCount, -1)
|
|
|
|
if logging.IsLevelEnabled(zerolog.DebugLevel) {
|
|
log.Debug().Msg("starting polling cycle")
|
|
}
|
|
startTime := time.Now()
|
|
now := startTime
|
|
|
|
plannedTasks := m.buildScheduledTasks(now)
|
|
for _, task := range plannedTasks {
|
|
m.taskQueue.Upsert(task)
|
|
}
|
|
m.updateQueueDepthMetric()
|
|
|
|
// Update performance metrics atomically to prevent data races when
|
|
// multiple poll() goroutines run concurrently (e.g. mock mode transitions).
|
|
wsClients := 0
|
|
if wsHub != nil {
|
|
wsClients = wsHub.GetClientCount()
|
|
}
|
|
m.state.UpdatePollStats(
|
|
time.Since(startTime).Seconds(),
|
|
int64(time.Since(m.startTime).Seconds()),
|
|
wsClients,
|
|
)
|
|
|
|
// Sync alert state so broadcasts include the latest acknowledgement data
|
|
m.syncAlertsToState()
|
|
|
|
// Increment poll counter
|
|
m.mu.Lock()
|
|
m.pollCounter++
|
|
m.mu.Unlock()
|
|
|
|
if logging.IsLevelEnabled(zerolog.DebugLevel) {
|
|
log.Debug().Dur("duration", time.Since(startTime)).Msg("polling cycle completed")
|
|
}
|
|
|
|
// Broadcasting is now handled by the timer in Start()
|
|
}
|
|
|
|
func (m *Monitor) startTaskWorkers(ctx context.Context, workers int) {
|
|
if m.taskQueue == nil {
|
|
return
|
|
}
|
|
if workers < 1 {
|
|
workers = 1
|
|
}
|
|
if workers > 10 {
|
|
workers = 10
|
|
}
|
|
for i := 0; i < workers; i++ {
|
|
go m.taskWorker(ctx, i)
|
|
}
|
|
}
|
|
|
|
func (m *Monitor) taskWorker(ctx context.Context, id int) {
|
|
defer recoverFromPanic(fmt.Sprintf("taskWorker-%d", id))
|
|
|
|
if logging.IsLevelEnabled(zerolog.DebugLevel) {
|
|
log.Debug().Int("worker", id).Msg("task worker started")
|
|
}
|
|
for {
|
|
task, ok := m.taskQueue.WaitNext(ctx)
|
|
if !ok {
|
|
if logging.IsLevelEnabled(zerolog.DebugLevel) {
|
|
log.Debug().Int("worker", id).Msg("task worker stopping")
|
|
}
|
|
return
|
|
}
|
|
|
|
m.executeScheduledTask(ctx, task)
|
|
|
|
m.rescheduleTask(task)
|
|
m.updateQueueDepthMetric()
|
|
}
|
|
}
|
|
|
|
func derivePollTimeout(cfg *config.Config) time.Duration {
|
|
timeout := defaultTaskTimeout
|
|
if cfg != nil && cfg.ConnectionTimeout > 0 {
|
|
timeout = cfg.ConnectionTimeout * 2
|
|
}
|
|
if timeout < minTaskTimeout {
|
|
timeout = minTaskTimeout
|
|
}
|
|
// Use configurable max timeout from config (set via MAX_POLL_TIMEOUT env var)
|
|
// Falls back to hardcoded maxTaskTimeout if config is nil or MaxPollTimeout not set
|
|
maxTimeout := maxTaskTimeout
|
|
if cfg != nil && cfg.MaxPollTimeout > 0 {
|
|
maxTimeout = cfg.MaxPollTimeout
|
|
}
|
|
if timeout > maxTimeout {
|
|
timeout = maxTimeout
|
|
}
|
|
return timeout
|
|
}
|
|
|
|
func (m *Monitor) taskExecutionTimeout(_ InstanceType) time.Duration {
|
|
if m == nil {
|
|
return defaultTaskTimeout
|
|
}
|
|
timeout := m.pollTimeout
|
|
if timeout <= 0 {
|
|
timeout = defaultTaskTimeout
|
|
}
|
|
return timeout
|
|
}
|
|
|
|
func (m *Monitor) executeScheduledTask(ctx context.Context, task ScheduledTask) {
|
|
if !m.allowExecution(task) {
|
|
if logging.IsLevelEnabled(zerolog.DebugLevel) {
|
|
log.Debug().
|
|
Str("instance", task.InstanceName).
|
|
Str("type", string(task.InstanceType)).
|
|
Msg("Task blocked by circuit breaker")
|
|
}
|
|
return
|
|
}
|
|
|
|
if m.pollMetrics != nil {
|
|
wait := time.Duration(0)
|
|
if !task.NextRun.IsZero() {
|
|
wait = time.Since(task.NextRun)
|
|
if wait < 0 {
|
|
wait = 0
|
|
}
|
|
}
|
|
instanceType := string(task.InstanceType)
|
|
if strings.TrimSpace(instanceType) == "" {
|
|
instanceType = "unknown"
|
|
}
|
|
m.pollMetrics.RecordQueueWait(instanceType, wait)
|
|
}
|
|
|
|
executor := m.getExecutor()
|
|
if executor == nil {
|
|
log.Error().
|
|
Str("instance", task.InstanceName).
|
|
Str("type", string(task.InstanceType)).
|
|
Msg("No poll executor configured; skipping task")
|
|
return
|
|
}
|
|
|
|
pollTask, ok := m.buildPollTask(task)
|
|
if !ok {
|
|
return
|
|
}
|
|
|
|
taskCtx := ctx
|
|
var cancel context.CancelFunc
|
|
timeout := m.taskExecutionTimeout(task.InstanceType)
|
|
if timeout > 0 {
|
|
taskCtx, cancel = context.WithTimeout(ctx, timeout)
|
|
defer cancel()
|
|
}
|
|
|
|
executor.Execute(taskCtx, pollTask)
|
|
|
|
if timeout > 0 && stderrors.Is(taskCtx.Err(), context.DeadlineExceeded) {
|
|
log.Warn().
|
|
Str("instance", task.InstanceName).
|
|
Str("type", string(task.InstanceType)).
|
|
Dur("timeout", timeout).
|
|
Msg("Polling task timed out; rescheduling with fresh worker")
|
|
}
|
|
}
|
|
|
|
func (m *Monitor) buildPollTask(task ScheduledTask) (PollTask, bool) {
|
|
provider := m.getPollProvider(task.InstanceType)
|
|
if provider == nil {
|
|
log.Debug().
|
|
Str("instance", task.InstanceName).
|
|
Str("type", string(task.InstanceType)).
|
|
Msg("Skipping unsupported task type")
|
|
return PollTask{}, false
|
|
}
|
|
|
|
pollTask, err := provider.BuildPollTask(m, task.InstanceName)
|
|
if err != nil {
|
|
log.Warn().
|
|
Str("instance", task.InstanceName).
|
|
Str("type", string(task.InstanceType)).
|
|
Err(err).
|
|
Msg("Skipping scheduled task")
|
|
return PollTask{}, false
|
|
}
|
|
|
|
if strings.TrimSpace(pollTask.InstanceName) == "" {
|
|
pollTask.InstanceName = task.InstanceName
|
|
}
|
|
if strings.TrimSpace(pollTask.InstanceType) == "" {
|
|
pollTask.InstanceType = string(task.InstanceType)
|
|
}
|
|
return pollTask, true
|
|
}
|
|
|
|
func (m *Monitor) rescheduleTask(task ScheduledTask) {
|
|
if m.taskQueue == nil {
|
|
return
|
|
}
|
|
|
|
key := schedulerKey(task.InstanceType, task.InstanceName)
|
|
m.mu.Lock()
|
|
outcome, hasOutcome := m.lastOutcome[key]
|
|
failureCount := m.failureCounts[key]
|
|
m.mu.Unlock()
|
|
|
|
if hasOutcome && !outcome.success {
|
|
if !outcome.transient || failureCount >= m.maxRetryAttempts {
|
|
m.sendToDeadLetter(task, outcome.err)
|
|
return
|
|
}
|
|
delay := m.backoffCfg.nextDelay(failureCount-1, m.randomFloat())
|
|
if delay <= 0 {
|
|
delay = 5 * time.Second
|
|
}
|
|
if m.config != nil && m.config.AdaptivePollingEnabled && m.config.AdaptivePollingMaxInterval > 0 && m.config.AdaptivePollingMaxInterval <= 15*time.Second {
|
|
maxDelay := 4 * time.Second
|
|
if delay > maxDelay {
|
|
delay = maxDelay
|
|
}
|
|
}
|
|
next := task
|
|
next.Interval = delay
|
|
next.NextRun = time.Now().Add(delay)
|
|
m.taskQueue.Upsert(next)
|
|
return
|
|
}
|
|
|
|
if m.scheduler == nil {
|
|
baseInterval := m.baseIntervalForInstanceType(task.InstanceType)
|
|
nextInterval := task.Interval
|
|
if nextInterval <= 0 {
|
|
nextInterval = baseInterval
|
|
}
|
|
if nextInterval <= 0 {
|
|
nextInterval = DefaultSchedulerConfig().BaseInterval
|
|
}
|
|
next := task
|
|
next.NextRun = time.Now().Add(nextInterval)
|
|
next.Interval = nextInterval
|
|
m.taskQueue.Upsert(next)
|
|
return
|
|
}
|
|
|
|
desc := InstanceDescriptor{
|
|
Name: task.InstanceName,
|
|
Type: task.InstanceType,
|
|
LastInterval: task.Interval,
|
|
LastScheduled: task.NextRun,
|
|
}
|
|
if m.stalenessTracker != nil {
|
|
if snap, ok := m.stalenessTracker.snapshot(task.InstanceType, task.InstanceName); ok {
|
|
desc.LastSuccess = snap.LastSuccess
|
|
desc.LastFailure = snap.LastError
|
|
if snap.ChangeHash != "" {
|
|
desc.Metadata = TaskMetadata{ChangeHash: snap.ChangeHash}
|
|
}
|
|
}
|
|
}
|
|
|
|
tasks := m.scheduler.BuildPlan(time.Now(), []InstanceDescriptor{desc}, m.taskQueue.Size())
|
|
if len(tasks) == 0 {
|
|
next := task
|
|
nextInterval := task.Interval
|
|
if nextInterval <= 0 && m.config != nil {
|
|
nextInterval = m.config.AdaptivePollingBaseInterval
|
|
}
|
|
if nextInterval <= 0 {
|
|
nextInterval = DefaultSchedulerConfig().BaseInterval
|
|
}
|
|
next.Interval = nextInterval
|
|
next.NextRun = time.Now().Add(nextInterval)
|
|
m.taskQueue.Upsert(next)
|
|
return
|
|
}
|
|
for _, next := range tasks {
|
|
m.taskQueue.Upsert(next)
|
|
}
|
|
}
|
|
|
|
func (m *Monitor) sendToDeadLetter(task ScheduledTask, err error) {
|
|
if m.deadLetterQueue == nil {
|
|
log.Error().
|
|
Str("instance", task.InstanceName).
|
|
Str("type", string(task.InstanceType)).
|
|
Err(err).
|
|
Msg("Dead-letter queue unavailable; dropping task")
|
|
return
|
|
}
|
|
|
|
log.Error().
|
|
Str("instance", task.InstanceName).
|
|
Str("type", string(task.InstanceType)).
|
|
Err(err).
|
|
Msg("Routing task to dead-letter queue after repeated failures")
|
|
|
|
next := task
|
|
next.Interval = 30 * time.Minute
|
|
next.NextRun = time.Now().Add(next.Interval)
|
|
m.deadLetterQueue.Upsert(next)
|
|
m.updateDeadLetterMetrics()
|
|
|
|
key := schedulerKey(task.InstanceType, task.InstanceName)
|
|
now := time.Now()
|
|
|
|
m.mu.Lock()
|
|
if m.dlqInsightMap == nil {
|
|
m.dlqInsightMap = make(map[string]*dlqInsight)
|
|
}
|
|
info, ok := m.dlqInsightMap[key]
|
|
if !ok {
|
|
info = &dlqInsight{}
|
|
m.dlqInsightMap[key] = info
|
|
}
|
|
if info.FirstAttempt.IsZero() {
|
|
info.FirstAttempt = now
|
|
}
|
|
info.LastAttempt = now
|
|
info.RetryCount++
|
|
info.NextRetry = next.NextRun
|
|
if err != nil {
|
|
info.Reason = classifyDLQReason(err)
|
|
}
|
|
m.mu.Unlock()
|
|
}
|
|
|
|
func classifyDLQReason(err error) string {
|
|
if err == nil {
|
|
return ""
|
|
}
|
|
if errors.IsRetryableError(err) {
|
|
return "max_retry_attempts"
|
|
}
|
|
return "permanent_failure"
|
|
}
|
|
|
|
func (m *Monitor) updateDeadLetterMetrics() {
|
|
if m.pollMetrics == nil || m.deadLetterQueue == nil {
|
|
return
|
|
}
|
|
|
|
size := m.deadLetterQueue.Size()
|
|
if size <= 0 {
|
|
m.pollMetrics.UpdateDeadLetterCounts(nil)
|
|
return
|
|
}
|
|
|
|
tasks := m.deadLetterQueue.PeekAll(size)
|
|
m.pollMetrics.UpdateDeadLetterCounts(tasks)
|
|
}
|
|
|
|
func (m *Monitor) updateBreakerMetric(instanceType InstanceType, instance string, breaker *circuitBreaker) {
|
|
if m.pollMetrics == nil || breaker == nil {
|
|
return
|
|
}
|
|
|
|
state, failures, retryAt, _, _ := breaker.stateDetails()
|
|
m.pollMetrics.SetBreakerState(string(instanceType), instance, state, failures, retryAt)
|
|
}
|
|
|
|
func (m *Monitor) randomFloat() float64 {
|
|
if m.rng == nil {
|
|
m.rng = rand.New(rand.NewSource(time.Now().UnixNano()))
|
|
}
|
|
return m.rng.Float64()
|
|
}
|
|
|
|
func (m *Monitor) updateQueueDepthMetric() {
|
|
if m.pollMetrics == nil || m.taskQueue == nil {
|
|
return
|
|
}
|
|
snapshot := m.taskQueue.Snapshot()
|
|
m.pollMetrics.SetQueueDepth(snapshot.Depth)
|
|
m.pollMetrics.UpdateQueueSnapshot(snapshot)
|
|
}
|
|
|
|
func (m *Monitor) allowExecution(task ScheduledTask) bool {
|
|
if m.circuitBreakers == nil {
|
|
return true
|
|
}
|
|
key := schedulerKey(task.InstanceType, task.InstanceName)
|
|
breaker := m.ensureBreaker(key)
|
|
allowed := breaker.allow(time.Now())
|
|
m.updateBreakerMetric(task.InstanceType, task.InstanceName, breaker)
|
|
return allowed
|
|
}
|
|
|
|
func (m *Monitor) ensureBreaker(key string) *circuitBreaker {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
if m.circuitBreakers == nil {
|
|
m.circuitBreakers = make(map[string]*circuitBreaker)
|
|
}
|
|
if breaker, ok := m.circuitBreakers[key]; ok {
|
|
return breaker
|
|
}
|
|
baseRetry := m.breakerBaseRetry
|
|
if baseRetry <= 0 {
|
|
baseRetry = 5 * time.Second
|
|
}
|
|
maxDelay := m.breakerMaxDelay
|
|
if maxDelay <= 0 {
|
|
maxDelay = 5 * time.Minute
|
|
}
|
|
halfOpen := m.breakerHalfOpenWindow
|
|
if halfOpen <= 0 {
|
|
halfOpen = 30 * time.Second
|
|
}
|
|
breaker := newCircuitBreaker(3, baseRetry, maxDelay, halfOpen)
|
|
m.circuitBreakers[key] = breaker
|
|
return breaker
|
|
}
|
|
|
|
func (m *Monitor) recordTaskResult(instanceType InstanceType, instance string, pollErr error) {
|
|
if m == nil {
|
|
return
|
|
}
|
|
|
|
key := schedulerKey(instanceType, instance)
|
|
now := time.Now()
|
|
|
|
breaker := m.ensureBreaker(key)
|
|
|
|
m.mu.Lock()
|
|
status, ok := m.pollStatusMap[key]
|
|
if !ok {
|
|
status = &pollStatus{}
|
|
m.pollStatusMap[key] = status
|
|
}
|
|
|
|
if pollErr == nil {
|
|
if m.failureCounts != nil {
|
|
m.failureCounts[key] = 0
|
|
}
|
|
if m.lastOutcome != nil {
|
|
m.lastOutcome[key] = taskOutcome{
|
|
success: true,
|
|
transient: true,
|
|
err: nil,
|
|
recordedAt: now,
|
|
}
|
|
}
|
|
status.LastSuccess = now
|
|
status.ConsecutiveFailures = 0
|
|
status.FirstFailureAt = time.Time{}
|
|
m.mu.Unlock()
|
|
if breaker != nil {
|
|
breaker.recordSuccess()
|
|
m.updateBreakerMetric(instanceType, instance, breaker)
|
|
}
|
|
return
|
|
}
|
|
|
|
transient := isTransientError(pollErr)
|
|
category := "permanent"
|
|
if transient {
|
|
category = "transient"
|
|
}
|
|
if m.failureCounts != nil {
|
|
m.failureCounts[key] = m.failureCounts[key] + 1
|
|
}
|
|
if m.lastOutcome != nil {
|
|
m.lastOutcome[key] = taskOutcome{
|
|
success: false,
|
|
transient: transient,
|
|
err: pollErr,
|
|
recordedAt: now,
|
|
}
|
|
}
|
|
status.LastErrorAt = now
|
|
status.LastErrorMessage = pollErr.Error()
|
|
status.LastErrorCategory = category
|
|
status.ConsecutiveFailures++
|
|
if status.ConsecutiveFailures == 1 {
|
|
status.FirstFailureAt = now
|
|
}
|
|
m.mu.Unlock()
|
|
if breaker != nil {
|
|
breaker.recordFailure(now)
|
|
m.updateBreakerMetric(instanceType, instance, breaker)
|
|
}
|
|
}
|
|
|
|
// SchedulerHealthResponse contains complete scheduler health data for API exposure.
|
|
type SchedulerHealthResponse struct {
|
|
UpdatedAt time.Time `json:"updatedAt"`
|
|
Enabled bool `json:"enabled"`
|
|
Queue QueueSnapshot `json:"queue"`
|
|
DeadLetter DeadLetterSnapshot `json:"deadLetter"`
|
|
Breakers []BreakerSnapshot `json:"breakers"`
|
|
Staleness []StalenessSnapshot `json:"staleness"`
|
|
Instances []InstanceHealth `json:"instances"`
|
|
}
|
|
|
|
// DeadLetterSnapshot contains dead-letter queue data.
|
|
type DeadLetterSnapshot struct {
|
|
Count int `json:"count"`
|
|
Tasks []DeadLetterTask `json:"tasks"`
|
|
}
|
|
|
|
func emptyDeadLetterSnapshot() DeadLetterSnapshot {
|
|
return DeadLetterSnapshot{
|
|
Tasks: []DeadLetterTask{},
|
|
}
|
|
}
|
|
|
|
func emptySchedulerHealthResponse(enabled bool) SchedulerHealthResponse {
|
|
return SchedulerHealthResponse{
|
|
UpdatedAt: time.Now(),
|
|
Enabled: enabled,
|
|
Queue: emptyQueueSnapshot(),
|
|
DeadLetter: emptyDeadLetterSnapshot(),
|
|
Breakers: []BreakerSnapshot{},
|
|
Staleness: []StalenessSnapshot{},
|
|
Instances: []InstanceHealth{},
|
|
}
|
|
}
|
|
|
|
// SchedulerHealth returns a complete snapshot of scheduler health for API exposure.
|
|
func (m *Monitor) SchedulerHealth() SchedulerHealthResponse {
|
|
response := emptySchedulerHealthResponse(m.config != nil && m.config.AdaptivePollingEnabled)
|
|
|
|
m.refreshInstanceInfoCacheFromProviders()
|
|
|
|
// Queue snapshot
|
|
if m.taskQueue != nil {
|
|
response.Queue = m.taskQueue.Snapshot()
|
|
if m.pollMetrics != nil {
|
|
m.pollMetrics.UpdateQueueSnapshot(response.Queue)
|
|
}
|
|
}
|
|
|
|
// Dead-letter queue snapshot
|
|
if m.deadLetterQueue != nil {
|
|
deadLetterTasks := m.deadLetterQueue.PeekAll(25) // limit to top 25
|
|
m.mu.RLock()
|
|
for i := range deadLetterTasks {
|
|
key := schedulerKey(InstanceType(deadLetterTasks[i].Type), deadLetterTasks[i].Instance)
|
|
if outcome, ok := m.lastOutcome[key]; ok && outcome.err != nil {
|
|
deadLetterTasks[i].LastError = outcome.err.Error()
|
|
}
|
|
if count, ok := m.failureCounts[key]; ok {
|
|
deadLetterTasks[i].Failures = count
|
|
}
|
|
}
|
|
m.mu.RUnlock()
|
|
response.DeadLetter = DeadLetterSnapshot{
|
|
Count: m.deadLetterQueue.Size(),
|
|
Tasks: deadLetterTasks,
|
|
}
|
|
m.updateDeadLetterMetrics()
|
|
}
|
|
|
|
// Circuit breaker snapshots
|
|
m.mu.RLock()
|
|
breakerSnapshots := make([]BreakerSnapshot, 0, len(m.circuitBreakers))
|
|
for key, breaker := range m.circuitBreakers {
|
|
state, failures, retryAt := breaker.State()
|
|
// Only include breakers that are not in default closed state with 0 failures
|
|
if state != "closed" || failures > 0 {
|
|
// Parse instance type and name from key
|
|
parts := strings.SplitN(key, "::", 2)
|
|
instanceType, instanceName := "unknown", key
|
|
if len(parts) == 2 {
|
|
instanceType, instanceName = parts[0], parts[1]
|
|
}
|
|
breakerSnapshots = append(breakerSnapshots, BreakerSnapshot{
|
|
Instance: instanceName,
|
|
Type: instanceType,
|
|
State: state,
|
|
Failures: failures,
|
|
RetryAt: retryAt,
|
|
})
|
|
}
|
|
}
|
|
m.mu.RUnlock()
|
|
response.Breakers = breakerSnapshots
|
|
|
|
// Staleness snapshots
|
|
if m.stalenessTracker != nil {
|
|
response.Staleness = m.stalenessTracker.Snapshot()
|
|
}
|
|
|
|
instanceInfos := make(map[string]*instanceInfo)
|
|
pollStatuses := make(map[string]pollStatus)
|
|
dlqInsights := make(map[string]dlqInsight)
|
|
breakerRefs := make(map[string]*circuitBreaker)
|
|
|
|
m.mu.RLock()
|
|
for k, v := range m.instanceInfoCache {
|
|
if v == nil {
|
|
continue
|
|
}
|
|
copyVal := *v
|
|
instanceInfos[k] = ©Val
|
|
}
|
|
for k, v := range m.pollStatusMap {
|
|
if v == nil {
|
|
continue
|
|
}
|
|
pollStatuses[k] = *v
|
|
}
|
|
for k, v := range m.dlqInsightMap {
|
|
if v == nil {
|
|
continue
|
|
}
|
|
dlqInsights[k] = *v
|
|
}
|
|
for k, v := range m.circuitBreakers {
|
|
if v != nil {
|
|
breakerRefs[k] = v
|
|
}
|
|
}
|
|
m.mu.RUnlock()
|
|
for key, breaker := range breakerRefs {
|
|
instanceType := InstanceType("unknown")
|
|
instanceName := key
|
|
if parts := strings.SplitN(key, "::", 2); len(parts) == 2 {
|
|
if parts[0] != "" {
|
|
instanceType = InstanceType(parts[0])
|
|
}
|
|
if parts[1] != "" {
|
|
instanceName = parts[1]
|
|
}
|
|
}
|
|
m.updateBreakerMetric(instanceType, instanceName, breaker)
|
|
}
|
|
|
|
keySet := make(map[string]struct{})
|
|
for k := range instanceInfos {
|
|
if k != "" {
|
|
keySet[k] = struct{}{}
|
|
}
|
|
}
|
|
for k := range pollStatuses {
|
|
if k != "" {
|
|
keySet[k] = struct{}{}
|
|
}
|
|
}
|
|
for k := range dlqInsights {
|
|
if k != "" {
|
|
keySet[k] = struct{}{}
|
|
}
|
|
}
|
|
for k := range breakerRefs {
|
|
if k != "" {
|
|
keySet[k] = struct{}{}
|
|
}
|
|
}
|
|
for _, task := range response.DeadLetter.Tasks {
|
|
if task.Instance == "" {
|
|
continue
|
|
}
|
|
keySet[schedulerKey(InstanceType(task.Type), task.Instance)] = struct{}{}
|
|
}
|
|
for _, snap := range response.Staleness {
|
|
if snap.Instance == "" {
|
|
continue
|
|
}
|
|
keySet[schedulerKey(InstanceType(snap.Type), snap.Instance)] = struct{}{}
|
|
}
|
|
|
|
if len(keySet) > 0 {
|
|
keys := make([]string, 0, len(keySet))
|
|
for k := range keySet {
|
|
keys = append(keys, k)
|
|
}
|
|
sort.Strings(keys)
|
|
|
|
instances := make([]InstanceHealth, 0, len(keys))
|
|
for _, key := range keys {
|
|
instType := "unknown"
|
|
instName := key
|
|
if parts := strings.SplitN(key, "::", 2); len(parts) == 2 {
|
|
if parts[0] != "" {
|
|
instType = parts[0]
|
|
}
|
|
if parts[1] != "" {
|
|
instName = parts[1]
|
|
}
|
|
}
|
|
instType = strings.TrimSpace(instType)
|
|
instName = strings.TrimSpace(instName)
|
|
|
|
info := instanceInfos[key]
|
|
display := instName
|
|
connection := ""
|
|
if info != nil {
|
|
if instType == "unknown" || instType == "" {
|
|
if info.Type != "" {
|
|
instType = string(info.Type)
|
|
}
|
|
}
|
|
if strings.Contains(info.Key, "::") {
|
|
if parts := strings.SplitN(info.Key, "::", 2); len(parts) == 2 {
|
|
if instName == key {
|
|
instName = parts[1]
|
|
}
|
|
if (instType == "" || instType == "unknown") && parts[0] != "" {
|
|
instType = parts[0]
|
|
}
|
|
}
|
|
}
|
|
if info.DisplayName != "" {
|
|
display = info.DisplayName
|
|
}
|
|
if info.Connection != "" {
|
|
connection = info.Connection
|
|
}
|
|
}
|
|
display = strings.TrimSpace(display)
|
|
connection = strings.TrimSpace(connection)
|
|
if display == "" {
|
|
display = instName
|
|
}
|
|
if display == "" {
|
|
display = connection
|
|
}
|
|
if instType == "" {
|
|
instType = "unknown"
|
|
}
|
|
if instName == "" {
|
|
instName = key
|
|
}
|
|
|
|
status, hasStatus := pollStatuses[key]
|
|
instanceStatus := InstancePollStatus{}
|
|
if hasStatus {
|
|
instanceStatus.ConsecutiveFailures = status.ConsecutiveFailures
|
|
instanceStatus.LastSuccess = timePtr(status.LastSuccess)
|
|
if !status.FirstFailureAt.IsZero() {
|
|
instanceStatus.FirstFailureAt = timePtr(status.FirstFailureAt)
|
|
}
|
|
if !status.LastErrorAt.IsZero() && status.LastErrorMessage != "" {
|
|
instanceStatus.LastError = &ErrorDetail{
|
|
At: status.LastErrorAt,
|
|
Message: status.LastErrorMessage,
|
|
Category: status.LastErrorCategory,
|
|
}
|
|
}
|
|
}
|
|
|
|
breakerInfo := InstanceBreaker{
|
|
State: "closed",
|
|
FailureCount: 0,
|
|
}
|
|
if br, ok := breakerRefs[key]; ok && br != nil {
|
|
state, failures, retryAt, since, lastTransition := br.stateDetails()
|
|
if state != "" {
|
|
breakerInfo.State = state
|
|
}
|
|
breakerInfo.FailureCount = failures
|
|
breakerInfo.RetryAt = timePtr(retryAt)
|
|
breakerInfo.Since = timePtr(since)
|
|
breakerInfo.LastTransition = timePtr(lastTransition)
|
|
}
|
|
|
|
dlqInfo := InstanceDLQ{Present: false}
|
|
if dlq, ok := dlqInsights[key]; ok {
|
|
dlqInfo.Present = true
|
|
dlqInfo.Reason = dlq.Reason
|
|
dlqInfo.FirstAttempt = timePtr(dlq.FirstAttempt)
|
|
dlqInfo.LastAttempt = timePtr(dlq.LastAttempt)
|
|
dlqInfo.RetryCount = dlq.RetryCount
|
|
dlqInfo.NextRetry = timePtr(dlq.NextRetry)
|
|
}
|
|
|
|
// Collect any warnings for this instance
|
|
var warnings []string
|
|
if instType == "pve" {
|
|
if warning, ok := m.backupPermissionWarnings[instName]; ok {
|
|
warnings = append(warnings, warning)
|
|
}
|
|
}
|
|
|
|
instances = append(instances, InstanceHealth{
|
|
Key: key,
|
|
Type: instType,
|
|
DisplayName: display,
|
|
Instance: instName,
|
|
Connection: connection,
|
|
PollStatus: instanceStatus,
|
|
Breaker: breakerInfo,
|
|
DeadLetter: dlqInfo,
|
|
Warnings: warnings,
|
|
}.NormalizeCollections())
|
|
}
|
|
|
|
response.Instances = instances
|
|
} else {
|
|
response.Instances = []InstanceHealth{}
|
|
}
|
|
|
|
return response
|
|
}
|
|
|
|
func isTransientError(err error) bool {
|
|
if err == nil {
|
|
return true
|
|
}
|
|
if errors.IsRetryableError(err) {
|
|
return true
|
|
}
|
|
if stderrors.Is(err, context.Canceled) || stderrors.Is(err, context.DeadlineExceeded) {
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
func (m *Monitor) GetState() models.StateSnapshot {
|
|
if m == nil {
|
|
return models.StateSnapshot{}
|
|
}
|
|
|
|
// Check if mock mode is enabled
|
|
if mock.IsMockEnabled() {
|
|
state := mock.CurrentFixtureGraph().State
|
|
if state.ActiveAlerts == nil && m.alertManager != nil {
|
|
// Populate snapshot lazily if the cache hasn't been filled yet.
|
|
mock.UpdateAlertSnapshots(m.alertManager.GetActiveAlerts(), m.alertManager.GetRecentlyResolved())
|
|
state = mock.CurrentFixtureGraph().State
|
|
}
|
|
return state
|
|
}
|
|
if m.state == nil {
|
|
return models.StateSnapshot{}
|
|
}
|
|
|
|
state := m.state.GetSnapshot()
|
|
// Keep externally served alert arrays aligned with the live alert manager
|
|
// even between explicit sync points, so APIs do not expose stale alert
|
|
// counts or recently resolved incidents from cached state.
|
|
state.ActiveAlerts = m.activeAlertsSnapshot()
|
|
state.RecentlyResolved = m.recentlyResolvedAlertsSnapshot()
|
|
return state
|
|
}
|
|
|
|
// ReadSnapshot returns a snapshot of the current infrastructure state,
|
|
// respecting mock mode when enabled.
|
|
//
|
|
// This is the preferred accessor for consumer code that needs the full
|
|
// StateSnapshot (e.g., chart rendering, reporting, AI state queries).
|
|
// This method satisfies models.SnapshotProvider — the single canonical
|
|
// interface that all consumer packages depend on. Fields available
|
|
// via ReadState should be accessed there instead when practical.
|
|
func (m *Monitor) ReadSnapshot() models.StateSnapshot {
|
|
return m.GetState()
|
|
}
|
|
|
|
// BackupsSnapshot returns the current backup state.
|
|
func (m *Monitor) BackupsSnapshot() models.Backups {
|
|
return m.GetState().Backups
|
|
}
|
|
|
|
// PBSInstancesSnapshot returns the current PBS instances.
|
|
func (m *Monitor) PBSInstancesSnapshot() []models.PBSInstance {
|
|
if m == nil {
|
|
return nil
|
|
}
|
|
readState := m.GetUnifiedReadStateOrSnapshot()
|
|
if readState == nil {
|
|
return nil
|
|
}
|
|
|
|
pbsViews := readState.PBSInstances()
|
|
if len(pbsViews) == 0 {
|
|
return nil
|
|
}
|
|
|
|
instances := make([]models.PBSInstance, 0, len(pbsViews))
|
|
for _, instance := range pbsViews {
|
|
if instance == nil {
|
|
continue
|
|
}
|
|
instances = append(instances, pbsInstanceFromReadStateView(instance))
|
|
}
|
|
return instances
|
|
}
|
|
|
|
// ReplicationJobsSnapshot returns the current replication jobs.
|
|
func (m *Monitor) ReplicationJobsSnapshot() []models.ReplicationJob {
|
|
return m.GetState().ReplicationJobs
|
|
}
|
|
|
|
// ConnectionHealthSnapshot returns the current connection health map.
|
|
func (m *Monitor) ConnectionHealthSnapshot() map[string]bool {
|
|
return m.GetState().ConnectionHealth
|
|
}
|
|
|
|
// HostsSnapshot returns the current hosts.
|
|
func (m *Monitor) HostsSnapshot() []models.Host {
|
|
if m == nil {
|
|
return nil
|
|
}
|
|
readState := m.GetUnifiedReadStateOrSnapshot()
|
|
if readState == nil {
|
|
return nil
|
|
}
|
|
|
|
hostViews := readState.Hosts()
|
|
if len(hostViews) == 0 {
|
|
return nil
|
|
}
|
|
|
|
hosts := make([]models.Host, 0, len(hostViews))
|
|
for _, host := range hostViews {
|
|
if host == nil {
|
|
continue
|
|
}
|
|
hosts = append(hosts, hostFromReadStateView(host))
|
|
}
|
|
return hosts
|
|
}
|
|
|
|
// VMsSnapshot returns the current VMs.
|
|
func (m *Monitor) VMsSnapshot() []models.VM {
|
|
if m == nil {
|
|
return nil
|
|
}
|
|
readState := m.GetUnifiedReadStateOrSnapshot()
|
|
if readState == nil {
|
|
return nil
|
|
}
|
|
|
|
vmViews := readState.VMs()
|
|
if len(vmViews) == 0 {
|
|
return nil
|
|
}
|
|
|
|
vms := make([]models.VM, 0, len(vmViews))
|
|
for _, vm := range vmViews {
|
|
if vm == nil {
|
|
continue
|
|
}
|
|
vms = append(vms, vmFromReadStateView(vm))
|
|
}
|
|
return vms
|
|
}
|
|
|
|
// ContainersSnapshot returns the current system containers.
|
|
func (m *Monitor) ContainersSnapshot() []models.Container {
|
|
if m == nil {
|
|
return nil
|
|
}
|
|
readState := m.GetUnifiedReadStateOrSnapshot()
|
|
if readState == nil {
|
|
return nil
|
|
}
|
|
|
|
containerViews := readState.Containers()
|
|
if len(containerViews) == 0 {
|
|
return nil
|
|
}
|
|
|
|
containers := make([]models.Container, 0, len(containerViews))
|
|
for _, container := range containerViews {
|
|
if container == nil {
|
|
continue
|
|
}
|
|
containers = append(containers, containerFromReadStateView(container))
|
|
}
|
|
return containers
|
|
}
|
|
|
|
// NodesSnapshot returns the current Proxmox nodes.
|
|
func (m *Monitor) NodesSnapshot() []models.Node {
|
|
if m == nil {
|
|
return nil
|
|
}
|
|
readState := m.GetUnifiedReadStateOrSnapshot()
|
|
if readState == nil {
|
|
return nil
|
|
}
|
|
|
|
nodeViews := readState.Nodes()
|
|
if len(nodeViews) == 0 {
|
|
return nil
|
|
}
|
|
|
|
nodes := make([]models.Node, 0, len(nodeViews))
|
|
for _, node := range nodeViews {
|
|
if node == nil {
|
|
continue
|
|
}
|
|
nodes = append(nodes, nodeFromReadStateView(node))
|
|
}
|
|
return nodes
|
|
}
|
|
|
|
// DockerHostsSnapshot returns the current Docker hosts.
|
|
func (m *Monitor) DockerHostsSnapshot() []models.DockerHost {
|
|
if m == nil {
|
|
return nil
|
|
}
|
|
readState := m.GetUnifiedReadStateOrSnapshot()
|
|
if readState == nil {
|
|
return nil
|
|
}
|
|
|
|
dockerViews := readState.DockerHosts()
|
|
if len(dockerViews) == 0 {
|
|
return nil
|
|
}
|
|
|
|
hosts := make([]models.DockerHost, 0, len(dockerViews))
|
|
for _, host := range dockerViews {
|
|
if host == nil {
|
|
continue
|
|
}
|
|
hosts = append(hosts, dockerHostFromReadStateView(host))
|
|
}
|
|
return hosts
|
|
}
|
|
|
|
// StorageSnapshot returns the current storage pools.
|
|
func (m *Monitor) StorageSnapshot() []models.Storage {
|
|
if m == nil {
|
|
return nil
|
|
}
|
|
readState := m.GetUnifiedReadStateOrSnapshot()
|
|
if readState == nil {
|
|
return nil
|
|
}
|
|
|
|
storagePools := readState.StoragePools()
|
|
if len(storagePools) == 0 {
|
|
return nil
|
|
}
|
|
|
|
storage := make([]models.Storage, 0, len(storagePools))
|
|
for _, pool := range storagePools {
|
|
if pool == nil {
|
|
continue
|
|
}
|
|
storage = append(storage, storageFromReadStateView(pool))
|
|
}
|
|
return storage
|
|
}
|
|
|
|
func storageFromReadStateView(view *unifiedresources.StoragePoolView) models.Storage {
|
|
if view == nil {
|
|
return models.Storage{}
|
|
}
|
|
|
|
storageID := strings.TrimSpace(view.SourceID())
|
|
if storageID == "" {
|
|
storageID = strings.TrimSpace(view.ID())
|
|
}
|
|
|
|
nodes := view.AccessibleNodes()
|
|
nodeIDs := storageNodeIDsFromReadState(view.Instance(), nodes)
|
|
total := view.DiskTotal()
|
|
used := view.DiskUsed()
|
|
free := total - used
|
|
if free < 0 {
|
|
free = 0
|
|
}
|
|
|
|
return models.Storage{
|
|
ID: storageID,
|
|
Name: view.Name(),
|
|
Node: view.Node(),
|
|
Instance: view.Instance(),
|
|
Nodes: nodes,
|
|
NodeIDs: nodeIDs,
|
|
NodeCount: len(nodes),
|
|
Type: view.StorageType(),
|
|
Status: string(view.Status()),
|
|
Path: view.Path(),
|
|
Total: total,
|
|
Used: used,
|
|
Free: free,
|
|
Usage: view.DiskPercent(),
|
|
Content: view.Content(),
|
|
Shared: view.Shared(),
|
|
Enabled: view.Enabled(),
|
|
Active: view.Active(),
|
|
ZFSPool: storageZFSPoolFromReadStateView(view),
|
|
}
|
|
}
|
|
|
|
func nodeFromReadStateView(view *unifiedresources.NodeView) models.Node {
|
|
if view == nil {
|
|
return models.Node{}
|
|
}
|
|
|
|
name := view.NodeName()
|
|
displayName := ""
|
|
if trimmed := strings.TrimSpace(view.Name()); trimmed != "" && trimmed != name {
|
|
displayName = trimmed
|
|
}
|
|
|
|
return models.Node{
|
|
ID: firstNonEmptyString(view.SourceID(), view.ID()),
|
|
Name: name,
|
|
DisplayName: displayName,
|
|
Instance: view.Instance(),
|
|
Host: view.HostURL(),
|
|
GuestURL: view.GuestURL(),
|
|
Status: string(view.Status()),
|
|
Type: "node",
|
|
CPU: view.CPUPercent(),
|
|
Memory: models.Memory{Used: view.MemoryUsed(), Total: view.MemoryTotal(), Free: maxInt64(0, view.MemoryTotal()-view.MemoryUsed()), Usage: view.MemoryPercent()},
|
|
Disk: models.Disk{Used: view.DiskUsed(), Total: view.DiskTotal(), Free: maxInt64(0, view.DiskTotal()-view.DiskUsed()), Usage: view.DiskPercent()},
|
|
Uptime: view.Uptime(),
|
|
LoadAverage: view.LoadAverage(),
|
|
KernelVersion: view.KernelVersion(),
|
|
PVEVersion: view.PVEVersion(),
|
|
CPUInfo: view.CPUInfo(),
|
|
Temperature: view.TemperatureDetails(),
|
|
TemperatureMonitoringEnabled: view.TemperatureMonitoringEnabled(),
|
|
LastSeen: view.LastSeen(),
|
|
ConnectionHealth: view.ConnectionHealth(),
|
|
IsClusterMember: view.IsClusterMember(),
|
|
ClusterName: view.ClusterName(),
|
|
PendingUpdates: view.PendingUpdates(),
|
|
PendingUpdatesCheckedAt: view.PendingUpdatesCheckedAt(),
|
|
LinkedAgentID: view.LinkedAgentID(),
|
|
}
|
|
}
|
|
|
|
func hostFromReadStateView(view *unifiedresources.HostView) models.Host {
|
|
if view == nil {
|
|
return models.Host{}
|
|
}
|
|
|
|
displayName := ""
|
|
if trimmed := strings.TrimSpace(view.Name()); trimmed != "" && trimmed != view.Hostname() {
|
|
displayName = trimmed
|
|
}
|
|
|
|
return models.Host{
|
|
ID: firstNonEmptyString(view.AgentID(), view.ID()),
|
|
Hostname: view.Hostname(),
|
|
DisplayName: displayName,
|
|
Platform: view.Platform(),
|
|
OSName: view.OSName(),
|
|
OSVersion: view.OSVersion(),
|
|
KernelVersion: view.KernelVersion(),
|
|
Architecture: view.Architecture(),
|
|
CPUCount: view.CPUCount(),
|
|
CPUUsage: view.CPUPercent(),
|
|
Memory: hostMemoryFromReadStateView(view),
|
|
LoadAverage: view.LoadAverage(),
|
|
Disks: hostDisksFromReadStateView(view.Disks()),
|
|
DiskIO: hostDiskIOFromReadStateView(view.DiskIO()),
|
|
NetworkInterfaces: hostNetworkInterfacesFromReadStateView(view.NetworkInterfaces()),
|
|
Sensors: hostSensorsFromReadStateView(view.Sensors()),
|
|
RAID: hostRAIDFromReadStateView(view.RAID()),
|
|
Unraid: hostUnraidFromReadStateView(view.Unraid()),
|
|
Ceph: hostCephFromReadStateView(view.Ceph()),
|
|
Status: string(view.Status()),
|
|
UptimeSeconds: view.UptimeSeconds(),
|
|
IntervalSeconds: view.IntervalSeconds(),
|
|
LastSeen: view.LastSeen(),
|
|
AgentVersion: view.AgentVersion(),
|
|
MachineID: view.MachineID(),
|
|
CommandsEnabled: view.CommandsEnabled(),
|
|
ReportIP: view.ReportIP(),
|
|
TokenID: view.TokenID(),
|
|
TokenName: view.TokenName(),
|
|
TokenHint: view.TokenHint(),
|
|
TokenLastUsedAt: view.TokenLastUsedAt(),
|
|
Tags: view.Tags(),
|
|
DiskExclude: view.DiskExclude(),
|
|
IsLegacy: view.IsLegacy(),
|
|
NetInRate: view.NetInRate(),
|
|
NetOutRate: view.NetOutRate(),
|
|
DiskReadRate: view.DiskReadRate(),
|
|
DiskWriteRate: view.DiskWriteRate(),
|
|
LinkedNodeID: view.LinkedNodeID(),
|
|
LinkedVMID: view.LinkedVMID(),
|
|
LinkedContainerID: view.LinkedContainerID(),
|
|
}
|
|
}
|
|
|
|
func vmFromReadStateView(view *unifiedresources.VMView) models.VM {
|
|
if view == nil {
|
|
return models.VM{}
|
|
}
|
|
|
|
totalMemory := view.MemoryTotal()
|
|
usedMemory := view.MemoryUsed()
|
|
totalDisk := view.DiskTotal()
|
|
usedDisk := view.DiskUsed()
|
|
|
|
return models.VM{
|
|
ID: firstNonEmptyString(view.SourceID(), view.ID()),
|
|
VMID: view.VMID(),
|
|
Name: view.Name(),
|
|
Node: view.Node(),
|
|
Instance: view.Instance(),
|
|
Status: string(view.Status()),
|
|
Type: "qemu",
|
|
CPU: view.CPUPercent(),
|
|
CPUs: view.CPUs(),
|
|
Memory: models.Memory{Total: totalMemory, Used: usedMemory, Free: maxInt64(0, totalMemory-usedMemory), Usage: view.MemoryPercent()},
|
|
Disk: models.Disk{Used: usedDisk, Total: totalDisk, Free: maxInt64(0, totalDisk-usedDisk), Usage: view.DiskPercent()},
|
|
Disks: guestDisksFromReadStateView(view.Disks()),
|
|
DiskStatusReason: view.DiskStatusReason(),
|
|
IPAddresses: view.IPAddresses(),
|
|
OSName: view.OSName(),
|
|
OSVersion: view.OSVersion(),
|
|
AgentVersion: view.AgentVersion(),
|
|
NetworkInterfaces: guestNetworkInterfacesFromReadStateView(view.NetworkInterfaces()),
|
|
NetworkIn: maxInt64(0, int64(view.NetIn())),
|
|
NetworkOut: maxInt64(0, int64(view.NetOut())),
|
|
DiskRead: maxInt64(0, int64(view.DiskRead())),
|
|
DiskWrite: maxInt64(0, int64(view.DiskWrite())),
|
|
Uptime: view.Uptime(),
|
|
Template: view.Template(),
|
|
LastBackup: view.LastBackup(),
|
|
Tags: view.Tags(),
|
|
Lock: view.Lock(),
|
|
LastSeen: view.LastSeen(),
|
|
}
|
|
}
|
|
|
|
func containerFromReadStateView(view *unifiedresources.ContainerView) models.Container {
|
|
if view == nil {
|
|
return models.Container{}
|
|
}
|
|
|
|
totalMemory := view.MemoryTotal()
|
|
usedMemory := view.MemoryUsed()
|
|
totalDisk := view.DiskTotal()
|
|
usedDisk := view.DiskUsed()
|
|
|
|
return models.Container{
|
|
ID: firstNonEmptyString(view.SourceID(), view.ID()),
|
|
VMID: view.VMID(),
|
|
Name: view.Name(),
|
|
Node: view.Node(),
|
|
Instance: view.Instance(),
|
|
Status: string(view.Status()),
|
|
Type: firstNonEmptyString(view.ContainerType(), "lxc"),
|
|
CPU: view.CPUPercent(),
|
|
CPUs: view.CPUs(),
|
|
Memory: models.Memory{Total: totalMemory, Used: usedMemory, Free: maxInt64(0, totalMemory-usedMemory), Usage: view.MemoryPercent()},
|
|
Disk: models.Disk{Used: usedDisk, Total: totalDisk, Free: maxInt64(0, totalDisk-usedDisk), Usage: view.DiskPercent()},
|
|
Disks: guestDisksFromReadStateView(view.Disks()),
|
|
NetworkIn: maxInt64(0, int64(view.NetIn())),
|
|
NetworkOut: maxInt64(0, int64(view.NetOut())),
|
|
DiskRead: maxInt64(0, int64(view.DiskRead())),
|
|
DiskWrite: maxInt64(0, int64(view.DiskWrite())),
|
|
Uptime: view.Uptime(),
|
|
Template: view.Template(),
|
|
LastBackup: view.LastBackup(),
|
|
Tags: view.Tags(),
|
|
Lock: view.Lock(),
|
|
LastSeen: view.LastSeen(),
|
|
IPAddresses: view.IPAddresses(),
|
|
NetworkInterfaces: guestNetworkInterfacesFromReadStateView(view.NetworkInterfaces()),
|
|
OSName: view.OSName(),
|
|
IsOCI: view.IsOCI(),
|
|
OSTemplate: view.OSTemplate(),
|
|
HasDocker: view.HasDocker(),
|
|
DockerCheckedAt: view.DockerCheckedAt(),
|
|
}
|
|
}
|
|
|
|
func guestDisksFromReadStateView(disks []unifiedresources.DiskInfo) []models.Disk {
|
|
if len(disks) == 0 {
|
|
return nil
|
|
}
|
|
out := make([]models.Disk, 0, len(disks))
|
|
for _, disk := range disks {
|
|
out = append(out, models.Disk{
|
|
Total: disk.Total,
|
|
Used: disk.Used,
|
|
Free: disk.Free,
|
|
Usage: disk.Usage,
|
|
Mountpoint: disk.Mountpoint,
|
|
Type: disk.Filesystem,
|
|
Device: disk.Device,
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
func guestNetworkInterfacesFromReadStateView(interfaces []unifiedresources.NetworkInterface) []models.GuestNetworkInterface {
|
|
if len(interfaces) == 0 {
|
|
return nil
|
|
}
|
|
out := make([]models.GuestNetworkInterface, 0, len(interfaces))
|
|
for _, iface := range interfaces {
|
|
out = append(out, models.GuestNetworkInterface{
|
|
Name: iface.Name,
|
|
MAC: iface.MAC,
|
|
Addresses: append([]string(nil), iface.Addresses...),
|
|
RXBytes: maxInt64(0, int64(iface.RXBytes)),
|
|
TXBytes: maxInt64(0, int64(iface.TXBytes)),
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
func pbsInstanceFromReadStateView(view *unifiedresources.PBSInstanceView) models.PBSInstance {
|
|
if view == nil {
|
|
return models.PBSInstance{}
|
|
}
|
|
|
|
return models.PBSInstance{
|
|
ID: firstNonEmptyString(view.InstanceID(), view.ID()),
|
|
Name: view.Name(),
|
|
Host: view.HostURL(),
|
|
GuestURL: view.GuestURL(),
|
|
Status: string(view.Status()),
|
|
Version: view.Version(),
|
|
CPU: view.CPUPercent(),
|
|
Memory: view.MemoryPercent(),
|
|
MemoryUsed: view.MemoryUsed(),
|
|
MemoryTotal: view.MemoryTotal(),
|
|
Uptime: view.UptimeSeconds(),
|
|
Datastores: view.DatastoreDetails(),
|
|
BackupJobs: view.BackupJobs(),
|
|
SyncJobs: view.SyncJobs(),
|
|
VerifyJobs: view.VerifyJobs(),
|
|
PruneJobs: view.PruneJobs(),
|
|
GarbageJobs: view.GarbageJobs(),
|
|
ConnectionHealth: view.ConnectionHealth(),
|
|
LastSeen: view.LastSeen(),
|
|
}
|
|
}
|
|
|
|
func hostMemoryFromReadStateView(view *unifiedresources.HostView) models.Memory {
|
|
if view == nil {
|
|
return models.Memory{}
|
|
}
|
|
total := view.MemoryTotal()
|
|
used := view.MemoryUsed()
|
|
return models.Memory{
|
|
Total: total,
|
|
Used: used,
|
|
Free: maxInt64(0, total-used),
|
|
Usage: view.MemoryPercent(),
|
|
SwapUsed: view.SwapUsed(),
|
|
SwapTotal: view.SwapTotal(),
|
|
}
|
|
}
|
|
|
|
func hostDisksFromReadStateView(disks []unifiedresources.DiskInfo) []models.Disk {
|
|
if len(disks) == 0 {
|
|
return nil
|
|
}
|
|
out := make([]models.Disk, 0, len(disks))
|
|
for _, disk := range disks {
|
|
out = append(out, models.Disk{
|
|
Total: disk.Total,
|
|
Used: disk.Used,
|
|
Free: disk.Free,
|
|
Usage: disk.Usage,
|
|
Mountpoint: disk.Mountpoint,
|
|
Type: disk.Filesystem,
|
|
Device: disk.Device,
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
func hostDiskIOFromReadStateView(diskIO []unifiedresources.HostDiskIOMeta) []models.DiskIO {
|
|
if len(diskIO) == 0 {
|
|
return nil
|
|
}
|
|
out := make([]models.DiskIO, 0, len(diskIO))
|
|
for _, entry := range diskIO {
|
|
out = append(out, models.DiskIO{
|
|
Device: entry.Device,
|
|
ReadBytes: entry.ReadBytes,
|
|
WriteBytes: entry.WriteBytes,
|
|
ReadOps: entry.ReadOps,
|
|
WriteOps: entry.WriteOps,
|
|
IOTime: entry.IOTimeMs,
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
func hostNetworkInterfacesFromReadStateView(interfaces []unifiedresources.NetworkInterface) []models.HostNetworkInterface {
|
|
if len(interfaces) == 0 {
|
|
return nil
|
|
}
|
|
out := make([]models.HostNetworkInterface, 0, len(interfaces))
|
|
for _, iface := range interfaces {
|
|
out = append(out, models.HostNetworkInterface{
|
|
Name: iface.Name,
|
|
MAC: iface.MAC,
|
|
Addresses: append([]string(nil), iface.Addresses...),
|
|
RXBytes: iface.RXBytes,
|
|
TXBytes: iface.TXBytes,
|
|
SpeedMbps: int64PtrCopy(iface.SpeedMbps),
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
func hostSensorsFromReadStateView(sensors *unifiedresources.HostSensorMeta) models.HostSensorSummary {
|
|
if sensors == nil {
|
|
return models.HostSensorSummary{}
|
|
}
|
|
out := models.HostSensorSummary{}
|
|
if len(sensors.TemperatureCelsius) > 0 {
|
|
out.TemperatureCelsius = make(map[string]float64, len(sensors.TemperatureCelsius))
|
|
for k, v := range sensors.TemperatureCelsius {
|
|
out.TemperatureCelsius[k] = v
|
|
}
|
|
}
|
|
if len(sensors.FanRPM) > 0 {
|
|
out.FanRPM = make(map[string]float64, len(sensors.FanRPM))
|
|
for k, v := range sensors.FanRPM {
|
|
out.FanRPM[k] = v
|
|
}
|
|
}
|
|
if len(sensors.Additional) > 0 {
|
|
out.Additional = make(map[string]float64, len(sensors.Additional))
|
|
for k, v := range sensors.Additional {
|
|
out.Additional[k] = v
|
|
}
|
|
}
|
|
if len(sensors.SMART) > 0 {
|
|
out.SMART = make([]models.HostDiskSMART, 0, len(sensors.SMART))
|
|
for _, smart := range sensors.SMART {
|
|
out.SMART = append(out.SMART, models.HostDiskSMART{
|
|
Device: smart.Device,
|
|
Model: smart.Model,
|
|
Serial: smart.Serial,
|
|
WWN: smart.WWN,
|
|
Type: smart.Type,
|
|
Temperature: smart.Temperature,
|
|
Health: smart.Health,
|
|
Standby: smart.Standby,
|
|
Attributes: smartAttributesCopy(smart.Attributes),
|
|
})
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func hostRAIDFromReadStateView(raid []unifiedresources.HostRAIDMeta) []models.HostRAIDArray {
|
|
if len(raid) == 0 {
|
|
return nil
|
|
}
|
|
out := make([]models.HostRAIDArray, 0, len(raid))
|
|
for _, entry := range raid {
|
|
devices := make([]models.HostRAIDDevice, 0, len(entry.Devices))
|
|
for _, device := range entry.Devices {
|
|
devices = append(devices, models.HostRAIDDevice{
|
|
Device: device.Device,
|
|
State: device.State,
|
|
Slot: device.Slot,
|
|
})
|
|
}
|
|
out = append(out, models.HostRAIDArray{
|
|
Device: entry.Device,
|
|
Name: entry.Name,
|
|
Level: entry.Level,
|
|
State: entry.State,
|
|
TotalDevices: entry.TotalDevices,
|
|
ActiveDevices: entry.ActiveDevices,
|
|
WorkingDevices: entry.WorkingDevices,
|
|
FailedDevices: entry.FailedDevices,
|
|
SpareDevices: entry.SpareDevices,
|
|
UUID: entry.UUID,
|
|
Devices: devices,
|
|
RebuildPercent: entry.RebuildPercent,
|
|
RebuildSpeed: entry.RebuildSpeed,
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
func hostUnraidFromReadStateView(unraid *unifiedresources.HostUnraidMeta) *models.HostUnraidStorage {
|
|
if unraid == nil {
|
|
return nil
|
|
}
|
|
out := &models.HostUnraidStorage{
|
|
ArrayStarted: unraid.ArrayStarted,
|
|
ArrayState: unraid.ArrayState,
|
|
SyncAction: unraid.SyncAction,
|
|
SyncProgress: unraid.SyncProgress,
|
|
SyncErrors: unraid.SyncErrors,
|
|
NumProtected: unraid.NumProtected,
|
|
NumDisabled: unraid.NumDisabled,
|
|
NumInvalid: unraid.NumInvalid,
|
|
NumMissing: unraid.NumMissing,
|
|
}
|
|
if len(unraid.Disks) > 0 {
|
|
out.Disks = make([]models.HostUnraidDisk, 0, len(unraid.Disks))
|
|
for _, disk := range unraid.Disks {
|
|
out.Disks = append(out.Disks, models.HostUnraidDisk{
|
|
Name: disk.Name,
|
|
Device: disk.Device,
|
|
Role: disk.Role,
|
|
Status: disk.Status,
|
|
RawStatus: disk.RawStatus,
|
|
Serial: disk.Serial,
|
|
Filesystem: disk.Filesystem,
|
|
SizeBytes: disk.SizeBytes,
|
|
Slot: disk.Slot,
|
|
})
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func hostCephFromReadStateView(ceph *unifiedresources.HostCephMeta) *models.HostCephCluster {
|
|
if ceph == nil {
|
|
return nil
|
|
}
|
|
out := &models.HostCephCluster{
|
|
FSID: ceph.FSID,
|
|
Health: models.HostCephHealth{
|
|
Status: ceph.Health.Status,
|
|
},
|
|
MonMap: models.HostCephMonitorMap{
|
|
Epoch: ceph.MonMap.Epoch,
|
|
NumMons: ceph.MonMap.NumMons,
|
|
},
|
|
MgrMap: models.HostCephManagerMap{
|
|
Available: ceph.MgrMap.Available,
|
|
NumMgrs: ceph.MgrMap.NumMgrs,
|
|
ActiveMgr: ceph.MgrMap.ActiveMgr,
|
|
Standbys: ceph.MgrMap.Standbys,
|
|
},
|
|
OSDMap: models.HostCephOSDMap{
|
|
Epoch: ceph.OSDMap.Epoch,
|
|
NumOSDs: ceph.OSDMap.NumOSDs,
|
|
NumUp: ceph.OSDMap.NumUp,
|
|
NumIn: ceph.OSDMap.NumIn,
|
|
NumDown: ceph.OSDMap.NumDown,
|
|
NumOut: ceph.OSDMap.NumOut,
|
|
},
|
|
PGMap: models.HostCephPGMap{
|
|
NumPGs: ceph.PGMap.NumPGs,
|
|
BytesTotal: ceph.PGMap.BytesTotal,
|
|
BytesUsed: ceph.PGMap.BytesUsed,
|
|
BytesAvailable: ceph.PGMap.BytesAvailable,
|
|
DataBytes: ceph.PGMap.DataBytes,
|
|
UsagePercent: ceph.PGMap.UsagePercent,
|
|
DegradedRatio: ceph.PGMap.DegradedRatio,
|
|
MisplacedRatio: ceph.PGMap.MisplacedRatio,
|
|
ReadBytesPerSec: ceph.PGMap.ReadBytesPerSec,
|
|
WriteBytesPerSec: ceph.PGMap.WriteBytesPerSec,
|
|
ReadOpsPerSec: ceph.PGMap.ReadOpsPerSec,
|
|
WriteOpsPerSec: ceph.PGMap.WriteOpsPerSec,
|
|
},
|
|
CollectedAt: ceph.CollectedAt,
|
|
}
|
|
if len(ceph.Health.Summary) > 0 {
|
|
out.Health.Summary = make([]models.HostCephHealthSummary, 0, len(ceph.Health.Summary))
|
|
for _, summary := range ceph.Health.Summary {
|
|
out.Health.Summary = append(out.Health.Summary, models.HostCephHealthSummary{
|
|
Severity: summary.Severity,
|
|
Message: summary.Message,
|
|
})
|
|
}
|
|
}
|
|
if len(ceph.Health.Checks) > 0 {
|
|
out.Health.Checks = make(map[string]models.HostCephCheck, len(ceph.Health.Checks))
|
|
for name, check := range ceph.Health.Checks {
|
|
out.Health.Checks[name] = models.HostCephCheck{
|
|
Severity: check.Severity,
|
|
Message: check.Message,
|
|
Detail: append([]string(nil), check.Detail...),
|
|
}
|
|
}
|
|
}
|
|
if len(ceph.MonMap.Monitors) > 0 {
|
|
out.MonMap.Monitors = make([]models.HostCephMonitor, 0, len(ceph.MonMap.Monitors))
|
|
for _, monitor := range ceph.MonMap.Monitors {
|
|
out.MonMap.Monitors = append(out.MonMap.Monitors, models.HostCephMonitor{
|
|
Name: monitor.Name,
|
|
Rank: monitor.Rank,
|
|
Addr: monitor.Addr,
|
|
Status: monitor.Status,
|
|
})
|
|
}
|
|
}
|
|
if len(ceph.Pools) > 0 {
|
|
out.Pools = make([]models.HostCephPool, 0, len(ceph.Pools))
|
|
for _, pool := range ceph.Pools {
|
|
out.Pools = append(out.Pools, models.HostCephPool{
|
|
ID: pool.ID,
|
|
Name: pool.Name,
|
|
BytesUsed: pool.BytesUsed,
|
|
BytesAvailable: pool.BytesAvailable,
|
|
Objects: pool.Objects,
|
|
PercentUsed: pool.PercentUsed,
|
|
})
|
|
}
|
|
}
|
|
if len(ceph.Services) > 0 {
|
|
out.Services = make([]models.HostCephService, 0, len(ceph.Services))
|
|
for _, service := range ceph.Services {
|
|
out.Services = append(out.Services, models.HostCephService{
|
|
Type: service.Type,
|
|
Running: service.Running,
|
|
Total: service.Total,
|
|
Daemons: append([]string(nil), service.Daemons...),
|
|
})
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func dockerHostFromReadStateView(view *unifiedresources.DockerHostView) models.DockerHost {
|
|
if view == nil {
|
|
return models.DockerHost{}
|
|
}
|
|
|
|
totalMemory := view.TotalMemoryBytes()
|
|
if totalMemory == 0 {
|
|
totalMemory = view.MemoryTotal()
|
|
}
|
|
usedMemory := view.MemoryUsed()
|
|
freeMemory := maxInt64(0, totalMemory-usedMemory)
|
|
|
|
return models.DockerHost{
|
|
ID: firstNonEmptyString(view.HostSourceID(), view.ID()),
|
|
AgentID: view.AgentID(),
|
|
Hostname: view.Hostname(),
|
|
DisplayName: view.DisplayName(),
|
|
CustomDisplayName: view.CustomDisplayName(),
|
|
MachineID: view.MachineID(),
|
|
OS: view.OS(),
|
|
KernelVersion: view.KernelVersion(),
|
|
Architecture: view.Architecture(),
|
|
Runtime: view.Runtime(),
|
|
RuntimeVersion: view.RuntimeVersion(),
|
|
DockerVersion: view.DockerVersion(),
|
|
CPUs: view.CPUs(),
|
|
TotalMemoryBytes: totalMemory,
|
|
UptimeSeconds: view.UptimeSeconds(),
|
|
CPUUsage: view.CPUPercent(),
|
|
LoadAverage: view.LoadAverage(),
|
|
Memory: models.Memory{
|
|
Total: totalMemory,
|
|
Used: usedMemory,
|
|
Free: freeMemory,
|
|
Usage: view.MemoryPercent(),
|
|
},
|
|
Disks: hostDisksFromReadStateView(view.Disks()),
|
|
NetworkInterfaces: hostNetworkInterfacesFromReadStateView(view.NetworkInterfaces()),
|
|
Status: string(view.Status()),
|
|
LastSeen: view.LastSeen(),
|
|
IntervalSeconds: view.IntervalSeconds(),
|
|
AgentVersion: view.AgentVersion(),
|
|
Containers: view.Containers(),
|
|
Services: view.Services(),
|
|
Tasks: view.Tasks(),
|
|
Swarm: dockerSwarmFromReadStateView(view.Swarm()),
|
|
TokenID: view.TokenID(),
|
|
TokenName: view.TokenName(),
|
|
TokenHint: view.TokenHint(),
|
|
TokenLastUsedAt: view.TokenLastUsedAt(),
|
|
Hidden: view.Hidden(),
|
|
PendingUninstall: view.PendingUninstall(),
|
|
Command: view.Command(),
|
|
IsLegacy: view.IsLegacy(),
|
|
NetInRate: view.NetInRate(),
|
|
NetOutRate: view.NetOutRate(),
|
|
DiskReadRate: view.DiskReadRate(),
|
|
DiskWriteRate: view.DiskWriteRate(),
|
|
}
|
|
}
|
|
|
|
func dockerSwarmFromReadStateView(in *unifiedresources.DockerSwarmInfo) *models.DockerSwarmInfo {
|
|
if in == nil {
|
|
return nil
|
|
}
|
|
return &models.DockerSwarmInfo{
|
|
NodeID: in.NodeID,
|
|
NodeRole: in.NodeRole,
|
|
LocalState: in.LocalState,
|
|
ControlAvailable: in.ControlAvailable,
|
|
ClusterID: in.ClusterID,
|
|
ClusterName: in.ClusterName,
|
|
Scope: in.Scope,
|
|
Error: in.Error,
|
|
}
|
|
}
|
|
|
|
// maxInt64 returns the larger of a and b.
func maxInt64(a, b int64) int64 {
	if b >= a {
		return b
	}
	return a
}
|
|
|
|
// int64PtrCopy returns a pointer to a fresh copy of *in so the result does not
// alias the input. It returns nil when in is nil.
func int64PtrCopy(in *int64) *int64 {
	if in != nil {
		dup := *in
		return &dup
	}
	return nil
}
|
|
|
|
func smartAttributesCopy(in *models.SMARTAttributes) *models.SMARTAttributes {
|
|
if in == nil {
|
|
return nil
|
|
}
|
|
out := *in
|
|
out.PowerOnHours = int64PtrCopy(in.PowerOnHours)
|
|
out.PowerCycles = int64PtrCopy(in.PowerCycles)
|
|
out.ReallocatedSectors = int64PtrCopy(in.ReallocatedSectors)
|
|
out.PendingSectors = int64PtrCopy(in.PendingSectors)
|
|
out.OfflineUncorrectable = int64PtrCopy(in.OfflineUncorrectable)
|
|
out.UDMACRCErrors = int64PtrCopy(in.UDMACRCErrors)
|
|
if in.PercentageUsed != nil {
|
|
value := *in.PercentageUsed
|
|
out.PercentageUsed = &value
|
|
}
|
|
if in.AvailableSpare != nil {
|
|
value := *in.AvailableSpare
|
|
out.AvailableSpare = &value
|
|
}
|
|
out.MediaErrors = int64PtrCopy(in.MediaErrors)
|
|
out.UnsafeShutdowns = int64PtrCopy(in.UnsafeShutdowns)
|
|
return &out
|
|
}
|
|
|
|
// firstNonEmptyString returns the first argument that is non-empty after
// trimming surrounding whitespace, in its trimmed form. It returns "" when
// every argument is blank or no arguments are given.
func firstNonEmptyString(values ...string) string {
	for i := range values {
		candidate := strings.TrimSpace(values[i])
		if candidate == "" {
			continue
		}
		return candidate
	}
	return ""
}
|
|
|
|
func storageZFSPoolFromReadStateView(view *unifiedresources.StoragePoolView) *models.ZFSPool {
|
|
if view == nil {
|
|
return nil
|
|
}
|
|
|
|
state := strings.TrimSpace(view.ZFSPoolState())
|
|
if !view.IsZFS() && state == "" && view.ZFSReadErrors() == 0 && view.ZFSWriteErrors() == 0 && view.ZFSChecksumErrors() == 0 {
|
|
return nil
|
|
}
|
|
|
|
return &models.ZFSPool{
|
|
Name: view.Name(),
|
|
State: state,
|
|
ReadErrors: view.ZFSReadErrors(),
|
|
WriteErrors: view.ZFSWriteErrors(),
|
|
ChecksumErrors: view.ZFSChecksumErrors(),
|
|
}
|
|
}
|
|
|
|
// storageNodeIDsFromReadState derives node IDs from node names. Each name is
// trimmed and blank names are dropped; when instance is non-empty the ID is
// "<instance>-<node>", otherwise it is the node name itself. It returns nil
// when no usable names remain.
func storageNodeIDsFromReadState(instance string, nodes []string) []string {
	if len(nodes) == 0 {
		return nil
	}

	prefix := ""
	if instance != "" {
		prefix = instance + "-"
	}

	ids := make([]string, 0, len(nodes))
	for _, raw := range nodes {
		if name := strings.TrimSpace(raw); name != "" {
			ids = append(ids, prefix+name)
		}
	}

	if len(ids) == 0 {
		return nil
	}
	return ids
}
|
|
|
|
// ActiveAlertsSnapshot returns the current active alerts.
|
|
func (m *Monitor) ActiveAlertsSnapshot() []models.Alert {
|
|
return m.activeAlertsSnapshot()
|
|
}
|
|
|
|
// RecentlyResolvedSnapshot returns the recently resolved alerts.
|
|
func (m *Monitor) RecentlyResolvedSnapshot() []models.ResolvedAlert {
|
|
return m.recentlyResolvedAlertsSnapshot()
|
|
}
|
|
|
|
// PVEBackupsSnapshot returns the current PVE backups.
|
|
func (m *Monitor) PVEBackupsSnapshot() models.PVEBackups {
|
|
return m.GetState().PVEBackups
|
|
}
|
|
|
|
// BuildFrontendState returns the current state converted to frontend format.
|
|
// This replaces the GetState().ToFrontend() pattern in consumer code.
|
|
func (m *Monitor) BuildFrontendState() models.StateFrontend {
|
|
return m.buildBroadcastFrontendStateFromSnapshot(m.GetState())
|
|
}
|
|
|
|
// BuildBroadcastFrontendState returns frontend state ready for websocket
|
|
// broadcasts, including the unified resource payload when a resource store is
|
|
// configured.
|
|
func (m *Monitor) BuildBroadcastFrontendState() models.StateFrontend {
|
|
return m.buildBroadcastFrontendStateFromSnapshot(m.GetState())
|
|
}
|
|
|
|
func buildFrontendStateFromSnapshot(snapshot models.StateSnapshot) models.StateFrontend {
|
|
return snapshot.ToFrontend()
|
|
}
|
|
|
|
func (m *Monitor) buildBroadcastFrontendStateFromSnapshot(snapshot models.StateSnapshot) models.StateFrontend {
|
|
frontendState := buildFrontendStateFromSnapshot(snapshot)
|
|
m.updateResourceStore(snapshot)
|
|
if m != nil && m.alertManager != nil {
|
|
if liveAlerts := m.activeAlertsSnapshot(); len(liveAlerts) > 0 || len(frontendState.ActiveAlerts) > 0 {
|
|
frontendState.ActiveAlerts = liveAlerts
|
|
}
|
|
}
|
|
unifiedView := m.currentUnifiedStateView()
|
|
frontendState.Resources = convertResourcesForBroadcast(unifiedView.resources)
|
|
frontendState.ConnectedInfrastructure = buildConnectedInfrastructure(unifiedView.resources, snapshot)
|
|
if !unifiedView.freshness.IsZero() {
|
|
frontendState.LastUpdate = unifiedView.freshness.UnixMilli()
|
|
}
|
|
return frontendState
|
|
}
|
|
|
|
// GetLiveStateSnapshot returns the underlying monitor state snapshot without
|
|
// applying global mock mode overrides.
|
|
//
|
|
// This is useful for agent management endpoints that need to reflect actual
|
|
// registrations even when mock mode is enabled for the UI/demo experience.
|
|
func (m *Monitor) GetLiveStateSnapshot() models.StateSnapshot {
|
|
if m == nil || m.state == nil {
|
|
return models.EmptyStateSnapshot()
|
|
}
|
|
return m.state.GetSnapshot()
|
|
}
|
|
|
|
// GetLiveHostsSnapshot returns the underlying registered host agents without
|
|
// applying global mock mode overrides.
|
|
func (m *Monitor) GetLiveHostsSnapshot() []models.Host {
|
|
if m == nil || m.state == nil {
|
|
return nil
|
|
}
|
|
return m.state.GetSnapshot().Hosts
|
|
}
|
|
|
|
// SetOrgID sets the organization ID for this monitor instance.
|
|
// This is used for tenant isolation in multi-tenant deployments.
|
|
func (m *Monitor) SetOrgID(orgID string) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
m.orgID = strings.TrimSpace(orgID)
|
|
}
|
|
|
|
// GetOrgID returns the organization ID for this monitor instance.
|
|
// Returns empty string for default/legacy monitors.
|
|
func (m *Monitor) GetOrgID() string {
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
return m.orgID
|
|
}
|
|
|
|
// stateBroadcaster is the subset of hub behaviour that broadcastState needs:
// one method for a global broadcast and one for a tenant-scoped broadcast.
type stateBroadcaster interface {
	// BroadcastState sends state without tenant scoping.
	BroadcastState(state interface{})
	// BroadcastStateToTenant sends state scoped to the given organization ID.
	BroadcastStateToTenant(orgID string, state interface{})
}
|
|
|
|
// broadcastState broadcasts state to WebSocket clients.
|
|
// Monitors with an explicit org ID (including "default") are tenant-scoped.
|
|
// Legacy monitors without an org ID broadcast globally.
|
|
func (m *Monitor) broadcastState(hub stateBroadcaster, frontendState interface{}) {
|
|
if hub == nil {
|
|
return
|
|
}
|
|
|
|
orgID := strings.TrimSpace(m.GetOrgID())
|
|
if orgID != "" {
|
|
hub.BroadcastStateToTenant(orgID, frontendState)
|
|
} else {
|
|
hub.BroadcastState(frontendState)
|
|
}
|
|
}
|
|
|
|
func (m *Monitor) broadcastEscalatedAlert(hub *websocket.Hub, alert *alerts.Alert) {
|
|
if hub == nil || alert == nil {
|
|
return
|
|
}
|
|
|
|
hub.BroadcastAlertToTenant(m.GetOrgID(), alert)
|
|
}
|
|
|
|
// SetMockMode switches between mock data and real infrastructure data at runtime.
|
|
func (m *Monitor) SetMockMode(enable bool) error {
|
|
current := mock.IsMockEnabled()
|
|
if current == enable {
|
|
log.Info().Bool("mockMode", enable).Msg("mock mode already in desired state")
|
|
return nil
|
|
}
|
|
|
|
if enable {
|
|
m.stopMockMetricsSampler()
|
|
if err := mock.SetEnabled(true); err != nil {
|
|
return err
|
|
}
|
|
m.alertManager.ClearActiveAlerts()
|
|
m.mu.Lock()
|
|
m.resetStateLocked()
|
|
m.metricsHistory.Reset()
|
|
m.mu.Unlock()
|
|
m.StopDiscoveryService()
|
|
m.mu.RLock()
|
|
ctx := m.runtimeCtx
|
|
m.mu.RUnlock()
|
|
if ctx != nil {
|
|
m.startMockMetricsSampler(ctx)
|
|
}
|
|
log.Info().Msg("switched monitor to mock mode")
|
|
} else {
|
|
m.stopMockMetricsSampler()
|
|
if err := mock.SetEnabled(false); err != nil {
|
|
return err
|
|
}
|
|
m.alertManager.ClearActiveAlerts()
|
|
m.mu.Lock()
|
|
m.resetStateLocked()
|
|
m.metricsHistory.Reset()
|
|
m.mu.Unlock()
|
|
log.Info().Msg("switched monitor to real data mode")
|
|
}
|
|
|
|
m.mu.RLock()
|
|
ctx := m.runtimeCtx
|
|
hub := m.wsHub
|
|
m.mu.RUnlock()
|
|
|
|
if hub != nil {
|
|
frontendState := m.buildBroadcastFrontendStateFromSnapshot(m.GetState())
|
|
// Use tenant-aware broadcast method
|
|
m.broadcastState(hub, frontendState)
|
|
}
|
|
|
|
if enable && ctx != nil && keepRealPollingInMockMode() {
|
|
// Keep real metrics flowing while mock mode is enabled.
|
|
go m.poll(ctx, hub)
|
|
}
|
|
|
|
if !enable && ctx != nil {
|
|
// Kick off an immediate poll to repopulate state with live data.
|
|
go m.poll(ctx, hub)
|
|
if hub != nil && m.config.DiscoveryEnabled {
|
|
go m.StartDiscoveryService(ctx, hub, m.config.DiscoverySubnet)
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (m *Monitor) resetStateLocked() {
|
|
m.state = models.NewState()
|
|
m.state.Stats = models.Stats{
|
|
StartTime: m.startTime,
|
|
Version: "2.0.0-go",
|
|
}
|
|
}
|
|
|
|
// GetStartTime returns the monitor start time
|
|
func (m *Monitor) GetStartTime() time.Time {
|
|
return m.startTime
|
|
}
|
|
|
|
// GetDiscoveryService returns the discovery service
|
|
func (m *Monitor) GetDiscoveryService() *discovery.Service {
|
|
return m.discoveryService
|
|
}
|
|
|
|
// StartDiscoveryService starts the discovery service if not already running
|
|
func (m *Monitor) StartDiscoveryService(ctx context.Context, wsHub *websocket.Hub, subnet string) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
if m.discoveryService != nil {
|
|
log.Debug().Msg("discovery service already running")
|
|
return
|
|
}
|
|
|
|
if subnet == "" {
|
|
subnet = "auto"
|
|
}
|
|
|
|
cfgProvider := func() config.DiscoveryConfig {
|
|
m.mu.RLock()
|
|
defer m.mu.RUnlock()
|
|
if m.config == nil {
|
|
return config.DefaultDiscoveryConfig()
|
|
}
|
|
return config.CloneDiscoveryConfig(m.config.Discovery)
|
|
}
|
|
|
|
m.discoveryService = discovery.NewService(wsHub, 5*time.Minute, subnet, cfgProvider)
|
|
if m.discoveryService != nil {
|
|
m.discoveryService.Start(ctx)
|
|
log.Info().Str("subnet", subnet).Msg("discovery service started")
|
|
} else {
|
|
log.Error().Msg("failed to create discovery service")
|
|
}
|
|
}
|
|
|
|
// StopDiscoveryService stops the discovery service if running
|
|
func (m *Monitor) StopDiscoveryService() {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
if m.discoveryService != nil {
|
|
m.discoveryService.Stop()
|
|
m.discoveryService = nil
|
|
log.Info().Msg("discovery service stopped")
|
|
}
|
|
}
|
|
|
|
// EnableTemperatureMonitoring enables temperature data collection
|
|
func (m *Monitor) EnableTemperatureMonitoring() {
|
|
// Temperature collection is always enabled when tempCollector is initialized
|
|
// This method exists for interface compatibility
|
|
log.Info().Msg("temperature monitoring enabled")
|
|
}
|
|
|
|
// DisableTemperatureMonitoring disables temperature data collection
|
|
func (m *Monitor) DisableTemperatureMonitoring() {
|
|
// Temperature collection is always enabled when tempCollector is initialized
|
|
// This method exists for interface compatibility
|
|
log.Info().Msg("temperature monitoring disabled")
|
|
}
|
|
|
|
// SetResourceStore sets the resource store for polling optimization.
|
|
// When set, the monitor will check if it should reduce polling frequency
|
|
// for nodes that have host agents providing data.
|
|
func (m *Monitor) SetResourceStore(store ResourceStoreInterface) {
|
|
m.mu.Lock()
|
|
m.resourceStore = store
|
|
incidentStore := m.incidentStore
|
|
m.mu.Unlock()
|
|
log.Info().Msg("resource store set for polling optimization")
|
|
|
|
if incidentStore != nil {
|
|
if timelineStore, ok := store.(memory.IncidentTimelineStore); ok {
|
|
incidentStore.SetResourceTimelineStore(timelineStore)
|
|
} else {
|
|
incidentStore.SetResourceTimelineStore(nil)
|
|
}
|
|
}
|
|
|
|
// Immediately backfill the store from current state so ReadState
|
|
// consumers have data as soon as the store is wired.
|
|
// Guard against minimally initialized monitors (e.g., test fixtures
|
|
// with bare &Monitor{}) where m.state may be nil.
|
|
if store != nil && m.state != nil {
|
|
m.updateResourceStore(m.GetState())
|
|
}
|
|
}
|
|
|
|
// SetSupplementalRecordsProvider configures source-native resource providers
|
|
// that ingest alongside the legacy state snapshot path.
|
|
func (m *Monitor) SetSupplementalRecordsProvider(source unifiedresources.DataSource, provider MonitorSupplementalRecordsProvider) {
|
|
if m == nil {
|
|
return
|
|
}
|
|
|
|
normalized := unifiedresources.DataSource(strings.ToLower(strings.TrimSpace(string(source))))
|
|
if normalized == "" {
|
|
return
|
|
}
|
|
|
|
m.mu.Lock()
|
|
if m.supplementalProviders == nil {
|
|
m.supplementalProviders = make(map[unifiedresources.DataSource]MonitorSupplementalRecordsProvider)
|
|
}
|
|
if provider == nil {
|
|
delete(m.supplementalProviders, normalized)
|
|
} else {
|
|
m.supplementalProviders[normalized] = provider
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.updateResourceStore(m.GetState())
|
|
}
|
|
|
|
// SetRecoveryManager wires the recovery store manager for best-effort ingestion of
|
|
// recovery points derived from polled backup/snapshot data.
|
|
func (m *Monitor) SetRecoveryManager(manager *recoverymanager.Manager) {
|
|
m.mu.Lock()
|
|
m.recoveryManager = manager
|
|
m.mu.Unlock()
|
|
|
|
// Try cleanup during wiring so monitors that are already running still get
|
|
// the migration once a recovery manager becomes available.
|
|
go m.purgeStalePVEPBSBackupsBestEffort(context.Background())
|
|
}
|
|
|
|
// GetNotificationManager returns the notification manager
|
|
func (m *Monitor) GetNotificationManager() *notifications.NotificationManager {
|
|
return m.notificationMgr
|
|
}
|
|
|
|
// GetConfigPersistence returns the config persistence manager
|
|
func (m *Monitor) GetConfigPersistence() *config.ConfigPersistence {
|
|
return m.configPersist
|
|
}
|
|
|
|
// GetMetricsStore returns the persistent metrics store
|
|
func (m *Monitor) GetMetricsStore() *metrics.Store {
|
|
return m.metricsStore
|
|
}
|
|
|
|
// GetMetricsHistory returns the in-memory metrics history for trend analysis
|
|
// This is used by the AI context builder to compute trends and predictions
|
|
func (m *Monitor) GetMetricsHistory() *MetricsHistory {
|
|
return m.metricsHistory
|
|
}
|
|
|
|
// GetUnifiedResources returns the current unified resource view for this monitor.
|
|
// Returns nil when no resource store is configured.
|
|
func (m *Monitor) GetUnifiedResources() []unifiedresources.Resource {
|
|
if m == nil {
|
|
return nil
|
|
}
|
|
|
|
m.mu.RLock()
|
|
store := m.resourceStore
|
|
m.mu.RUnlock()
|
|
if store == nil {
|
|
return nil
|
|
}
|
|
|
|
return store.GetAll()
|
|
}
|
|
|
|
type monitorUnifiedStateView struct {
|
|
resources []unifiedresources.Resource
|
|
readState unifiedresources.ReadState
|
|
freshness time.Time
|
|
}
|
|
|
|
func monitorUnifiedStateViewFromSnapshot(snapshot models.StateSnapshot) monitorUnifiedStateView {
|
|
registry := unifiedresources.NewRegistry(nil)
|
|
registry.IngestSnapshot(snapshot)
|
|
adapter := unifiedresources.NewMonitorAdapter(registry)
|
|
return monitorUnifiedStateView{
|
|
resources: registry.List(),
|
|
readState: adapter,
|
|
freshness: snapshot.LastUpdate,
|
|
}
|
|
}
|
|
|
|
func monitorUnifiedStateViewFromResources(resources []unifiedresources.Resource, freshness time.Time) monitorUnifiedStateView {
|
|
registry := unifiedresources.NewRegistry(nil)
|
|
registry.IngestResources(resources)
|
|
adapter := unifiedresources.NewMonitorAdapter(registry)
|
|
return monitorUnifiedStateView{
|
|
resources: registry.List(),
|
|
readState: adapter,
|
|
freshness: freshness,
|
|
}
|
|
}
|
|
|
|
func (m *Monitor) currentUnifiedStateView() monitorUnifiedStateView {
|
|
if m == nil {
|
|
return monitorUnifiedStateView{}
|
|
}
|
|
|
|
if mock.IsMockEnabled() {
|
|
resources, freshness := mock.UnifiedResourceSnapshot()
|
|
if len(resources) > 0 || !freshness.IsZero() {
|
|
return monitorUnifiedStateViewFromResources(resources, freshness)
|
|
}
|
|
return monitorUnifiedStateViewFromSnapshot(m.GetState())
|
|
}
|
|
|
|
m.mu.RLock()
|
|
store := m.resourceStore
|
|
state := m.state
|
|
m.mu.RUnlock()
|
|
|
|
if store == nil {
|
|
return monitorUnifiedStateViewFromSnapshot(m.GetState())
|
|
}
|
|
|
|
resources := store.GetAll()
|
|
freshness := unifiedResourceFreshness(store, state)
|
|
|
|
if readState, ok := store.(unifiedresources.ReadState); ok {
|
|
return monitorUnifiedStateView{
|
|
resources: resources,
|
|
readState: readState,
|
|
freshness: freshness,
|
|
}
|
|
}
|
|
|
|
if len(resources) > 0 || state == nil {
|
|
return monitorUnifiedStateViewFromResources(resources, freshness)
|
|
}
|
|
|
|
return monitorUnifiedStateViewFromSnapshot(m.GetState())
|
|
}
|
|
|
|
func (m *Monitor) currentUnifiedResourceFreshness() time.Time {
|
|
if m == nil {
|
|
return time.Time{}
|
|
}
|
|
|
|
m.mu.RLock()
|
|
store := m.resourceStore
|
|
state := m.state
|
|
m.mu.RUnlock()
|
|
return unifiedResourceFreshness(store, state)
|
|
}
|
|
|
|
func unifiedResourceFreshness(store ResourceStoreInterface, state *models.State) time.Time {
|
|
freshness := time.Time{}
|
|
if freshnessStore, ok := store.(UnifiedResourceFreshnessStore); ok {
|
|
freshness = freshnessStore.UnifiedResourceFreshness()
|
|
}
|
|
if freshness.IsZero() && state != nil {
|
|
freshness = state.GetLastUpdate()
|
|
}
|
|
return freshness
|
|
}
|
|
|
|
// UnifiedResourceSnapshot returns a canonical unified-resource seed plus the
|
|
// associated freshness marker. In mock mode it returns the shared mock
|
|
// unified-resource fixture graph rather than the live resource store.
|
|
func (m *Monitor) UnifiedResourceSnapshot() ([]unifiedresources.Resource, time.Time) {
|
|
view := m.currentUnifiedStateView()
|
|
return view.resources, view.freshness
|
|
}
|
|
|
|
// GetUnifiedReadState returns a typed unified read-state provider when the
|
|
// configured resource store supports it.
|
|
func (m *Monitor) GetUnifiedReadState() unifiedresources.ReadState {
|
|
if m == nil {
|
|
return nil
|
|
}
|
|
|
|
m.mu.RLock()
|
|
store := m.resourceStore
|
|
m.mu.RUnlock()
|
|
if store == nil {
|
|
return nil
|
|
}
|
|
|
|
readState, ok := store.(unifiedresources.ReadState)
|
|
if !ok {
|
|
return nil
|
|
}
|
|
return readState
|
|
}
|
|
|
|
// GetUnifiedReadStateOrSnapshot returns unified read-state when available.
|
|
// If the monitor has not been wired with a resource store yet, it creates an
|
|
// ephemeral snapshot-backed adapter to preserve read access without exposing
|
|
// direct state reads to consumer packages.
|
|
func (m *Monitor) GetUnifiedReadStateOrSnapshot() unifiedresources.ReadState {
|
|
return m.currentUnifiedStateView().readState
|
|
}
|
|
|
|
// shouldSkipNodeMetrics returns true if we should skip detailed metric polling
|
|
// for the given node because a host agent is providing richer data.
|
|
// This helps reduce API load when agents are active.
|
|
func (m *Monitor) shouldSkipNodeMetrics(nodeName string) bool {
|
|
m.mu.RLock()
|
|
store := m.resourceStore
|
|
m.mu.RUnlock()
|
|
|
|
if store == nil {
|
|
return false
|
|
}
|
|
|
|
should := store.ShouldSkipAPIPolling(nodeName)
|
|
if should {
|
|
log.Debug().
|
|
Str("node", nodeName).
|
|
Msg("Skipping detailed node metrics - host agent provides data")
|
|
}
|
|
return should
|
|
}
|
|
|
|
// updateResourceStore populates the resource store with data from the current state.
|
|
// This should be called before broadcasting to ensure fresh data.
|
|
func (m *Monitor) updateResourceStore(state models.StateSnapshot) {
|
|
m.mu.RLock()
|
|
store := m.resourceStore
|
|
m.mu.RUnlock()
|
|
|
|
if store == nil {
|
|
log.Debug().Msg("[Resources] No resource store configured, skipping population")
|
|
return
|
|
}
|
|
|
|
log.Debug().
|
|
Int("nodes", len(state.Nodes)).
|
|
Int("vms", len(state.VMs)).
|
|
Int("containers", len(state.Containers)).
|
|
Int("hosts", len(state.Hosts)).
|
|
Int("dockerHosts", len(state.DockerHosts)).
|
|
Msg("[Resources] Populating resource store from state snapshot")
|
|
|
|
snapshotForStore := state
|
|
ownedSources := m.providerOwnedSnapshotSources()
|
|
if len(ownedSources) > 0 {
|
|
snapshotForStore = unifiedresources.SnapshotWithoutSources(state, ownedSources)
|
|
sourceNames := make([]string, 0, len(ownedSources))
|
|
for _, source := range ownedSources {
|
|
sourceNames = append(sourceNames, string(source))
|
|
}
|
|
log.Debug().
|
|
Strs("sources", sourceNames).
|
|
Msg("[Resources] Suppressing legacy snapshot slices for provider-owned sources")
|
|
}
|
|
|
|
recordsBySource := m.collectSupplementalRecordsBySource()
|
|
supplementalChanges := m.collectSupplementalChanges()
|
|
if atomicStore, ok := store.(AtomicSnapshotResourceStore); ok {
|
|
atomicStore.PopulateSnapshotAndSupplemental(snapshotForStore, recordsBySource)
|
|
recordSupplementalResourceChanges(store, supplementalChanges)
|
|
m.syncUnifiedAgentMetrics(store)
|
|
m.syncUnifiedVMMetrics(store)
|
|
m.syncUnifiedStorageMetrics(store)
|
|
m.syncUnifiedPhysicalDiskMetrics(store)
|
|
m.syncUnifiedAppContainerMetrics(store)
|
|
for source, records := range recordsBySource {
|
|
if len(records) == 0 {
|
|
continue
|
|
}
|
|
log.Debug().
|
|
Str("source", string(source)).
|
|
Int("records", len(records)).
|
|
Msg("[Resources] Atomically ingested supplemental records")
|
|
}
|
|
m.syncUnifiedResourceAlertsToState(store.GetAll())
|
|
return
|
|
}
|
|
|
|
store.PopulateFromSnapshot(snapshotForStore)
|
|
|
|
supplementalStore, ok := store.(SupplementalRecordStore)
|
|
if ok {
|
|
for source, records := range recordsBySource {
|
|
if len(records) == 0 {
|
|
continue
|
|
}
|
|
supplementalStore.PopulateSupplementalRecords(source, records)
|
|
log.Debug().
|
|
Str("source", string(source)).
|
|
Int("records", len(records)).
|
|
Msg("[Resources] Ingested supplemental records")
|
|
}
|
|
}
|
|
|
|
recordSupplementalResourceChanges(store, supplementalChanges)
|
|
m.syncUnifiedAgentMetrics(store)
|
|
m.syncUnifiedVMMetrics(store)
|
|
m.syncUnifiedStorageMetrics(store)
|
|
m.syncUnifiedPhysicalDiskMetrics(store)
|
|
m.syncUnifiedAppContainerMetrics(store)
|
|
m.syncUnifiedResourceAlertsToState(store.GetAll())
|
|
}
|
|
|
|
func recordSupplementalResourceChanges(store ResourceStoreInterface, changes []unifiedresources.ResourceChange) {
|
|
if store == nil || len(changes) == 0 {
|
|
return
|
|
}
|
|
|
|
recorder, ok := store.(canonicalResourceChangeRecorder)
|
|
if !ok || recorder == nil {
|
|
return
|
|
}
|
|
|
|
for _, change := range changes {
|
|
if err := recorder.RecordChange(change); err != nil {
|
|
log.Warn().
|
|
Err(err).
|
|
Str("resource_id", change.ResourceID).
|
|
Str("change_id", change.ID).
|
|
Str("kind", string(change.Kind)).
|
|
Msg("failed to record supplemental canonical resource change")
|
|
}
|
|
}
|
|
}
|
|
|
|
func (m *Monitor) syncUnifiedAgentMetrics(store ResourceStoreInterface) {
|
|
if store == nil || (m.metricsHistory == nil && m.metricsStore == nil) {
|
|
return
|
|
}
|
|
|
|
resolver, ok := store.(MetricsTargetResourceStore)
|
|
if !ok {
|
|
return
|
|
}
|
|
|
|
now := time.Now()
|
|
storeWrites := make([]metrics.WriteMetric, 0)
|
|
appendStoreWrite := func(resourceType, resourceID, metricType string, value float64) {
|
|
if m.metricsStore == nil {
|
|
return
|
|
}
|
|
storeWrites = append(storeWrites, metrics.WriteMetric{
|
|
ResourceType: resourceType,
|
|
ResourceID: resourceID,
|
|
MetricType: metricType,
|
|
Value: value,
|
|
Timestamp: now,
|
|
Tier: metrics.TierRaw,
|
|
})
|
|
}
|
|
seenTargets := make(map[string]struct{})
|
|
for _, resource := range store.GetAll() {
|
|
if resource.Type != unifiedresources.ResourceTypeAgent || resource.Metrics == nil {
|
|
continue
|
|
}
|
|
if shouldSkipMockOwnedUnifiedMetricSync(resource) {
|
|
continue
|
|
}
|
|
if monitorHasSource(resource.Sources, unifiedresources.SourceAgent) ||
|
|
monitorHasSource(resource.Sources, unifiedresources.SourceProxmox) ||
|
|
monitorHasSource(resource.Sources, unifiedresources.SourceDocker) {
|
|
continue
|
|
}
|
|
|
|
target := resolver.MetricsTargetForResource(resource.ID)
|
|
if target == nil || target.ResourceType != "agent" || strings.TrimSpace(target.ResourceID) == "" {
|
|
continue
|
|
}
|
|
targetID := strings.TrimSpace(target.ResourceID)
|
|
if _, ok := seenTargets[targetID]; ok {
|
|
continue
|
|
}
|
|
seenTargets[targetID] = struct{}{}
|
|
metricKey := fmt.Sprintf("agent:%s", targetID)
|
|
|
|
if metric := resource.Metrics.CPU; metric != nil {
|
|
value := metric.Percent
|
|
if value == 0 {
|
|
value = metric.Value
|
|
}
|
|
if m.metricsHistory != nil {
|
|
m.metricsHistory.AddGuestMetric(metricKey, "cpu", value, now)
|
|
}
|
|
appendStoreWrite("agent", targetID, "cpu", value)
|
|
}
|
|
|
|
if metric := resource.Metrics.Memory; metric != nil && (metric.Total != nil || metric.Percent > 0 || metric.Used != nil) {
|
|
value := metric.Percent
|
|
if m.metricsHistory != nil {
|
|
m.metricsHistory.AddGuestMetric(metricKey, "memory", value, now)
|
|
}
|
|
appendStoreWrite("agent", targetID, "memory", value)
|
|
}
|
|
|
|
if metric := resource.Metrics.Disk; metric != nil && (metric.Total != nil || metric.Percent > 0 || metric.Used != nil) {
|
|
value := metric.Percent
|
|
if m.metricsHistory != nil {
|
|
m.metricsHistory.AddGuestMetric(metricKey, "disk", value, now)
|
|
}
|
|
appendStoreWrite("agent", targetID, "disk", value)
|
|
}
|
|
|
|
if metric := resource.Metrics.NetIn; metric != nil {
|
|
if m.metricsHistory != nil {
|
|
m.metricsHistory.AddGuestMetric(metricKey, "netin", metric.Value, now)
|
|
}
|
|
appendStoreWrite("agent", targetID, "netin", metric.Value)
|
|
}
|
|
|
|
if metric := resource.Metrics.NetOut; metric != nil {
|
|
if m.metricsHistory != nil {
|
|
m.metricsHistory.AddGuestMetric(metricKey, "netout", metric.Value, now)
|
|
}
|
|
appendStoreWrite("agent", targetID, "netout", metric.Value)
|
|
}
|
|
|
|
if metric := resource.Metrics.DiskRead; metric != nil {
|
|
if m.metricsHistory != nil {
|
|
m.metricsHistory.AddGuestMetric(metricKey, "diskread", metric.Value, now)
|
|
}
|
|
appendStoreWrite("agent", targetID, "diskread", metric.Value)
|
|
}
|
|
|
|
if metric := resource.Metrics.DiskWrite; metric != nil {
|
|
if m.metricsHistory != nil {
|
|
m.metricsHistory.AddGuestMetric(metricKey, "diskwrite", metric.Value, now)
|
|
}
|
|
appendStoreWrite("agent", targetID, "diskwrite", metric.Value)
|
|
}
|
|
}
|
|
if len(storeWrites) > 0 {
|
|
m.metricsStore.WriteBatchSync(storeWrites)
|
|
}
|
|
}
|
|
|
|
func (m *Monitor) syncUnifiedVMMetrics(store ResourceStoreInterface) {
|
|
if store == nil || (m.metricsHistory == nil && m.metricsStore == nil) {
|
|
return
|
|
}
|
|
|
|
resolver, ok := store.(MetricsTargetResourceStore)
|
|
if !ok {
|
|
return
|
|
}
|
|
|
|
now := time.Now()
|
|
storeWrites := make([]metrics.WriteMetric, 0)
|
|
appendStoreWrite := func(resourceType, resourceID, metricType string, value float64) {
|
|
if m.metricsStore == nil {
|
|
return
|
|
}
|
|
storeWrites = append(storeWrites, metrics.WriteMetric{
|
|
ResourceType: resourceType,
|
|
ResourceID: resourceID,
|
|
MetricType: metricType,
|
|
Value: value,
|
|
Timestamp: now,
|
|
Tier: metrics.TierRaw,
|
|
})
|
|
}
|
|
seenTargets := make(map[string]struct{})
|
|
for _, resource := range store.GetAll() {
|
|
if resource.Type != unifiedresources.ResourceTypeVM || resource.Metrics == nil {
|
|
continue
|
|
}
|
|
if shouldSkipMockOwnedUnifiedMetricSync(resource) {
|
|
continue
|
|
}
|
|
|
|
hasNativeVMWriter := false
|
|
for _, source := range resource.Sources {
|
|
if source == unifiedresources.SourceProxmox {
|
|
hasNativeVMWriter = true
|
|
break
|
|
}
|
|
}
|
|
if hasNativeVMWriter {
|
|
continue
|
|
}
|
|
|
|
target := resolver.MetricsTargetForResource(resource.ID)
|
|
if target == nil || target.ResourceType != "vm" || strings.TrimSpace(target.ResourceID) == "" {
|
|
continue
|
|
}
|
|
targetID := strings.TrimSpace(target.ResourceID)
|
|
if _, ok := seenTargets[targetID]; ok {
|
|
continue
|
|
}
|
|
seenTargets[targetID] = struct{}{}
|
|
|
|
if metric := resource.Metrics.CPU; metric != nil {
|
|
value := metric.Percent
|
|
if value == 0 {
|
|
value = metric.Value
|
|
}
|
|
if m.metricsHistory != nil {
|
|
m.metricsHistory.AddGuestMetric(targetID, "cpu", value, now)
|
|
}
|
|
appendStoreWrite("vm", targetID, "cpu", value)
|
|
}
|
|
|
|
if metric := resource.Metrics.Memory; metric != nil && (metric.Total != nil || metric.Percent > 0 || metric.Used != nil) {
|
|
value := metric.Percent
|
|
if m.metricsHistory != nil {
|
|
m.metricsHistory.AddGuestMetric(targetID, "memory", value, now)
|
|
}
|
|
appendStoreWrite("vm", targetID, "memory", value)
|
|
}
|
|
|
|
if metric := resource.Metrics.Disk; metric != nil && (metric.Total != nil || metric.Percent > 0 || metric.Used != nil) {
|
|
value := metric.Percent
|
|
if m.metricsHistory != nil {
|
|
m.metricsHistory.AddGuestMetric(targetID, "disk", value, now)
|
|
}
|
|
appendStoreWrite("vm", targetID, "disk", value)
|
|
}
|
|
|
|
if metric := resource.Metrics.NetIn; metric != nil {
|
|
if m.metricsHistory != nil {
|
|
m.metricsHistory.AddGuestMetric(targetID, "netin", metric.Value, now)
|
|
}
|
|
appendStoreWrite("vm", targetID, "netin", metric.Value)
|
|
}
|
|
|
|
if metric := resource.Metrics.NetOut; metric != nil {
|
|
if m.metricsHistory != nil {
|
|
m.metricsHistory.AddGuestMetric(targetID, "netout", metric.Value, now)
|
|
}
|
|
appendStoreWrite("vm", targetID, "netout", metric.Value)
|
|
}
|
|
|
|
if metric := resource.Metrics.DiskRead; metric != nil {
|
|
if m.metricsHistory != nil {
|
|
m.metricsHistory.AddGuestMetric(targetID, "diskread", metric.Value, now)
|
|
}
|
|
appendStoreWrite("vm", targetID, "diskread", metric.Value)
|
|
}
|
|
|
|
if metric := resource.Metrics.DiskWrite; metric != nil {
|
|
if m.metricsHistory != nil {
|
|
m.metricsHistory.AddGuestMetric(targetID, "diskwrite", metric.Value, now)
|
|
}
|
|
appendStoreWrite("vm", targetID, "diskwrite", metric.Value)
|
|
}
|
|
}
|
|
if len(storeWrites) > 0 {
|
|
m.metricsStore.WriteBatchSync(storeWrites)
|
|
}
|
|
}
|
|
|
|
func (m *Monitor) syncUnifiedStorageMetrics(store ResourceStoreInterface) {
|
|
if store == nil || (m.metricsHistory == nil && m.metricsStore == nil) {
|
|
return
|
|
}
|
|
|
|
resolver, ok := store.(MetricsTargetResourceStore)
|
|
if !ok {
|
|
return
|
|
}
|
|
|
|
now := time.Now()
|
|
storeWrites := make([]metrics.WriteMetric, 0)
|
|
appendStoreWrite := func(resourceType, resourceID, metricType string, value float64) {
|
|
if m.metricsStore == nil {
|
|
return
|
|
}
|
|
storeWrites = append(storeWrites, metrics.WriteMetric{
|
|
ResourceType: resourceType,
|
|
ResourceID: resourceID,
|
|
MetricType: metricType,
|
|
Value: value,
|
|
Timestamp: now,
|
|
Tier: metrics.TierRaw,
|
|
})
|
|
}
|
|
seenTargets := make(map[string]struct{})
|
|
for _, resource := range store.GetAll() {
|
|
if resource.Type != unifiedresources.ResourceTypeStorage || resource.Metrics == nil || resource.Metrics.Disk == nil {
|
|
continue
|
|
}
|
|
if shouldSkipMockOwnedUnifiedMetricSync(resource) {
|
|
continue
|
|
}
|
|
|
|
// Native Proxmox storage already writes to history during the storage poller.
|
|
if resource.Storage != nil && resource.Storage.Platform == "" {
|
|
hasProxmoxSource := false
|
|
for _, source := range resource.Sources {
|
|
if source == unifiedresources.SourceProxmox {
|
|
hasProxmoxSource = true
|
|
break
|
|
}
|
|
}
|
|
if hasProxmoxSource {
|
|
continue
|
|
}
|
|
}
|
|
|
|
target := resolver.MetricsTargetForResource(resource.ID)
|
|
if target == nil || target.ResourceType != "storage" || strings.TrimSpace(target.ResourceID) == "" {
|
|
continue
|
|
}
|
|
targetID := strings.TrimSpace(target.ResourceID)
|
|
if _, ok := seenTargets[targetID]; ok {
|
|
continue
|
|
}
|
|
seenTargets[targetID] = struct{}{}
|
|
|
|
disk := resource.Metrics.Disk
|
|
usage := disk.Percent
|
|
used := int64(0)
|
|
total := int64(0)
|
|
free := int64(0)
|
|
if disk.Used != nil {
|
|
used = *disk.Used
|
|
}
|
|
if disk.Total != nil {
|
|
total = *disk.Total
|
|
}
|
|
if total > 0 {
|
|
free = total - used
|
|
if usage == 0 && used > 0 {
|
|
usage = (float64(used) / float64(total)) * 100
|
|
}
|
|
}
|
|
|
|
if m.metricsHistory != nil {
|
|
m.metricsHistory.AddStorageMetric(targetID, "usage", usage, now)
|
|
if total > 0 {
|
|
m.metricsHistory.AddStorageMetric(targetID, "used", float64(used), now)
|
|
m.metricsHistory.AddStorageMetric(targetID, "total", float64(total), now)
|
|
m.metricsHistory.AddStorageMetric(targetID, "avail", float64(free), now)
|
|
}
|
|
}
|
|
appendStoreWrite("storage", targetID, "usage", usage)
|
|
if total > 0 {
|
|
appendStoreWrite("storage", targetID, "used", float64(used))
|
|
appendStoreWrite("storage", targetID, "total", float64(total))
|
|
appendStoreWrite("storage", targetID, "avail", float64(free))
|
|
}
|
|
}
|
|
if len(storeWrites) > 0 {
|
|
m.metricsStore.WriteBatchSync(storeWrites)
|
|
}
|
|
}
|
|
|
|
func (m *Monitor) syncUnifiedPhysicalDiskMetrics(store ResourceStoreInterface) {
|
|
if store == nil || m.metricsStore == nil {
|
|
return
|
|
}
|
|
|
|
resolver, ok := store.(MetricsTargetResourceStore)
|
|
if !ok {
|
|
return
|
|
}
|
|
|
|
now := time.Now()
|
|
seenTargets := make(map[string]struct{})
|
|
for _, resource := range store.GetAll() {
|
|
if resource.Type != unifiedresources.ResourceTypePhysicalDisk || resource.PhysicalDisk == nil {
|
|
continue
|
|
}
|
|
if shouldSkipMockOwnedUnifiedMetricSync(resource) {
|
|
continue
|
|
}
|
|
|
|
hasNativeWriter := false
|
|
for _, source := range resource.Sources {
|
|
if source == unifiedresources.SourceProxmox || source == unifiedresources.SourceAgent {
|
|
hasNativeWriter = true
|
|
break
|
|
}
|
|
}
|
|
if hasNativeWriter {
|
|
continue
|
|
}
|
|
|
|
target := resolver.MetricsTargetForResource(resource.ID)
|
|
if target == nil || target.ResourceType != "disk" || strings.TrimSpace(target.ResourceID) == "" {
|
|
continue
|
|
}
|
|
targetID := strings.TrimSpace(target.ResourceID)
|
|
if _, ok := seenTargets[targetID]; ok {
|
|
continue
|
|
}
|
|
seenTargets[targetID] = struct{}{}
|
|
|
|
nodeName := ""
|
|
for _, hostname := range resource.Identity.Hostnames {
|
|
if hostname = strings.TrimSpace(hostname); hostname != "" {
|
|
nodeName = hostname
|
|
break
|
|
}
|
|
}
|
|
if nodeName == "" {
|
|
nodeName = firstNonEmptyString(strings.TrimSpace(resource.ParentName), strings.TrimSpace(resource.Name))
|
|
}
|
|
|
|
disk := models.PhysicalDisk{
|
|
ID: resource.ID,
|
|
Node: nodeName,
|
|
DevPath: resource.PhysicalDisk.DevPath,
|
|
Model: resource.PhysicalDisk.Model,
|
|
Serial: resource.PhysicalDisk.Serial,
|
|
WWN: resource.PhysicalDisk.WWN,
|
|
Type: resource.PhysicalDisk.DiskType,
|
|
Size: resource.PhysicalDisk.SizeBytes,
|
|
Health: resource.PhysicalDisk.Health,
|
|
Wearout: resource.PhysicalDisk.Wearout,
|
|
Temperature: resource.PhysicalDisk.Temperature,
|
|
RPM: resource.PhysicalDisk.RPM,
|
|
Used: resource.PhysicalDisk.Used,
|
|
SmartAttributes: smartAttributesFromUnifiedMeta(resource.PhysicalDisk.SMART),
|
|
LastChecked: resource.LastSeen,
|
|
}
|
|
if disk.Serial == "" {
|
|
disk.ID = targetID
|
|
}
|
|
m.writeSMARTMetrics(disk, now)
|
|
}
|
|
}
|
|
|
|
func (m *Monitor) syncUnifiedAppContainerMetrics(store ResourceStoreInterface) {
|
|
if store == nil || (m.metricsHistory == nil && m.metricsStore == nil) {
|
|
return
|
|
}
|
|
|
|
resolver, ok := store.(MetricsTargetResourceStore)
|
|
if !ok {
|
|
return
|
|
}
|
|
|
|
now := time.Now()
|
|
storeWrites := make([]metrics.WriteMetric, 0)
|
|
appendStoreWrite := func(resourceType, resourceID, metricType string, value float64) {
|
|
if m.metricsStore == nil {
|
|
return
|
|
}
|
|
storeWrites = append(storeWrites, metrics.WriteMetric{
|
|
ResourceType: resourceType,
|
|
ResourceID: resourceID,
|
|
MetricType: metricType,
|
|
Value: value,
|
|
Timestamp: now,
|
|
Tier: metrics.TierRaw,
|
|
})
|
|
}
|
|
seenTargets := make(map[string]struct{})
|
|
for _, resource := range store.GetAll() {
|
|
if resource.Type != unifiedresources.ResourceTypeAppContainer || resource.Metrics == nil {
|
|
continue
|
|
}
|
|
if shouldSkipMockOwnedUnifiedMetricSync(resource) {
|
|
continue
|
|
}
|
|
hasDockerSource := false
|
|
for _, source := range resource.Sources {
|
|
if source == unifiedresources.SourceDocker {
|
|
hasDockerSource = true
|
|
break
|
|
}
|
|
}
|
|
if hasDockerSource {
|
|
continue
|
|
}
|
|
|
|
target := resolver.MetricsTargetForResource(resource.ID)
|
|
if target == nil || target.ResourceType != "app-container" || strings.TrimSpace(target.ResourceID) == "" {
|
|
continue
|
|
}
|
|
targetID := strings.TrimSpace(target.ResourceID)
|
|
if _, ok := seenTargets[targetID]; ok {
|
|
continue
|
|
}
|
|
seenTargets[targetID] = struct{}{}
|
|
metricKey := fmt.Sprintf("docker:%s", targetID)
|
|
|
|
if metric := resource.Metrics.CPU; metric != nil {
|
|
value := metric.Percent
|
|
if value == 0 {
|
|
value = metric.Value
|
|
}
|
|
if m.metricsHistory != nil {
|
|
m.metricsHistory.AddGuestMetric(metricKey, "cpu", value, now)
|
|
}
|
|
appendStoreWrite("dockerContainer", targetID, "cpu", value)
|
|
}
|
|
|
|
if metric := resource.Metrics.Memory; metric != nil && (metric.Total != nil || metric.Percent > 0) {
|
|
value := metric.Percent
|
|
if m.metricsHistory != nil {
|
|
m.metricsHistory.AddGuestMetric(metricKey, "memory", value, now)
|
|
}
|
|
appendStoreWrite("dockerContainer", targetID, "memory", value)
|
|
}
|
|
|
|
if metric := resource.Metrics.Disk; metric != nil && (metric.Total != nil || metric.Percent > 0) {
|
|
value := metric.Percent
|
|
if m.metricsHistory != nil {
|
|
m.metricsHistory.AddGuestMetric(metricKey, "disk", value, now)
|
|
}
|
|
appendStoreWrite("dockerContainer", targetID, "disk", value)
|
|
}
|
|
|
|
if metric := resource.Metrics.NetIn; metric != nil {
|
|
if m.metricsHistory != nil {
|
|
m.metricsHistory.AddGuestMetric(metricKey, "netin", metric.Value, now)
|
|
}
|
|
appendStoreWrite("dockerContainer", targetID, "netin", metric.Value)
|
|
}
|
|
|
|
if metric := resource.Metrics.NetOut; metric != nil {
|
|
if m.metricsHistory != nil {
|
|
m.metricsHistory.AddGuestMetric(metricKey, "netout", metric.Value, now)
|
|
}
|
|
appendStoreWrite("dockerContainer", targetID, "netout", metric.Value)
|
|
}
|
|
|
|
if metric := resource.Metrics.DiskRead; metric != nil {
|
|
if m.metricsHistory != nil {
|
|
m.metricsHistory.AddGuestMetric(metricKey, "diskread", metric.Value, now)
|
|
}
|
|
appendStoreWrite("dockerContainer", targetID, "diskread", metric.Value)
|
|
}
|
|
|
|
if metric := resource.Metrics.DiskWrite; metric != nil {
|
|
if m.metricsHistory != nil {
|
|
m.metricsHistory.AddGuestMetric(metricKey, "diskwrite", metric.Value, now)
|
|
}
|
|
appendStoreWrite("dockerContainer", targetID, "diskwrite", metric.Value)
|
|
}
|
|
}
|
|
if len(storeWrites) > 0 {
|
|
m.metricsStore.WriteBatchSync(storeWrites)
|
|
}
|
|
}
|
|
|
|
func shouldSkipMockOwnedUnifiedMetricSync(resource unifiedresources.Resource) bool {
|
|
if !mock.IsMockEnabled() {
|
|
return false
|
|
}
|
|
|
|
// In mock mode the canonical mock sampler owns chart/history continuity for
|
|
// the entire demo estate. Unified-resource sync must not append a second
|
|
// live timeline on top of seeded mock history for any resource class.
|
|
_ = resource
|
|
return true
|
|
}
|
|
|
|
func shouldSkipNativeMockStateMetricWrites() bool {
|
|
return mock.IsMockEnabled()
|
|
}
|
|
|
|
// getUnifiedResourcesForBroadcast retrieves all resources from the store.
|
|
// Returns nil if no resource store is configured.
|
|
func (m *Monitor) getUnifiedResourcesForBroadcast() []unifiedresources.Resource {
|
|
m.mu.RLock()
|
|
store := m.resourceStore
|
|
m.mu.RUnlock()
|
|
|
|
if store == nil {
|
|
log.Debug().Msg("[Resources] No store for broadcast")
|
|
return nil
|
|
}
|
|
|
|
allResources := store.GetAll()
|
|
log.Debug().Int("count", len(allResources)).Msg("[Resources] Got resources for broadcast")
|
|
if len(allResources) == 0 {
|
|
return nil
|
|
}
|
|
return allResources
|
|
}
|
|
|
|
// getResourcesForBroadcast retrieves all resources from the store and converts
|
|
// them to frontend format.
|
|
func (m *Monitor) getResourcesForBroadcast() []models.ResourceFrontend {
|
|
return convertResourcesForBroadcast(m.getUnifiedResourcesForBroadcast())
|
|
}
|
|
|
|
// convertResourcesForBroadcast converts unified resources into the frontend payload shape.
|
|
func convertResourcesForBroadcast(allResources []unifiedresources.Resource) []models.ResourceFrontend {
|
|
if len(allResources) == 0 {
|
|
return []models.ResourceFrontend{}
|
|
}
|
|
type broadcastResource struct {
|
|
input models.ResourceConvertInput
|
|
sortKey string
|
|
resourceID string
|
|
}
|
|
|
|
converted := make([]broadcastResource, 0, len(allResources))
|
|
for _, r := range allResources {
|
|
input := monitorResourceToConvertInput(r)
|
|
sortKey := strings.ToLower(input.DisplayName)
|
|
if sortKey == "" {
|
|
sortKey = strings.ToLower(input.Name)
|
|
}
|
|
converted = append(converted, broadcastResource{
|
|
input: input,
|
|
sortKey: sortKey,
|
|
resourceID: input.ID,
|
|
})
|
|
}
|
|
|
|
sort.Slice(converted, func(i, j int) bool {
|
|
if converted[i].sortKey == converted[j].sortKey {
|
|
return converted[i].resourceID < converted[j].resourceID
|
|
}
|
|
return converted[i].sortKey < converted[j].sortKey
|
|
})
|
|
|
|
result := make([]models.ResourceFrontend, len(converted))
|
|
for i, resource := range converted {
|
|
result[i] = models.ConvertResourceToFrontend(resource.input)
|
|
}
|
|
return result
|
|
}
|
|
|
|
func monitorResourceToConvertInput(resource unifiedresources.Resource) models.ResourceConvertInput {
|
|
unifiedresources.RefreshCanonicalMetadata(&resource)
|
|
resourceType := monitorFrontendResourceType(resource)
|
|
name, displayName := monitorFrontendNames(resource, resourceType)
|
|
platformID := monitorPlatformID(resource, resourceType)
|
|
|
|
input := models.ResourceConvertInput{
|
|
ID: resource.ID,
|
|
Type: resourceType,
|
|
Name: name,
|
|
DisplayName: displayName,
|
|
PlatformID: platformID,
|
|
PlatformType: monitorPlatformType(resource, resourceType),
|
|
SourceType: monitorSourceType(resource.Sources),
|
|
ParentID: monitorStringValue(resource.ParentID),
|
|
ClusterID: monitorClusterID(resource),
|
|
Status: monitorFrontendStatus(resource, resourceType),
|
|
CPU: monitorMetricInput(monitorMetricValue(resource.Metrics, func(metrics *unifiedresources.ResourceMetrics) *unifiedresources.MetricValue { return metrics.CPU })),
|
|
Memory: monitorMetricInput(monitorMetricValue(resource.Metrics, func(metrics *unifiedresources.ResourceMetrics) *unifiedresources.MetricValue { return metrics.Memory })),
|
|
Disk: monitorMetricInput(monitorMetricValue(resource.Metrics, func(metrics *unifiedresources.ResourceMetrics) *unifiedresources.MetricValue { return metrics.Disk })),
|
|
Temperature: monitorTemperature(resource),
|
|
Uptime: monitorUptime(resource),
|
|
Tags: append([]string(nil), resource.Tags...),
|
|
Labels: monitorLabels(resource),
|
|
LastSeenUnix: monitorLastSeenUnix(resource.LastSeen),
|
|
Identity: monitorIdentity(resource, name),
|
|
PlatformData: monitorPlatformData(resource, resourceType, platformID),
|
|
}
|
|
|
|
hasNetwork, rx, tx := monitorNetworkMetricInput(resource.Metrics)
|
|
input.HasNetwork = hasNetwork
|
|
input.NetworkRX = rx
|
|
input.NetworkTX = tx
|
|
|
|
return input
|
|
}
|
|
|
|
func monitorFrontendResourceType(resource unifiedresources.Resource) string {
|
|
return string(unifiedresources.ContractResourceType(resource))
|
|
}
|
|
|
|
func monitorFrontendNames(resource unifiedresources.Resource, resourceType string) (string, string) {
|
|
name := strings.TrimSpace(unifiedresources.ResourceDisplayName(resource))
|
|
if name == "" {
|
|
name = resource.ID
|
|
}
|
|
return name, name
|
|
}
|
|
|
|
func monitorPlatformType(resource unifiedresources.Resource, resourceType string) string {
|
|
if resource.Proxmox != nil {
|
|
return "proxmox-pve"
|
|
}
|
|
if resource.VMware != nil {
|
|
return "vmware-vsphere"
|
|
}
|
|
if resource.TrueNAS != nil {
|
|
return "truenas"
|
|
}
|
|
switch resourceType {
|
|
case "vm", "system-container", "storage", "pool":
|
|
return "proxmox-pve"
|
|
case "docker-host", "app-container":
|
|
return "docker"
|
|
case "k8s-cluster", "k8s-node", "pod", "k8s-deployment":
|
|
return "kubernetes"
|
|
case "pbs":
|
|
return "proxmox-pbs"
|
|
case "pmg":
|
|
return "proxmox-pmg"
|
|
case "agent":
|
|
return "agent"
|
|
default:
|
|
if monitorHasSource(resource.Sources, unifiedresources.SourceK8s) {
|
|
return "kubernetes"
|
|
}
|
|
if monitorHasSource(resource.Sources, unifiedresources.SourceDocker) {
|
|
return "docker"
|
|
}
|
|
if monitorHasSource(resource.Sources, unifiedresources.SourcePBS) {
|
|
return "proxmox-pbs"
|
|
}
|
|
if monitorHasSource(resource.Sources, unifiedresources.SourcePMG) {
|
|
return "proxmox-pmg"
|
|
}
|
|
if monitorHasSource(resource.Sources, unifiedresources.SourceAgent) {
|
|
return "agent"
|
|
}
|
|
if monitorHasSource(resource.Sources, unifiedresources.SourceProxmox) {
|
|
return "proxmox-pve"
|
|
}
|
|
for _, source := range resource.Sources {
|
|
if candidate := strings.TrimSpace(string(source)); candidate != "" {
|
|
return candidate
|
|
}
|
|
}
|
|
return "unknown"
|
|
}
|
|
}
|
|
|
|
func monitorPlatformID(resource unifiedresources.Resource, resourceType string) string {
|
|
switch resourceType {
|
|
case "node", "vm", "system-container":
|
|
if resource.Proxmox != nil && strings.TrimSpace(resource.Proxmox.Instance) != "" {
|
|
return strings.TrimSpace(resource.Proxmox.Instance)
|
|
}
|
|
case "agent":
|
|
if resource.Agent != nil && strings.TrimSpace(resource.Agent.AgentID) != "" {
|
|
return strings.TrimSpace(resource.Agent.AgentID)
|
|
}
|
|
case "docker-host":
|
|
if resource.Docker != nil && strings.TrimSpace(resource.Docker.Hostname) != "" {
|
|
return strings.TrimSpace(resource.Docker.Hostname)
|
|
}
|
|
case "app-container":
|
|
if resource.Docker != nil && strings.TrimSpace(resource.Docker.Hostname) != "" {
|
|
return strings.TrimSpace(resource.Docker.Hostname)
|
|
}
|
|
if resource.ParentID != nil {
|
|
return strings.TrimSpace(*resource.ParentID)
|
|
}
|
|
case "k8s-cluster", "k8s-node", "pod", "k8s-deployment":
|
|
if resource.Kubernetes != nil && strings.TrimSpace(resource.Kubernetes.AgentID) != "" {
|
|
return strings.TrimSpace(resource.Kubernetes.AgentID)
|
|
}
|
|
case "pbs":
|
|
if resource.PBS != nil && strings.TrimSpace(resource.PBS.Hostname) != "" {
|
|
return strings.TrimSpace(resource.PBS.Hostname)
|
|
}
|
|
case "pmg":
|
|
if resource.PMG != nil && strings.TrimSpace(resource.PMG.Hostname) != "" {
|
|
return strings.TrimSpace(resource.PMG.Hostname)
|
|
}
|
|
}
|
|
return resource.ID
|
|
}
|
|
|
|
func monitorFrontendStatus(resource unifiedresources.Resource, resourceType string) string {
|
|
switch resourceType {
|
|
case "app-container":
|
|
switch resource.Status {
|
|
case unifiedresources.StatusOnline:
|
|
return "running"
|
|
case unifiedresources.StatusOffline:
|
|
return "stopped"
|
|
case unifiedresources.StatusWarning:
|
|
return "degraded"
|
|
}
|
|
case "pod":
|
|
if resource.Kubernetes != nil {
|
|
phase := strings.ToLower(strings.TrimSpace(resource.Kubernetes.PodPhase))
|
|
switch phase {
|
|
case "running":
|
|
return "running"
|
|
case "pending", "unknown":
|
|
return "degraded"
|
|
case "succeeded", "failed":
|
|
return "stopped"
|
|
}
|
|
}
|
|
}
|
|
|
|
switch resource.Status {
|
|
case unifiedresources.StatusOnline:
|
|
if monitorIsWorkloadType(resourceType) || resourceType == "pod" {
|
|
return "running"
|
|
}
|
|
return "online"
|
|
case unifiedresources.StatusOffline:
|
|
if monitorIsWorkloadType(resourceType) || resourceType == "pod" {
|
|
return "stopped"
|
|
}
|
|
return "offline"
|
|
case unifiedresources.StatusWarning:
|
|
return "degraded"
|
|
default:
|
|
return "unknown"
|
|
}
|
|
}
|
|
|
|
// monitorIsWorkloadType reports whether resourceType names a workload
// (application container, system container, VM, or OCI container). The match
// is exact and case-sensitive.
func monitorIsWorkloadType(resourceType string) bool {
	return resourceType == "app-container" ||
		resourceType == "system-container" ||
		resourceType == "vm" ||
		resourceType == "oci-container"
}
|
|
|
|
func monitorClusterID(resource unifiedresources.Resource) string {
|
|
if clusterID := strings.TrimSpace(unifiedresources.ResourceClusterName(resource)); clusterID != "" {
|
|
return clusterID
|
|
}
|
|
|
|
if resource.Docker != nil && resource.Docker.Swarm != nil {
|
|
if name := strings.TrimSpace(resource.Docker.Swarm.ClusterName); name != "" {
|
|
return name
|
|
}
|
|
if id := strings.TrimSpace(resource.Docker.Swarm.ClusterID); id != "" {
|
|
return id
|
|
}
|
|
}
|
|
|
|
return ""
|
|
}
|
|
|
|
func monitorMetricInput(metric *unifiedresources.MetricValue) *models.ResourceMetricInput {
|
|
if metric == nil {
|
|
return nil
|
|
}
|
|
|
|
current := metric.Percent
|
|
if current == 0 {
|
|
current = metric.Value
|
|
}
|
|
if metric.Percent != 0 && metric.Value != 0 {
|
|
current = math.Max(metric.Percent, metric.Value)
|
|
}
|
|
|
|
result := &models.ResourceMetricInput{Current: current}
|
|
if metric.Total != nil {
|
|
total := *metric.Total
|
|
result.Total = &total
|
|
}
|
|
if metric.Used != nil {
|
|
used := *metric.Used
|
|
result.Used = &used
|
|
}
|
|
if result.Total != nil && result.Used != nil {
|
|
free := *result.Total - *result.Used
|
|
result.Free = &free
|
|
}
|
|
return result
|
|
}
|
|
|
|
func monitorNetworkMetricInput(metrics *unifiedresources.ResourceMetrics) (bool, int64, int64) {
|
|
if metrics == nil || (metrics.NetIn == nil && metrics.NetOut == nil) {
|
|
return false, 0, 0
|
|
}
|
|
|
|
var rx int64
|
|
var tx int64
|
|
if metrics.NetIn != nil {
|
|
rx = int64(math.Round(metrics.NetIn.Value))
|
|
}
|
|
if metrics.NetOut != nil {
|
|
tx = int64(math.Round(metrics.NetOut.Value))
|
|
}
|
|
return true, rx, tx
|
|
}
|
|
|
|
func monitorTemperature(resource unifiedresources.Resource) *float64 {
|
|
if resource.Agent != nil && resource.Agent.Temperature != nil {
|
|
value := *resource.Agent.Temperature
|
|
return &value
|
|
}
|
|
if resource.Proxmox != nil && resource.Proxmox.Temperature != nil {
|
|
value := *resource.Proxmox.Temperature
|
|
return &value
|
|
}
|
|
if resource.Docker != nil && resource.Docker.Temperature != nil {
|
|
value := *resource.Docker.Temperature
|
|
return &value
|
|
}
|
|
if resource.Kubernetes != nil && resource.Kubernetes.Temperature != nil {
|
|
value := *resource.Kubernetes.Temperature
|
|
return &value
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func monitorUptime(resource unifiedresources.Resource) *int64 {
|
|
if resource.Agent != nil && resource.Agent.UptimeSeconds > 0 {
|
|
value := resource.Agent.UptimeSeconds
|
|
return &value
|
|
}
|
|
if resource.Proxmox != nil && resource.Proxmox.Uptime > 0 {
|
|
value := resource.Proxmox.Uptime
|
|
return &value
|
|
}
|
|
if resource.Docker != nil && resource.Docker.UptimeSeconds > 0 {
|
|
value := resource.Docker.UptimeSeconds
|
|
return &value
|
|
}
|
|
if resource.Kubernetes != nil && resource.Kubernetes.UptimeSeconds > 0 {
|
|
value := resource.Kubernetes.UptimeSeconds
|
|
return &value
|
|
}
|
|
if resource.PBS != nil && resource.PBS.UptimeSeconds > 0 {
|
|
value := resource.PBS.UptimeSeconds
|
|
return &value
|
|
}
|
|
if resource.PMG != nil && resource.PMG.UptimeSeconds > 0 {
|
|
value := resource.PMG.UptimeSeconds
|
|
return &value
|
|
}
|
|
if resource.TrueNAS != nil && resource.TrueNAS.UptimeSeconds > 0 {
|
|
value := resource.TrueNAS.UptimeSeconds
|
|
return &value
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func monitorLabels(resource unifiedresources.Resource) map[string]string {
|
|
if resource.Kubernetes == nil || len(resource.Kubernetes.Labels) == 0 {
|
|
return nil
|
|
}
|
|
labels := make(map[string]string, len(resource.Kubernetes.Labels))
|
|
for key, value := range resource.Kubernetes.Labels {
|
|
labels[key] = value
|
|
}
|
|
return labels
|
|
}
|
|
|
|
func monitorIdentity(resource unifiedresources.Resource, fallbackName string) *models.ResourceIdentityInput {
|
|
hostname := ""
|
|
if resource.Agent != nil {
|
|
hostname = strings.TrimSpace(resource.Agent.Hostname)
|
|
}
|
|
if hostname == "" && resource.Docker != nil {
|
|
hostname = strings.TrimSpace(resource.Docker.Hostname)
|
|
}
|
|
if hostname == "" && resource.Proxmox != nil {
|
|
hostname = strings.TrimSpace(resource.Proxmox.NodeName)
|
|
}
|
|
if hostname == "" {
|
|
for _, candidate := range resource.Identity.Hostnames {
|
|
if trimmed := strings.TrimSpace(candidate); trimmed != "" {
|
|
hostname = trimmed
|
|
break
|
|
}
|
|
}
|
|
}
|
|
if hostname == "" {
|
|
hostname = fallbackName
|
|
}
|
|
|
|
ips := make([]string, 0, len(resource.Identity.IPAddresses))
|
|
for _, ip := range resource.Identity.IPAddresses {
|
|
trimmed := strings.TrimSpace(ip)
|
|
if trimmed == "" {
|
|
continue
|
|
}
|
|
ips = append(ips, trimmed)
|
|
}
|
|
|
|
machineID := strings.TrimSpace(resource.Identity.MachineID)
|
|
if hostname == "" && machineID == "" && len(ips) == 0 {
|
|
return nil
|
|
}
|
|
|
|
return &models.ResourceIdentityInput{
|
|
Hostname: hostname,
|
|
MachineID: machineID,
|
|
IPs: ips,
|
|
}
|
|
}
|
|
|
|
func monitorPlatformData(resource unifiedresources.Resource, resourceType string, platformID string) json.RawMessage {
|
|
var payload interface{}
|
|
|
|
switch resourceType {
|
|
case "node":
|
|
if resource.Proxmox != nil {
|
|
payload = map[string]interface{}{
|
|
"instance": resource.Proxmox.Instance,
|
|
"host": "",
|
|
"guestURL": "",
|
|
"pveVersion": resource.Proxmox.PVEVersion,
|
|
"kernelVersion": resource.Proxmox.KernelVersion,
|
|
"cpuInfo": resource.Proxmox.CPUInfo,
|
|
"loadAverage": []float64{},
|
|
"isClusterMember": resource.Proxmox.ClusterName != "",
|
|
"clusterName": resource.Proxmox.ClusterName,
|
|
"connectionHealth": monitorSourceStatus(resource.SourceStatus, unifiedresources.SourceProxmox),
|
|
}
|
|
}
|
|
case "vm":
|
|
payload = buildProxmoxVMPayload(resource)
|
|
case "system-container", "oci-container":
|
|
payload = buildProxmoxVMPayload(resource)
|
|
case "agent":
|
|
if resource.Proxmox != nil {
|
|
payload = map[string]interface{}{
|
|
"instance": resource.Proxmox.Instance,
|
|
"host": "",
|
|
"guestURL": "",
|
|
"pveVersion": resource.Proxmox.PVEVersion,
|
|
"kernelVersion": resource.Proxmox.KernelVersion,
|
|
"cpuInfo": resource.Proxmox.CPUInfo,
|
|
"loadAverage": []float64{},
|
|
"isClusterMember": resource.Proxmox.ClusterName != "",
|
|
"clusterName": resource.Proxmox.ClusterName,
|
|
"connectionHealth": monitorSourceStatus(resource.SourceStatus, unifiedresources.SourceProxmox),
|
|
}
|
|
} else if resource.Agent != nil {
|
|
payload = map[string]interface{}{
|
|
"platform": resource.Agent.Platform,
|
|
"osName": resource.Agent.OSName,
|
|
"osVersion": resource.Agent.OSVersion,
|
|
"kernelVersion": resource.Agent.KernelVersion,
|
|
"architecture": resource.Agent.Architecture,
|
|
"agentVersion": resource.Agent.AgentVersion,
|
|
"interfaces": resource.Agent.NetworkInterfaces,
|
|
"disks": resource.Agent.Disks,
|
|
"memory": resource.Agent.Memory,
|
|
}
|
|
}
|
|
case "docker-host":
|
|
if resource.Docker != nil {
|
|
payload = map[string]interface{}{
|
|
"agentId": platformID,
|
|
"runtime": resource.Docker.Runtime,
|
|
"runtimeVersion": resource.Docker.RuntimeVersion,
|
|
"dockerVersion": resource.Docker.DockerVersion,
|
|
"os": resource.Docker.OS,
|
|
"kernelVersion": resource.Docker.KernelVersion,
|
|
"architecture": resource.Docker.Architecture,
|
|
"agentVersion": resource.Docker.AgentVersion,
|
|
"swarm": resource.Docker.Swarm,
|
|
"interfaces": resource.Docker.NetworkInterfaces,
|
|
"disks": resource.Docker.Disks,
|
|
}
|
|
}
|
|
case "app-container":
|
|
if resource.Docker != nil {
|
|
payload = map[string]interface{}{
|
|
"hostId": monitorStringValue(resource.ParentID),
|
|
"hostName": resource.Docker.Hostname,
|
|
"image": resource.Docker.Image,
|
|
"state": strings.ToLower(string(resource.Status)),
|
|
"status": strings.ToLower(string(resource.Status)),
|
|
"health": "",
|
|
"createdAt": time.Time{},
|
|
}
|
|
}
|
|
case "k8s-cluster":
|
|
if resource.Kubernetes != nil {
|
|
payload = map[string]interface{}{
|
|
"agentId": resource.Kubernetes.AgentID,
|
|
"server": resource.Kubernetes.Server,
|
|
"context": resource.Kubernetes.Context,
|
|
"version": resource.Kubernetes.Version,
|
|
"customDisplayName": "",
|
|
"hidden": false,
|
|
"pendingUninstall": resource.Kubernetes.PendingUninstall,
|
|
"nodeCount": resource.ChildCount,
|
|
}
|
|
}
|
|
case "k8s-node":
|
|
if resource.Kubernetes != nil {
|
|
payload = map[string]interface{}{
|
|
"clusterId": resource.Kubernetes.ClusterID,
|
|
"ready": resource.Kubernetes.Ready,
|
|
"unschedulable": resource.Kubernetes.Unschedulable,
|
|
"kubeletVersion": resource.Kubernetes.KubeletVersion,
|
|
"containerRuntimeVersion": resource.Kubernetes.ContainerRuntimeVersion,
|
|
"osImage": resource.Kubernetes.OSImage,
|
|
"kernelVersion": resource.Kubernetes.KernelVersion,
|
|
"architecture": resource.Kubernetes.Architecture,
|
|
"capacityCpuCores": resource.Kubernetes.CapacityCPU,
|
|
"capacityMemoryBytes": resource.Kubernetes.CapacityMemoryBytes,
|
|
"capacityPods": resource.Kubernetes.CapacityPods,
|
|
"allocatableCpuCores": resource.Kubernetes.AllocCPU,
|
|
"allocatableMemoryBytes": resource.Kubernetes.AllocMemoryBytes,
|
|
"allocatablePods": resource.Kubernetes.AllocPods,
|
|
"roles": append([]string(nil), resource.Kubernetes.Roles...),
|
|
}
|
|
}
|
|
case "pod":
|
|
if resource.Kubernetes != nil {
|
|
payload = map[string]interface{}{
|
|
"clusterId": resource.Kubernetes.ClusterID,
|
|
"namespace": resource.Kubernetes.Namespace,
|
|
"nodeName": resource.Kubernetes.NodeName,
|
|
"phase": resource.Kubernetes.PodPhase,
|
|
"restarts": resource.Kubernetes.Restarts,
|
|
"ownerKind": resource.Kubernetes.OwnerKind,
|
|
"ownerName": resource.Kubernetes.OwnerName,
|
|
}
|
|
}
|
|
case "k8s-deployment":
|
|
if resource.Kubernetes != nil {
|
|
payload = map[string]interface{}{
|
|
"clusterId": resource.Kubernetes.ClusterID,
|
|
"namespace": resource.Kubernetes.Namespace,
|
|
"desiredReplicas": resource.Kubernetes.DesiredReplicas,
|
|
"updatedReplicas": resource.Kubernetes.UpdatedReplicas,
|
|
"readyReplicas": resource.Kubernetes.ReadyReplicas,
|
|
"availableReplicas": resource.Kubernetes.AvailableReplicas,
|
|
}
|
|
}
|
|
case "pbs":
|
|
if resource.PBS != nil {
|
|
payload = map[string]interface{}{
|
|
"host": resource.PBS.Hostname,
|
|
"version": resource.PBS.Version,
|
|
"connectionHealth": resource.PBS.ConnectionHealth,
|
|
"memoryUsed": monitorMetricUsed(monitorMetricValue(resource.Metrics, func(metrics *unifiedresources.ResourceMetrics) *unifiedresources.MetricValue { return metrics.Memory })),
|
|
"memoryTotal": monitorMetricTotal(monitorMetricValue(resource.Metrics, func(metrics *unifiedresources.ResourceMetrics) *unifiedresources.MetricValue { return metrics.Memory })),
|
|
"numDatastores": resource.PBS.DatastoreCount,
|
|
}
|
|
}
|
|
case "pmg":
|
|
if resource.PMG != nil {
|
|
payload = map[string]interface{}{
|
|
"host": resource.PMG.Hostname,
|
|
"version": resource.PMG.Version,
|
|
"connectionHealth": resource.PMG.ConnectionHealth,
|
|
"nodeCount": resource.PMG.NodeCount,
|
|
"queueActive": resource.PMG.QueueActive,
|
|
"queueDeferred": resource.PMG.QueueDeferred,
|
|
"queueHold": resource.PMG.QueueHold,
|
|
"queueIncoming": resource.PMG.QueueIncoming,
|
|
"queueTotal": resource.PMG.QueueTotal,
|
|
"mailCountTotal": resource.PMG.MailCountTotal,
|
|
"spamIn": resource.PMG.SpamIn,
|
|
"virusIn": resource.PMG.VirusIn,
|
|
"lastUpdated": resource.PMG.LastUpdated,
|
|
}
|
|
}
|
|
case "storage", "pool":
|
|
nodeLabel := resource.ParentName
|
|
if nodeLabel == "" {
|
|
nodeLabel = monitorStringValue(resource.ParentID)
|
|
}
|
|
payload = map[string]interface{}{
|
|
"instance": platformID,
|
|
"node": nodeLabel,
|
|
"type": "",
|
|
"content": "",
|
|
"shared": false,
|
|
"enabled": true,
|
|
"active": resource.Status == unifiedresources.StatusOnline,
|
|
}
|
|
}
|
|
|
|
if payload == nil {
|
|
return nil
|
|
}
|
|
|
|
encoded, err := json.Marshal(payload)
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
return encoded
|
|
}
|
|
|
|
func convertProxmoxDisks(disks []unifiedresources.DiskInfo) []map[string]interface{} {
|
|
if len(disks) == 0 {
|
|
return nil
|
|
}
|
|
|
|
out := make([]map[string]interface{}, 0, len(disks))
|
|
for _, d := range disks {
|
|
usage := float64(0)
|
|
if d.Total > 0 {
|
|
usage = float64(d.Used) / float64(d.Total) * 100
|
|
}
|
|
out = append(out, map[string]interface{}{
|
|
"total": d.Total,
|
|
"used": d.Used,
|
|
"free": d.Free,
|
|
"usage": usage,
|
|
"mountpoint": d.Mountpoint,
|
|
"type": d.Filesystem,
|
|
"device": d.Device,
|
|
})
|
|
}
|
|
return out
|
|
}
|
|
|
|
func buildProxmoxVMPayload(resource unifiedresources.Resource) map[string]interface{} {
|
|
if resource.Proxmox == nil {
|
|
return nil
|
|
}
|
|
return map[string]interface{}{
|
|
"vmid": resource.Proxmox.VMID,
|
|
"node": resource.Proxmox.NodeName,
|
|
"instance": resource.Proxmox.Instance,
|
|
"cpus": resource.Proxmox.CPUs,
|
|
"template": resource.Proxmox.Template,
|
|
"networkIn": monitorMetricInt64(resource.Metrics, func(metrics *unifiedresources.ResourceMetrics) *unifiedresources.MetricValue { return metrics.NetIn }),
|
|
"networkOut": monitorMetricInt64(resource.Metrics, func(metrics *unifiedresources.ResourceMetrics) *unifiedresources.MetricValue { return metrics.NetOut }),
|
|
"diskRead": monitorMetricInt64(resource.Metrics, func(metrics *unifiedresources.ResourceMetrics) *unifiedresources.MetricValue { return metrics.DiskRead }),
|
|
"diskWrite": monitorMetricInt64(resource.Metrics, func(metrics *unifiedresources.ResourceMetrics) *unifiedresources.MetricValue {
|
|
return metrics.DiskWrite
|
|
}),
|
|
"disks": convertProxmoxDisks(resource.Proxmox.Disks),
|
|
"swapUsed": resource.Proxmox.SwapUsed,
|
|
"swapTotal": resource.Proxmox.SwapTotal,
|
|
"balloon": resource.Proxmox.Balloon,
|
|
"lastBackup": resource.Proxmox.LastBackup,
|
|
"ipAddresses": append([]string(nil), resource.Identity.IPAddresses...),
|
|
}
|
|
}
|
|
|
|
func monitorMetricValue(metrics *unifiedresources.ResourceMetrics, pick func(*unifiedresources.ResourceMetrics) *unifiedresources.MetricValue) *unifiedresources.MetricValue {
|
|
if metrics == nil {
|
|
return nil
|
|
}
|
|
return pick(metrics)
|
|
}
|
|
|
|
func monitorMetricInt64(metrics *unifiedresources.ResourceMetrics, pick func(*unifiedresources.ResourceMetrics) *unifiedresources.MetricValue) int64 {
|
|
metric := monitorMetricValue(metrics, pick)
|
|
if metric == nil {
|
|
return 0
|
|
}
|
|
return int64(math.Round(metric.Value))
|
|
}
|
|
|
|
func monitorMetricUsed(metric *unifiedresources.MetricValue) int64 {
|
|
if metric == nil || metric.Used == nil {
|
|
return 0
|
|
}
|
|
return *metric.Used
|
|
}
|
|
|
|
func monitorMetricTotal(metric *unifiedresources.MetricValue) int64 {
|
|
if metric == nil || metric.Total == nil {
|
|
return 0
|
|
}
|
|
return *metric.Total
|
|
}
|
|
|
|
func monitorSourceStatus(statuses map[unifiedresources.DataSource]unifiedresources.SourceStatus, source unifiedresources.DataSource) string {
|
|
if statuses == nil {
|
|
return ""
|
|
}
|
|
status, ok := statuses[source]
|
|
if !ok {
|
|
return ""
|
|
}
|
|
return status.Status
|
|
}
|
|
|
|
func monitorHasSource(sources []unifiedresources.DataSource, source unifiedresources.DataSource) bool {
|
|
for _, candidate := range sources {
|
|
if candidate == source {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func monitorSourceType(sources []unifiedresources.DataSource) string {
|
|
if len(sources) > 1 {
|
|
return "hybrid"
|
|
}
|
|
if len(sources) == 1 {
|
|
switch sources[0] {
|
|
case unifiedresources.SourceAgent, unifiedresources.SourceDocker, unifiedresources.SourceK8s:
|
|
return "agent"
|
|
default:
|
|
return "api"
|
|
}
|
|
}
|
|
return "api"
|
|
}
|
|
|
|
// monitorStringValue dereferences value and trims surrounding whitespace.
// A nil pointer yields the empty string.
func monitorStringValue(value *string) string {
	if value != nil {
		return strings.TrimSpace(*value)
	}
	return ""
}
|
|
|
|
// monitorLastSeenUnix converts a last-seen timestamp to Unix time in
// milliseconds (despite the name, the unit is not seconds). A zero time is
// replaced by the current time.
func monitorLastSeenUnix(value time.Time) int64 {
	seen := value
	if seen.IsZero() {
		// The Unix timestamp is independent of the time's location, so no
		// UTC conversion is required before taking it.
		seen = time.Now()
	}
	return seen.UnixMilli()
}
|
|
|
|
// pollStorageBackupsWithNodes polls backups using a provided nodes list to avoid duplicate GetNodes calls
|
|
// Stop gracefully stops the monitor
|
|
func (m *Monitor) Stop() {
|
|
log.Info().Msg("stopping monitor")
|
|
|
|
// Stop the alert manager to save history
|
|
if m.alertManager != nil {
|
|
m.alertManager.Stop()
|
|
}
|
|
|
|
// Stop notification manager
|
|
if m.notificationMgr != nil {
|
|
m.notificationMgr.Stop()
|
|
}
|
|
|
|
// Close persistent metrics store (flushes buffered data)
|
|
if m.metricsStore != nil {
|
|
if err := m.metricsStore.Close(); err != nil {
|
|
log.Error().Err(err).Msg("failed to close metrics store")
|
|
} else {
|
|
log.Info().Msg("metrics store closed successfully")
|
|
}
|
|
}
|
|
|
|
log.Info().Msg("monitor stopped")
|
|
}
|
|
|
|
// recordAuthFailure records an authentication failure for a node
|