package monitoring

import (
	"context"
	"fmt"
	"net"
	"net/url"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/rcourtman/pulse-go-rewrite/internal/config"
	"github.com/rcourtman/pulse-go-rewrite/internal/logging"
	"github.com/rcourtman/pulse-go-rewrite/internal/models"
	"github.com/rcourtman/pulse-go-rewrite/internal/monitoring/errors"
	"github.com/rcourtman/pulse-go-rewrite/pkg/fsfilters"
	"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
	"github.com/rs/zerolog"
	"github.com/rs/zerolog/log"
)

// sortContent returns a canonical form of a comma-separated Proxmox content
// string by sorting its parts, so semantically equal values compare equal.
func sortContent(content string) string {
	if content == "" {
		return ""
	}
	parts := strings.Split(content, ",")
	sort.Strings(parts)
	return strings.Join(parts, ",")
}
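
// For example, sortContent("rootdir,images,iso") and
// sortContent("iso,images,rootdir") both return "images,iso,rootdir",
// giving a stable key for comparing content lists that Proxmox may report
// in arbitrary order.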

// enrichContainerMetadata fills in status, network, OS, and disk metadata for
// a container by querying the Proxmox API. It is safe to call with a nil
// client, in which case only the root disk entry is ensured.
func (m *Monitor) enrichContainerMetadata(ctx context.Context, client PVEClientInterface, instanceName, nodeName string, container *models.Container) {
	if container == nil {
		return
	}

	ensureContainerRootDiskEntry(container)

	if client == nil {
		return
	}

	isRunning := container.Status == "running"

	var status *proxmox.Container
	if isRunning {
		statusCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
		statusResp, err := client.GetContainerStatus(statusCtx, nodeName, container.VMID)
		cancel()
		if err != nil {
			log.Debug().
				Err(err).
				Str("instance", instanceName).
				Str("node", nodeName).
				Str("container", container.Name).
				Int("vmid", container.VMID).
				Msg("Container status metadata unavailable")
		} else {
			status = statusResp
		}
	}

	rootDeviceHint := ""
	var mountMetadata map[string]containerMountMetadata
	addressSet := make(map[string]struct{})
	addressOrder := make([]string, 0, 4)

	addAddress := func(addr string) {
		addr = strings.TrimSpace(addr)
		if addr == "" {
			return
		}
		if _, exists := addressSet[addr]; exists {
			return
		}
		addressSet[addr] = struct{}{}
		addressOrder = append(addressOrder, addr)
	}

	if status != nil {
		for _, addr := range sanitizeGuestAddressStrings(status.IP) {
			addAddress(addr)
		}
		for _, addr := range sanitizeGuestAddressStrings(status.IP6) {
			addAddress(addr)
		}
		for _, addr := range parseContainerRawIPs(status.IPv4) {
			addAddress(addr)
		}
		for _, addr := range parseContainerRawIPs(status.IPv6) {
			addAddress(addr)
		}
	}

	networkIfaces := make([]models.GuestNetworkInterface, 0, 4)
	if status != nil {
		networkIfaces = make([]models.GuestNetworkInterface, 0, len(status.Network))
		for rawName, cfg := range status.Network {
			if cfg == (proxmox.ContainerNetworkConfig{}) {
				continue
			}

			iface := models.GuestNetworkInterface{}
			name := strings.TrimSpace(cfg.Name)
			if name == "" {
				name = strings.TrimSpace(rawName)
			}
			if name != "" {
				iface.Name = name
			}
			if mac := strings.TrimSpace(cfg.HWAddr); mac != "" {
				iface.MAC = mac
			}

			addrCandidates := make([]string, 0, 4)
			addrCandidates = append(addrCandidates, collectIPsFromInterface(cfg.IP)...)
			addrCandidates = append(addrCandidates, collectIPsFromInterface(cfg.IP6)...)
			addrCandidates = append(addrCandidates, collectIPsFromInterface(cfg.IPv4)...)
			addrCandidates = append(addrCandidates, collectIPsFromInterface(cfg.IPv6)...)

			if len(addrCandidates) > 0 {
				deduped := dedupeStringsPreserveOrder(addrCandidates)
				if len(deduped) > 0 {
					iface.Addresses = deduped
					for _, addr := range deduped {
						addAddress(addr)
					}
				}
			}

			if iface.Name != "" || iface.MAC != "" || len(iface.Addresses) > 0 {
				networkIfaces = append(networkIfaces, iface)
			}
		}
	}

	configCtx, cancelConfig := context.WithTimeout(ctx, 5*time.Second)
	configData, configErr := client.GetContainerConfig(configCtx, nodeName, container.VMID)
	cancelConfig()
	if configErr != nil {
		log.Debug().
			Err(configErr).
			Str("instance", instanceName).
			Str("node", nodeName).
			Str("container", container.Name).
			Int("vmid", container.VMID).
			Msg("Container config metadata unavailable")
	} else if len(configData) > 0 {
		mountMetadata = parseContainerMountMetadata(configData)
		if rootDeviceHint == "" {
			if meta, ok := mountMetadata["rootfs"]; ok && meta.Source != "" {
				rootDeviceHint = meta.Source
			}
		}
		if rootDeviceHint == "" {
			if hint := extractContainerRootDeviceFromConfig(configData); hint != "" {
				rootDeviceHint = hint
			}
		}
		for _, detail := range parseContainerConfigNetworks(configData) {
			if len(detail.Addresses) > 0 {
				for _, addr := range detail.Addresses {
					addAddress(addr)
				}
			}
			mergeContainerNetworkInterface(&networkIfaces, detail)
		}
		// Extract the OS type from the container config.
		if osName := extractContainerOSType(configData); osName != "" {
			container.OSName = osName
		}
		// Detect OCI containers (Proxmox VE 9.1+).
		// Method 1: check ostemplate for OCI registry patterns.
		if osTemplate := extractContainerOSTemplate(configData); osTemplate != "" {
			container.OSTemplate = osTemplate
			if isOCITemplate(osTemplate) {
				container.IsOCI = true
				container.Type = "oci"
				log.Debug().
					Str("container", container.Name).
					Int("vmid", container.VMID).
					Str("osTemplate", osTemplate).
					Msg("Detected OCI container by template")
			}
		}
		// Method 2: check config fields (entrypoint, ostype, cmode).
		// This is needed because Proxmox doesn't persist ostemplate after creation.
		if !container.IsOCI && isOCIContainerByConfig(configData) {
			container.IsOCI = true
			container.Type = "oci"
			log.Debug().
				Str("container", container.Name).
				Int("vmid", container.VMID).
				Msg("Detected OCI container by config (entrypoint/ostype)")
		}
	}

	if len(addressOrder) == 0 {
		if isRunning {
			interfacesCtx, cancelInterfaces := context.WithTimeout(ctx, 5*time.Second)
			ifaceDetails, ifaceErr := client.GetContainerInterfaces(interfacesCtx, nodeName, container.VMID)
			cancelInterfaces()
			if ifaceErr != nil {
				log.Debug().
					Err(ifaceErr).
					Str("instance", instanceName).
					Str("node", nodeName).
					Str("container", container.Name).
					Int("vmid", container.VMID).
					Msg("Container interface metadata unavailable")
			} else if len(ifaceDetails) > 0 {
				for _, detail := range ifaceDetails {
					parsed := containerNetworkDetails{}
					parsed.Name = strings.TrimSpace(detail.Name)
					parsed.MAC = strings.ToUpper(strings.TrimSpace(detail.HWAddr))

					for _, addr := range detail.IPAddresses {
						stripped := strings.TrimSpace(addr.Address)
						if stripped == "" {
							continue
						}
						if slash := strings.Index(stripped, "/"); slash > 0 {
							stripped = stripped[:slash]
						}
						parsed.Addresses = append(parsed.Addresses, sanitizeGuestAddressStrings(stripped)...)
					}

					if len(parsed.Addresses) == 0 && strings.TrimSpace(detail.Inet) != "" {
						parts := strings.Fields(detail.Inet)
						for _, part := range parts {
							stripped := strings.TrimSpace(part)
							if stripped == "" {
								continue
							}
							if slash := strings.Index(stripped, "/"); slash > 0 {
								stripped = stripped[:slash]
							}
							parsed.Addresses = append(parsed.Addresses, sanitizeGuestAddressStrings(stripped)...)
						}
					}

					parsed.Addresses = dedupeStringsPreserveOrder(parsed.Addresses)

					if len(parsed.Addresses) > 0 {
						for _, addr := range parsed.Addresses {
							addAddress(addr)
						}
					}

					if parsed.Name != "" || parsed.MAC != "" || len(parsed.Addresses) > 0 {
						mergeContainerNetworkInterface(&networkIfaces, parsed)
					}
				}
			}
		}
	}

	if len(networkIfaces) > 1 {
		sort.SliceStable(networkIfaces, func(i, j int) bool {
			left := strings.TrimSpace(networkIfaces[i].Name)
			right := strings.TrimSpace(networkIfaces[j].Name)
			return left < right
		})
	}

	if len(addressOrder) > 1 {
		sort.Strings(addressOrder)
	}

	if len(addressOrder) > 0 {
		container.IPAddresses = addressOrder
	}

	if len(networkIfaces) > 0 {
		container.NetworkInterfaces = networkIfaces
	}

	if disks := convertContainerDiskInfo(status, mountMetadata); len(disks) > 0 {
		container.Disks = disks
	}

	ensureContainerRootDiskEntry(container)

	if rootDeviceHint != "" && len(container.Disks) > 0 {
		for i := range container.Disks {
			if container.Disks[i].Mountpoint == "/" && container.Disks[i].Device == "" {
				container.Disks[i].Device = rootDeviceHint
			}
		}
	}
}
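
// Addresses above are gathered from up to three sources in priority order:
// the live status response, the container config, and (only when both yield
// nothing) the interfaces endpoint. Each address is deduplicated on first
// sight and the final list is sorted for stable output.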

// shouldTryPortlessFallback reports whether err looks like a connection-level
// failure (refused, reset, DNS, or timeout) that warrants retrying the host
// without the default PVE port.
func shouldTryPortlessFallback(err error) bool {
	if err == nil {
		return false
	}
	msg := strings.ToLower(err.Error())
	if strings.Contains(msg, "connection refused") ||
		strings.Contains(msg, "connection reset") ||
		strings.Contains(msg, "no such host") ||
		strings.Contains(msg, "client.timeout exceeded") ||
		strings.Contains(msg, "i/o timeout") ||
		strings.Contains(msg, "context deadline exceeded") {
		return true
	}
	return false
}
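
// For example, a dial failure such as "dial tcp 192.0.2.10:8006: connect:
// connection refused" qualifies for the fallback, while an HTTP 401/403
// authentication error matches none of the substrings above, so auth
// problems never trigger a portless retry.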

// retryPVEPortFallback handles the case where a normalized :8006 host is unreachable
// because the actual endpoint is fronted by a reverse proxy on 443. If the initial
// GetNodes call fails with a connection error and the host has the default PVE port,
// retry without the default port to hit the proxy. On success, swap the client so
// subsequent polls reuse the working endpoint.
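//
// For example (hypothetical host), "https://pve.example.com:8006" is retried
// as "https://pve.example.com"; on success the host is persisted with an
// explicit port, "https://pve.example.com:443" (or :80 for plain http), so
// reloads do not re-normalize it back to :8006.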
func (m *Monitor) retryPVEPortFallback(ctx context.Context, instanceName string, instanceCfg *config.PVEInstance, currentClient PVEClientInterface, cause error) ([]proxmox.Node, PVEClientInterface, error) {
	if instanceCfg == nil || !shouldTryPortlessFallback(cause) {
		return nil, currentClient, cause
	}

	fallbackHost := config.StripDefaultPort(instanceCfg.Host, config.DefaultPVEPort)
	if fallbackHost == "" || fallbackHost == instanceCfg.Host {
		return nil, currentClient, cause
	}

	clientCfg := config.CreateProxmoxConfigWithHost(instanceCfg, fallbackHost, false)
	if clientCfg.Timeout <= 0 {
		m.mu.RLock()
		if m.config != nil {
			clientCfg.Timeout = m.config.ConnectionTimeout
		}
		m.mu.RUnlock()
	}

	fallbackClient, err := newProxmoxClientFunc(clientCfg)
	if err != nil {
		return nil, currentClient, cause
	}

	fallbackNodes, err := fallbackClient.GetNodes(ctx)
	if err != nil {
		return nil, currentClient, cause
	}

	// Switch to the working host for the remainder of the poll (and future polls).
	primaryHost := instanceCfg.Host

	// Persist with an explicit port to avoid re-normalization back to :8006 on reloads.
	persistHost := fallbackHost
	if parsed, err := url.Parse(fallbackHost); err == nil && parsed.Host != "" && parsed.Port() == "" {
		port := "443"
		if strings.EqualFold(parsed.Scheme, "http") {
			port = "80"
		}
		parsed.Host = net.JoinHostPort(parsed.Hostname(), port)
		persistHost = parsed.Scheme + "://" + parsed.Host
	}

	var persistence *config.ConfigPersistence
	var pveInstances []config.PVEInstance
	var pbsInstances []config.PBSInstance
	var pmgInstances []config.PMGInstance

	m.mu.Lock()
	instanceCfg.Host = persistHost
	m.pveClients[instanceName] = fallbackClient

	// Update the in-memory config so subsequent polls build clients against the working port.
	if m.config != nil {
		for i := range m.config.PVEInstances {
			if m.config.PVEInstances[i].Name == instanceName {
				m.config.PVEInstances[i].Host = persistHost
				break
			}
		}

		pveInstances = append(pveInstances, m.config.PVEInstances...)
		pbsInstances = append(pbsInstances, m.config.PBSInstances...)
		pmgInstances = append(pmgInstances, m.config.PMGInstances...)
	}
	persistence = m.persistence
	m.mu.Unlock()

	// Persist to disk so restarts keep the working endpoint.
	if persistence != nil {
		if err := persistence.SaveNodesConfig(pveInstances, pbsInstances, pmgInstances); err != nil {
			log.Warn().Err(err).Str("instance", instanceName).Msg("Failed to persist fallback PVE host")
		}
	}

	log.Warn().
		Str("instance", instanceName).
		Str("primary", primaryHost).
		Str("fallback", persistHost).
		Msg("Primary PVE host failed; using fallback without default port")

	return fallbackNodes, fallbackClient, nil
}

// fetchPVENodes lists the instance's nodes, retrying via the portless
// fallback on connection errors and tracking auth failures on
// authentication errors.
func (m *Monitor) fetchPVENodes(ctx context.Context, instanceName string, instanceCfg *config.PVEInstance, client PVEClientInterface) ([]proxmox.Node, PVEClientInterface, error) {
	nodes, err := client.GetNodes(ctx)
	if err != nil {
		if fallbackNodes, fallbackClient, fallbackErr := m.retryPVEPortFallback(ctx, instanceName, instanceCfg, client, err); fallbackErr == nil {
			return fallbackNodes, fallbackClient, nil
		}

		monErr := errors.WrapConnectionError("poll_nodes", instanceName, err)
		log.Error().Err(monErr).Str("instance", instanceName).Msg("failed to get nodes")
		m.setProviderConnectionHealth(InstanceTypePVE, instanceName, false)

		// Track the auth failure if it's an authentication error.
		if errors.IsAuthError(err) {
			m.recordAuthFailure(instanceName, "pve")
		}
		return nil, client, monErr
	}

	// Reset auth failures on a successful connection.
	m.resetAuthFailures(instanceName, "pve")
	return nodes, client, nil
}

// updatePVEConnectionHealth classifies instance connectivity as "healthy",
// "degraded", or "error" and records it with the provider health tracker.
func (m *Monitor) updatePVEConnectionHealth(ctx context.Context, instanceName string, client PVEClientInterface) string {
	connectionHealthStr := "healthy"
	if clusterClient, ok := client.(*proxmox.ClusterClient); ok {
		// For cluster clients, check whether all endpoints are healthy.
		healthStatus := clusterClient.GetHealthStatus()
		healthyCount := 0
		totalCount := len(healthStatus)

		for _, isHealthy := range healthStatus {
			if isHealthy {
				healthyCount++
			}
		}

		if healthyCount == 0 {
			// All endpoints are down.
			connectionHealthStr = "error"
			m.setProviderConnectionHealth(InstanceTypePVE, instanceName, false)
		} else if healthyCount < totalCount {
			// Some endpoints are down - check whether the cluster still has quorum.
			// A cluster with quorum is healthy even if some nodes are intentionally offline
			// (e.g., backup nodes not running). Only mark as degraded if there is no quorum.
			isQuorate, err := clusterClient.IsQuorate(ctx)
			if err != nil {
				// Couldn't check quorum - log but continue (assume healthy if we have connectivity).
				log.Debug().
					Str("instance", instanceName).
					Err(err).
					Msg("Could not check cluster quorum status")
				isQuorate = true // Assume healthy if we can't check.
			}

			if isQuorate {
				// The cluster has quorum - healthy even with some nodes offline.
				connectionHealthStr = "healthy"
				m.setProviderConnectionHealth(InstanceTypePVE, instanceName, true)
				log.Debug().
					Str("instance", instanceName).
					Int("healthy", healthyCount).
					Int("total", totalCount).
					Msg("Cluster has quorum - some API endpoints unreachable but cluster is healthy")
			} else {
				// The cluster lost quorum - this is genuinely degraded.
				connectionHealthStr = "degraded"
				m.setProviderConnectionHealth(InstanceTypePVE, instanceName, true) // Still functional but degraded.
				log.Warn().
					Str("instance", instanceName).
					Int("healthy", healthyCount).
					Int("total", totalCount).
					Msg("Cluster lost quorum - degraded state")
			}
		} else {
			// All endpoints are healthy.
			connectionHealthStr = "healthy"
			m.setProviderConnectionHealth(InstanceTypePVE, instanceName, true)
		}
	} else {
		// Regular client - simple healthy/unhealthy.
		m.setProviderConnectionHealth(InstanceTypePVE, instanceName, true)
	}
	return connectionHealthStr
}
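
// The resulting classification: all endpoints reachable => "healthy"; some
// endpoints down but the cluster is quorate (or quorum cannot be checked)
// => "healthy"; quorum lost => "degraded"; every endpoint down => "error".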

// snapshotPrevNodes captures the previous poll's node memory metrics and node
// models so a failed detail fetch can fall back to the last known values.
func (m *Monitor) snapshotPrevNodes(instanceName string) (map[string]models.Memory, []models.Node) {
	return m.previousNodesForInstance(instanceName)
}

// pollPVENodesParallel polls every node concurrently and collects the node
// models plus per-node effective-status and disk-source maps.
func (m *Monitor) pollPVENodesParallel(
	ctx context.Context,
	instanceName string,
	instanceCfg *config.PVEInstance,
	client PVEClientInterface,
	nodes []proxmox.Node,
	connectionHealthStr string,
	prevNodeMemory map[string]models.Memory,
	prevInstanceNodes []models.Node,
	debugEnabled bool,
) ([]models.Node, map[string]string, map[string]string) {
	var modelNodes []models.Node
	nodeEffectiveStatus := make(map[string]string)
	nodeDiskSources := make(map[string]string)

	type nodePollResult struct {
		node            models.Node
		effectiveStatus string
		diskSource      string
	}

	// Buffered to len(nodes) so every worker can send without blocking,
	// which makes it safe to Wait before closing and draining the channel.
	resultChan := make(chan nodePollResult, len(nodes))
	var wg sync.WaitGroup

	if debugEnabled {
		log.Debug().
			Str("instance", instanceName).
			Int("nodes", len(nodes)).
			Msg("Starting parallel node polling")
	}

	for _, node := range nodes {
		wg.Add(1)
		go func(node proxmox.Node) {
			defer wg.Done()

			modelNode, effectiveStatus, diskSource, _ := m.pollPVENode(ctx, instanceName, instanceCfg, client, node, connectionHealthStr, prevNodeMemory, prevInstanceNodes)

			resultChan <- nodePollResult{
				node:            modelNode,
				effectiveStatus: effectiveStatus,
				diskSource:      diskSource,
			}
		}(node)
	}

	wg.Wait()
	close(resultChan)

	for res := range resultChan {
		modelNodes = append(modelNodes, res.node)
		nodeEffectiveStatus[res.node.Name] = res.effectiveStatus
		nodeDiskSources[res.node.Name] = res.diskSource
	}

	return modelNodes, nodeEffectiveStatus, nodeDiskSources
}

// preserveNodesWhenEmpty keeps the previous node list when a poll cycle
// returns no nodes, applying the offline grace period before flipping
// recently seen nodes to offline.
func (m *Monitor) preserveNodesWhenEmpty(instanceName string, modelNodes []models.Node, prevInstanceNodes []models.Node) []models.Node {
	if len(modelNodes) > 0 || len(prevInstanceNodes) == 0 {
		return modelNodes
	}

	log.Warn().
		Str("instance", instanceName).
		Int("previousCount", len(prevInstanceNodes)).
		Msg("No Proxmox nodes returned this cycle - preserving previous state")

	// Mark connection health as degraded to reflect the polling failure.
	m.setProviderConnectionHealth(InstanceTypePVE, instanceName, false)

	preserved := make([]models.Node, 0, len(prevInstanceNodes))
	now := time.Now()
	for _, prevNode := range prevInstanceNodes {
		nodeCopy := prevNode

		// Keep recently seen nodes online during transient GetNodes gaps.
		// This mirrors the node grace behavior used in regular node polling.
		lastSeen := prevNode.LastSeen
		if lastSeen.IsZero() {
			m.mu.Lock()
			if lastOnline, ok := m.nodeLastOnline[prevNode.ID]; ok {
				lastSeen = lastOnline
			}
			m.mu.Unlock()
		}

		withinGrace := !lastSeen.IsZero() && now.Sub(lastSeen) < nodeOfflineGracePeriod
		if withinGrace {
			if strings.TrimSpace(nodeCopy.Status) == "" || strings.EqualFold(nodeCopy.Status, "offline") {
				nodeCopy.Status = "online"
			}
			if nodeCopy.ConnectionHealth == "" || strings.EqualFold(nodeCopy.ConnectionHealth, "error") {
				nodeCopy.ConnectionHealth = "degraded"
			}
			preserved = append(preserved, nodeCopy)
			continue
		}

		nodeCopy.Status = "offline"
		nodeCopy.ConnectionHealth = "error"
		nodeCopy.Uptime = 0
		nodeCopy.CPU = 0
		preserved = append(preserved, nodeCopy)
	}
	return preserved
}
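
// As an illustration (the actual nodeOfflineGracePeriod constant is defined
// elsewhere in this package): a node last seen within the grace window keeps
// an "online" status with "degraded" connection health, while one outside it
// is marked "offline"/"error" with its uptime and CPU zeroed.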

// seedNodeDisplayNames pre-populates the alert manager's display-name cache
// so alerts can resolve friendly node names before CheckNode() runs.
func (m *Monitor) seedNodeDisplayNames(modelNodes []models.Node) {
	for i := range modelNodes {
		if modelNodes[i].DisplayName != "" {
			m.alertManager.UpdateNodeDisplayName(modelNodes[i].Name, modelNodes[i].DisplayName)
		}
	}
}

// pollGuestsWithFallback polls VMs and containers, preferring the efficient
// cluster/resources endpoint and falling back to per-node polling when that
// endpoint is unavailable.
func (m *Monitor) pollGuestsWithFallback(
	ctx context.Context,
	instanceName string,
	instanceCfg *config.PVEInstance,
	client PVEClientInterface,
	nodes []proxmox.Node,
	nodeEffectiveStatus map[string]string,
) error {
	if !instanceCfg.MonitorVMs && !instanceCfg.MonitorContainers {
		return nil
	}

	select {
	case <-ctx.Done():
		return ctx.Err()
	default:
	}

	// Always try the efficient cluster/resources endpoint first.
	// This endpoint works on both clustered and standalone nodes;
	// testing confirmed it works on standalone nodes like pimox.
	useClusterEndpoint := m.pollVMsAndContainersEfficient(ctx, instanceName, instanceCfg.ClusterName, instanceCfg.IsCluster, client, nodeEffectiveStatus)

	if !useClusterEndpoint {
		// Fall back to traditional polling only if cluster/resources is not available.
		// This should be rare - only for very old Proxmox versions.
		log.Debug().
			Str("instance", instanceName).
			Msg("cluster/resources endpoint not available, using traditional polling")

		// Check whether the configuration needs updating.
		if instanceCfg.IsCluster {
			isActuallyCluster, checkErr := client.IsClusterMember(ctx)
			if checkErr == nil && !isActuallyCluster {
				log.Warn().
					Str("instance", instanceName).
					Msg("Instance marked as cluster but is actually standalone - consider updating configuration")
				instanceCfg.IsCluster = false
			}
		}

		// Use optimized parallel polling for better performance.
		if instanceCfg.MonitorVMs {
			m.pollVMsWithNodes(ctx, instanceName, instanceCfg.ClusterName, instanceCfg.IsCluster, client, nodes, nodeEffectiveStatus)
		}
		if instanceCfg.MonitorContainers {
			m.pollContainersWithNodes(ctx, instanceName, instanceCfg.ClusterName, instanceCfg.IsCluster, client, nodes, nodeEffectiveStatus)
		}
	}

	return nil
}

// maybePollPhysicalDisksAsync polls physical disks for health monitoring
// (enabled by default unless explicitly disabled). Polling runs in a
// background goroutine since GetDisks can be slow and we don't want it to
// cause task timeouts; it also has its own polling interval (5 minutes by
// default).
func (m *Monitor) maybePollPhysicalDisksAsync(
	ctx context.Context,
	instanceName string,
	instanceCfg *config.PVEInstance,
	client PVEClientInterface,
	nodes []proxmox.Node,
	nodeEffectiveStatus map[string]string,
	modelNodes []models.Node,
) {
	// Skip if MonitorPhysicalDisks is explicitly set to false.
	if instanceCfg.MonitorPhysicalDisks != nil && !*instanceCfg.MonitorPhysicalDisks {
		log.Debug().Str("instance", instanceName).Msg("physical disk monitoring explicitly disabled")
		// Keep any existing disk data visible (don't clear it).
		return
	}

	// Enabled by default (when nil or true).
	// Determine the polling interval (default 5 minutes to avoid spinning up HDDs too frequently).
	pollingInterval := 5 * time.Minute
	if instanceCfg.PhysicalDiskPollingMinutes > 0 {
		pollingInterval = time.Duration(instanceCfg.PhysicalDiskPollingMinutes) * time.Minute
	}

	// Check whether enough time has elapsed since the last poll.
	m.mu.Lock()
	lastPoll, exists := m.lastPhysicalDiskPoll[instanceName]
	shouldPoll := !exists || time.Since(lastPoll) >= pollingInterval
	if shouldPoll {
		m.lastPhysicalDiskPoll[instanceName] = time.Now()
	}
	m.mu.Unlock()

	if !shouldPoll {
		log.Debug().
			Str("instance", instanceName).
			Dur("sinceLastPoll", time.Since(lastPoll)).
			Dur("interval", pollingInterval).
			Msg("Skipping physical disk poll - interval not elapsed")
		// Refresh NVMe temperatures using the latest sensor data even when we skip the disk poll.
		readState := m.GetUnifiedReadStateOrSnapshot()
		existing := physicalDisksForInstanceFromReadState(readState, instanceName)
		if len(existing) > 0 {
			nodes := nodesForInstanceFromReadState(readState, instanceName)
			hosts := hostsFromReadState(readState)
			updated := mergeNVMeTempsIntoDisks(existing, nodes)
			updated = mergeHostAgentSMARTIntoDisks(updated, nodes, hosts)
			m.state.UpdatePhysicalDisks(instanceName, updated)
		}
		return
	}

	// Run physical disk polling in the background to avoid blocking the main task.
	go func(inst string, pveClient PVEClientInterface, nodeList []proxmox.Node, nodeStatus map[string]string, modelNodesCopy []models.Node) {
		defer recoverFromPanic(fmt.Sprintf("pollPhysicalDisks-%s", inst))

		// Use a generous timeout for disk polling.
		diskTimeout := 60 * time.Second
		// Use the monitor lifecycle context so shutdown can interrupt detached async polling.
		parentCtx := m.runtimeCtx
		if parentCtx == nil {
			parentCtx = ctx
		}
		if parentCtx == nil {
			parentCtx = context.Background()
		}
		diskCtx, diskCancel := context.WithTimeout(parentCtx, diskTimeout)
		defer diskCancel()

		log.Debug().
			Int("nodeCount", len(nodeList)).
			Dur("interval", pollingInterval).
			Msg("Starting disk health polling")

		// Get existing disks from state to preserve data for offline nodes.
		readState := m.GetUnifiedReadStateOrSnapshot()
		nodesFromState := nodesForInstanceFromReadState(readState, inst)
		hosts := hostsFromReadState(readState)
		existingDisksMap := make(map[string]models.PhysicalDisk)
		for _, disk := range physicalDisksForInstanceFromReadState(readState, inst) {
			if disk.Instance == inst {
				existingDisksMap[disk.ID] = disk
			}
		}

		// Build a lookup map of node name → disk exclusion patterns by
		// cross-referencing linked host agents. This lets --disk-exclude on the
		// agent suppress server-side Proxmox disk health/wearout alerts.
		diskExcludeByNode := make(map[string][]string)
		hostByID := make(map[string]models.Host, len(hosts))
		for _, h := range hosts {
			hostByID[h.ID] = h
		}
		for _, n := range nodesFromState {
			if n.LinkedAgentID == "" || n.Instance != inst {
				continue
			}
			if linkedHost, ok := hostByID[n.LinkedAgentID]; ok && len(linkedHost.DiskExclude) > 0 {
				diskExcludeByNode[n.Name] = linkedHost.DiskExclude
			}
		}

		var allDisks []models.PhysicalDisk
		polledNodes := make(map[string]bool) // Track which nodes we successfully polled.

		for _, node := range nodeList {
			// Check whether the context timed out.
			select {
			case <-diskCtx.Done():
				log.Debug().
					Str("instance", inst).
					Msg("Physical disk polling timed out - preserving existing data")
				return
			default:
			}

			// Skip offline nodes but preserve their existing disk data.
			if nodeStatus[node.Node] != "online" {
				log.Debug().Str("node", node.Node).Msg("skipping disk poll for offline node - preserving existing data")
				continue
			}

			// Get the disk list for this node.
			log.Debug().Str("node", node.Node).Msg("getting disk list for node")
			disks, err := pveClient.GetDisks(diskCtx, node.Node)
			if err != nil {
				// Check whether it's a permission error or the endpoint doesn't exist.
				errStr := err.Error()
				if strings.Contains(errStr, "401") || strings.Contains(errStr, "403") {
					log.Warn().
						Str("node", node.Node).
						Err(err).
						Msg("Insufficient permissions to access disk information - check API token permissions")
				} else if strings.Contains(errStr, "404") || strings.Contains(errStr, "501") {
					log.Info().
						Str("node", node.Node).
						Msg("Disk monitoring not available on this node (may be using non-standard storage)")
				} else {
					log.Warn().
						Str("node", node.Node).
						Err(err).
						Msg("Failed to get disk list")
				}
				continue
			}

			// Mark this node as successfully polled.
			polledNodes[node.Node] = true

			// Record each disk; alert evaluation happens after host-agent SMART merges
			// so the canonical disk view includes post-merge health/wearout data.
			for _, disk := range disks {
				diskID := fmt.Sprintf("%s-%s-%s", inst, node.Node, strings.ReplaceAll(disk.DevPath, "/", "-"))
				physicalDisk := models.PhysicalDisk{
					ID:          diskID,
					Node:        node.Node,
					Instance:    inst,
					DevPath:     disk.DevPath,
					Model:       disk.Model,
					Serial:      disk.Serial,
					WWN:         disk.WWN,
					Type:        disk.Type,
					Size:        disk.Size,
					Health:      disk.Health,
					Wearout:     disk.Wearout,
					RPM:         disk.RPM,
					Used:        disk.Used,
					LastChecked: time.Now(),
				}

				allDisks = append(allDisks, physicalDisk)
			}
		}

		// Preserve existing disk data for nodes that weren't polled (offline or error).
		for _, existingDisk := range existingDisksMap {
			// Only preserve if we didn't poll this node.
			if !polledNodes[existingDisk.Node] {
				// Keep the existing disk data; LastChecked is left unchanged, which marks it as stale.
				allDisks = append(allDisks, existingDisk)
				log.Debug().
					Str("node", existingDisk.Node).
					Str("disk", existingDisk.DevPath).
					Msg("Preserving existing disk data for unpolled node")
			}
		}

		allDisks = mergeNVMeTempsIntoDisks(allDisks, nodesFromState)
		allDisks = mergeHostAgentSMARTIntoDisks(allDisks, nodesFromState, hosts)
		for _, disk := range allDisks {
			if !polledNodes[disk.Node] {
				continue
			}

			log.Debug().
				Str("node", disk.Node).
				Str("disk", disk.DevPath).
				Str("model", disk.Model).
				Str("health", disk.Health).
				Int("wearout", disk.Wearout).
				Msg("Checking disk health")

			if excludePatterns, ok := diskExcludeByNode[disk.Node]; ok && fsfilters.MatchesDeviceExclude(disk.DevPath, excludePatterns) {
				healthyDisk := proxmoxDiskFromPhysicalDisk(disk)
				healthyDisk.Health = "PASSED"
				healthyDisk.Wearout = 100
				m.alertManager.CheckDiskHealth(inst, disk.Node, healthyDisk)
				continue
			}

			m.alertManager.CheckDiskHealth(inst, disk.Node, proxmoxDiskFromPhysicalDisk(disk))
		}

		// Write SMART metrics to the persistent store.
		if m.metricsStore != nil {
			now := time.Now()
			for _, disk := range allDisks {
				m.writeSMARTMetrics(disk, now)
			}
		}

		// Update physical disks in state.
		log.Debug().
			Str("instance", inst).
			Int("diskCount", len(allDisks)).
			Int("preservedCount", len(existingDisksMap)-len(polledNodes)).
			Msg("Updating physical disks in state")
		m.state.UpdatePhysicalDisks(inst, allDisks)
	}(instanceName, client, nodes, nodeEffectiveStatus, modelNodes)
}
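
// For example, setting PhysicalDiskPollingMinutes to 10 limits an instance to
// at most one disk poll every ten minutes; a zero or negative value falls
// back to the 5-minute default, which avoids spinning up idle HDDs too often.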

// proxmoxDiskFromPhysicalDisk converts the stored disk model back into the
// API shape expected by the alert manager's disk health checks.
func proxmoxDiskFromPhysicalDisk(disk models.PhysicalDisk) proxmox.Disk {
	return proxmox.Disk{
		DevPath: disk.DevPath,
		Model:   disk.Model,
		Serial:  disk.Serial,
		Type:    disk.Type,
		Health:  disk.Health,
		Wearout: disk.Wearout,
		Size:    disk.Size,
		RPM:     disk.RPM,
		Used:    disk.Used,
		WWN:     disk.WWN,
	}
}

// pollPVEInstance polls a single PVE instance.
func (m *Monitor) pollPVEInstance(ctx context.Context, instanceName string, client PVEClientInterface) {
	defer recoverFromPanic(fmt.Sprintf("pollPVEInstance-%s", instanceName))

	start := time.Now()
	debugEnabled := logging.IsLevelEnabled(zerolog.DebugLevel)
	var pollErr error
	if m.pollMetrics != nil {
		m.pollMetrics.IncInFlight("pve")
		defer m.pollMetrics.DecInFlight("pve")
		defer func() {
			m.pollMetrics.RecordResult(PollResult{
				InstanceName: instanceName,
				InstanceType: "pve",
				Success:      pollErr == nil,
				Error:        pollErr,
				StartTime:    start,
				EndTime:      time.Now(),
			})
		}()
	}
	if m.stalenessTracker != nil {
		defer func() {
			if pollErr == nil {
				m.stalenessTracker.UpdateSuccess(InstanceTypePVE, instanceName, nil)
			} else {
				m.stalenessTracker.UpdateError(InstanceTypePVE, instanceName)
			}
		}()
	}
	// Wrap in a closure so pollErr is read at return time; a plain
	// `defer m.recordTaskResult(..., pollErr)` would evaluate pollErr (still
	// nil) when the defer statement runs.
	defer func() { m.recordTaskResult(InstanceTypePVE, instanceName, pollErr) }()

	// Check whether the context is cancelled.
	select {
	case <-ctx.Done():
		pollErr = ctx.Err()
		if debugEnabled {
			log.Debug().Str("instance", instanceName).Msg("polling cancelled")
		}
		return
	default:
	}

	if debugEnabled {
		log.Debug().Str("instance", instanceName).Msg("polling PVE instance")
	}

	// Get the instance config.
	instanceCfg := m.getInstanceConfig(instanceName)
	if instanceCfg == nil {
		pollErr = fmt.Errorf("pve instance config not found for %s", instanceName)
		return
	}

	// Poll nodes.
	nodes, updatedClient, err := m.fetchPVENodes(ctx, instanceName, instanceCfg, client)
	if err != nil {
		pollErr = err
		return
	}
	client = updatedClient

	// Check whether the client is a ClusterClient to determine health status.
	connectionHealthStr := m.updatePVEConnectionHealth(ctx, instanceName, client)

	// Capture previous memory metrics so we can preserve them if detailed status fails.
	prevNodeMemory, prevInstanceNodes := m.snapshotPrevNodes(instanceName)

	// Convert to models.
	modelNodes, nodeEffectiveStatus, nodeDiskSources := m.pollPVENodesParallel(
		ctx,
		instanceName,
		instanceCfg,
		client,
		nodes,
		connectionHealthStr,
		prevNodeMemory,
		prevInstanceNodes,
		debugEnabled,
	)

	modelNodes = m.preserveNodesWhenEmpty(instanceName, modelNodes, prevInstanceNodes)

	// Update state first so we have nodes available.
	m.state.UpdateNodesForInstance(instanceName, modelNodes)

	// The storage fallback provides disk metrics when the node summary only
	// has the low-confidence /nodes figure or no disk truth at all.
	// We run it asynchronously with a short timeout so it doesn't block VM/container polling.
	// This addresses the issue where slow storage APIs (e.g., NFS mounts) can cause the entire
	// polling task to time out before reaching VM/container polling.
	storageFallback := m.startStorageFallback(ctx, instanceName, instanceCfg, client, nodes, nodeEffectiveStatus)

	// Pre-populate the node display name cache so guest alerts created below
	// can resolve friendly names. CheckNode() also does this, but it runs
	// after guest polling; without this, the first alert notification for
	// a guest would show the raw Proxmox node name.
	m.seedNodeDisplayNames(modelNodes)

	// Poll VMs and containers FIRST - this is the most critical data.
	// This happens immediately after starting the storage fallback goroutine,
	// so VM/container polling runs in parallel with (and is not blocked by) storage operations.
	if err := m.pollGuestsWithFallback(ctx, instanceName, instanceCfg, client, nodes, nodeEffectiveStatus); err != nil {
		pollErr = err
		return
	}

	m.maybePollPhysicalDisksAsync(ctx, instanceName, instanceCfg, client, nodes, nodeEffectiveStatus, modelNodes)
	// Note: physical disk monitoring is enabled by default with a 5-minute polling interval.
	// Users can explicitly disable it in node settings. Disk data is preserved between polls.

	// Wait for the storage fallback to complete (with a short timeout) before using the data.
	// This is non-blocking in the sense that VM/container polling has already completed by now.
	// We give the storage fallback goroutine up to 2 additional seconds to finish if it's still running.
	localStorageByNode := m.awaitStorageFallback(instanceName, storageFallback, 2*time.Second)

	modelNodes = m.applyStorageFallbackAndRecordNodeMetrics(instanceName, client, modelNodes, nodeDiskSources, localStorageByNode)

	// Periodically re-check cluster status for nodes marked as standalone.
	// This addresses issue #437, where clusters aren't detected on the first attempt.
	m.detectClusterMembership(ctx, instanceName, instanceCfg, client)

	// Update cluster endpoint online status if this is a cluster.
	m.updateClusterEndpointStatus(instanceName, instanceCfg, client, modelNodes)

	if err := m.pollStorageAsync(ctx, instanceName, instanceCfg, client, nodes); err != nil {
		pollErr = err
		return
	}

	if err := m.pollPVEBackupsAsync(ctx, instanceName, instanceCfg, client, nodes, nodeEffectiveStatus); err != nil {
		pollErr = err
		return
	}
}
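
// The per-instance poll therefore orders its work as: fetch nodes, update
// state, kick off the storage fallback asynchronously, poll guests (the
// critical path), schedule physical-disk polling in the background, then
// wait briefly for the storage fallback before the storage and backup polls.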

// copyFloatPointer returns a copy of an optional float64 so the result does
// not alias the caller's pointer.
func copyFloatPointer(src *float64) *float64 {
	if src == nil {
		return nil
	}
	val := *src
	return &val
}
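
// For example:
//
//	v := 1.5
//	p := copyFloatPointer(&v)
//	fmt.Println(*p == 1.5, p != &v) // true true: the copy does not alias v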

// matchesDatastoreExclude checks if a datastore name matches any exclusion pattern.
// Patterns can be exact names or wildcards (* for any characters).
// Examples: "exthdd*" matches "exthdd1500gb", "*backup*" matches "my-backup-store"