// Pulse/internal/monitoring/monitor_agents.go

package monitoring

import (
"crypto/sha1"
"encoding/hex"
"fmt"
"strings"
"time"

"github.com/rcourtman/pulse-go-rewrite/internal/config"
"github.com/rcourtman/pulse-go-rewrite/internal/logging"
"github.com/rcourtman/pulse-go-rewrite/internal/models"
"github.com/rcourtman/pulse-go-rewrite/internal/unifiedresources"
agentsdocker "github.com/rcourtman/pulse-go-rewrite/pkg/agents/docker"
agentshost "github.com/rcourtman/pulse-go-rewrite/pkg/agents/host"
"github.com/rcourtman/pulse-go-rewrite/pkg/fsfilters"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
)
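// RemoveDockerHost removes a Docker host from monitoring state, revokes the API
// token bound to it (if any), and records the removal so cached agent reports
// cannot resurrect the host.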
func (m *Monitor) RemoveDockerHost(hostID string) (models.DockerHost, error) {
hostID = strings.TrimSpace(hostID)
if hostID == "" {
return models.DockerHost{}, fmt.Errorf("docker host id is required")
}
hostID = m.canonicalDockerHostID(hostID)
host, removed := m.state.RemoveDockerHost(hostID)
if !removed {
if logging.IsLevelEnabled(zerolog.DebugLevel) {
log.Debug().Str("dockerHostID", hostID).Msg("Docker host not present in state during removal; proceeding to clear alerts")
}
host = models.DockerHost{
ID: hostID,
Hostname: hostID,
DisplayName: hostID,
}
}
// Revoke the API token associated with this Docker host
if host.TokenID != "" {
tokenRemoved := m.config.RemoveAPIToken(host.TokenID)
if tokenRemoved != nil {
m.config.SortAPITokens()
if m.persistence != nil {
if err := m.persistence.SaveAPITokens(m.config.APITokens); err != nil {
log.Warn().Err(err).Str("tokenID", host.TokenID).Msg("failed to persist API token revocation after Docker host removal")
} else {
log.Info().Str("tokenID", host.TokenID).Str("tokenName", host.TokenName).Msg("API token revoked for removed Docker host")
}
}
}
}
// Track removal to prevent resurrection from cached reports
removedAt := time.Now()
m.mu.Lock()
m.removedDockerHosts[hostID] = removedAt
// Unbind the token so it can be reused with a different agent if needed
if host.TokenID != "" {
delete(m.dockerTokenBindings, host.TokenID)
log.Debug().
Str("tokenID", host.TokenID).
Str("dockerHostID", hostID).
Msg("Unbound Docker agent token from removed host")
}
if cmd, ok := m.dockerCommands[hostID]; ok {
delete(m.dockerCommandIndex, cmd.status.ID)
}
delete(m.dockerCommands, hostID)
m.mu.Unlock()
m.state.AddRemovedDockerHost(models.RemovedDockerHost{
ID: hostID,
Hostname: host.Hostname,
DisplayName: host.DisplayName,
RemovedAt: removedAt,
})
m.state.RemoveConnectionHealth(dockerConnectionPrefix + hostID)
if m.alertManager != nil {
m.alertManager.HandleDockerHostRemoved(host)
m.SyncAlertState()
}
log.Info().
Str("dockerHost", host.Hostname).
Str("dockerHostID", hostID).
Bool("removed", removed).
Msg("Docker host removed and alerts cleared")
return host, nil
}
// RemoveHostAgent removes a host agent from monitoring state and clears related data.
func (m *Monitor) RemoveHostAgent(hostID string) (models.Host, error) {
hostID = strings.TrimSpace(hostID)
if hostID == "" {
return models.Host{}, fmt.Errorf("host id is required")
}
host, removed := m.state.RemoveHost(hostID)
if !removed {
if logging.IsLevelEnabled(zerolog.DebugLevel) {
log.Debug().Str("hostID", hostID).Msg("host not present in state during removal")
}
host = models.Host{
ID: hostID,
Hostname: hostID,
}
}
tokenID := strings.TrimSpace(host.TokenID)
hostname := strings.TrimSpace(host.Hostname)
tokenStillUsed := false
if tokenID != "" && m.state != nil {
readState := m.snapshotBackedUnifiedReadState()
for _, other := range readState.Hosts() {
if other == nil {
continue
}
if strings.TrimSpace(other.TokenID()) == tokenID {
tokenStillUsed = true
break
}
}
if !tokenStillUsed {
for _, other := range readState.DockerHosts() {
if other == nil {
continue
}
if strings.TrimSpace(other.TokenID()) == tokenID {
tokenStillUsed = true
break
}
}
}
}
var tokenRemoved *config.APITokenRecord
if tokenID != "" && !tokenStillUsed {
tokenRemoved = m.config.RemoveAPIToken(tokenID)
if tokenRemoved != nil {
m.config.SortAPITokens()
if m.persistence != nil {
if err := m.persistence.SaveAPITokens(m.config.APITokens); err != nil {
log.Warn().Err(err).Str("tokenID", tokenID).Msg("failed to persist API token revocation after host agent removal")
} else {
log.Info().Str("tokenID", tokenID).Str("tokenName", host.TokenName).Msg("API token revoked for removed host agent")
}
}
}
} else if tokenID != "" && tokenStillUsed {
log.Info().
Str("tokenID", tokenID).
Str("hostID", hostID).
Msg("API token still used by other agents; skipping revocation during host removal")
}
if tokenID != "" {
m.mu.Lock()
if m.hostTokenBindings == nil {
m.hostTokenBindings = make(map[string]string)
}
if _, exists := m.hostTokenBindings[tokenID]; exists {
delete(m.hostTokenBindings, tokenID)
}
if hostname != "" {
key := fmt.Sprintf("%s:%s", tokenID, hostname)
if _, exists := m.hostTokenBindings[key]; exists {
delete(m.hostTokenBindings, key)
}
}
if tokenRemoved != nil {
prefix := tokenID + ":"
for key := range m.hostTokenBindings {
if strings.HasPrefix(key, prefix) {
delete(m.hostTokenBindings, key)
}
}
}
m.mu.Unlock()
log.Debug().
Str("tokenID", tokenID).
Str("hostID", hostID).
Bool("revoked", tokenRemoved != nil).
Msg("Unbound host agent token bindings after host removal")
}
removedAt := time.Now()
m.mu.Lock()
if m.removedHostAgents == nil {
m.removedHostAgents = make(map[string]time.Time)
}
m.removedHostAgents[hostID] = removedAt
m.mu.Unlock()
m.state.AddRemovedHostAgent(models.RemovedHostAgent{
ID: hostID,
Hostname: host.Hostname,
DisplayName: host.DisplayName,
RemovedAt: removedAt,
})
m.state.RemoveConnectionHealth(hostConnectionPrefix + hostID)
// Clear LinkedAgentID from any nodes that were linked to this host agent
unlinkedCount := m.state.UnlinkNodesFromHostAgent(hostID)
if unlinkedCount > 0 {
log.Info().
Str("hostID", hostID).
Int("unlinkedNodes", unlinkedCount).
Msg("Cleared host agent links from PVE nodes")
}
log.Info().
Str("host", host.Hostname).
Str("hostID", hostID).
Bool("removed", removed).
Msg("Host agent removed from monitoring")
if m.alertManager != nil {
m.alertManager.HandleHostRemoved(host)
}
return host, nil
}
// AllowHostAgentReenroll removes a host agent ID from the removal blocklist so it can report again.
func (m *Monitor) AllowHostAgentReenroll(hostID string) error {
hostID = strings.TrimSpace(hostID)
if hostID == "" {
return fmt.Errorf("host id is required")
}
m.mu.Lock()
if m.removedHostAgents == nil {
m.removedHostAgents = make(map[string]time.Time)
}
_, exists := m.removedHostAgents[hostID]
if exists {
delete(m.removedHostAgents, hostID)
}
m.mu.Unlock()
if !exists {
log.Info().
Str("hostID", hostID).
Msg("allow re-enroll requested but host agent was not blocked; ignoring")
return nil
}
m.state.RemoveRemovedHostAgent(hostID)
log.Info().
Str("hostID", hostID).
Msg("Host agent removal block cleared; host may report again")
return nil
}
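// lookupRemovedHostAgent reports whether a host agent matching the given
// identifier or hostname was deliberately removed, returning the removal time
// when a match is found. It checks the in-memory map first, then the persisted
// removed-agent records.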
func (m *Monitor) lookupRemovedHostAgent(identifier, hostname string) (time.Time, bool) {
identifier = strings.TrimSpace(identifier)
m.mu.RLock()
removedAt, wasRemoved := m.removedHostAgents[identifier]
m.mu.RUnlock()
if wasRemoved {
return removedAt, true
}
for _, entry := range m.state.GetRemovedHostAgents() {
if strings.TrimSpace(entry.ID) == identifier {
return entry.RemovedAt, true
}
if strings.TrimSpace(entry.Hostname) == hostname {
return entry.RemovedAt, true
}
}
return time.Time{}, false
}
// LinkHostAgent manually links a host agent to a specific PVE node.
// This is used when auto-linking can't disambiguate (e.g., multiple nodes with hostname "pve").
// After linking, the host agent's temperature/sensor data will appear on the correct node.
func (m *Monitor) LinkHostAgent(hostID, nodeID string) error {
hostID = strings.TrimSpace(hostID)
nodeID = strings.TrimSpace(nodeID)
if hostID == "" {
return fmt.Errorf("host id is required")
}
if nodeID == "" {
return fmt.Errorf("node id is required")
}
if err := m.state.LinkHostAgentToNode(hostID, nodeID); err != nil {
return fmt.Errorf("link host agent %q to node %q: %w", hostID, nodeID, err)
}
log.Info().
Str("hostID", hostID).
Str("nodeID", nodeID).
Msg("Manually linked host agent to PVE node")
return nil
}
// UnlinkHostAgent removes the link between a host agent and its PVE node.
// The agent will continue to report but will appear in the Managed Agents table
// instead of being merged with the PVE node in the Dashboard.
func (m *Monitor) UnlinkHostAgent(hostID string) error {
hostID = strings.TrimSpace(hostID)
if hostID == "" {
return fmt.Errorf("host id is required")
}
if !m.state.UnlinkHostAgent(hostID) {
return fmt.Errorf("host not found or not linked to a node")
}
log.Info().
Str("hostID", hostID).
Msg("Unlinked host agent from PVE node")
return nil
}
// HostAgentConfig represents server-side configuration for a host agent.
type HostAgentConfig struct {
CommandsEnabled *bool `json:"commandsEnabled,omitempty"` // nil = use agent default
Settings map[string]interface{} `json:"settings,omitempty"` // Merged profile settings
IssuedAt *time.Time `json:"issuedAt,omitempty"`
ExpiresAt *time.Time `json:"expiresAt,omitempty"`
Signature string `json:"signature,omitempty"`
}
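// For illustration only (keys under "settings" are hypothetical and depend on
// the assigned profile): a serialized HostAgentConfig as an agent would receive
// it might look like
//
//	{"commandsEnabled": true, "settings": {"reportInterval": 30}}
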
// GetHostAgentConfig returns the server-side configuration for a host agent.
// The agent can poll this to apply remote config overrides.
// Uses in-memory caching to avoid disk I/O on every agent report (refs #1094).
func (m *Monitor) GetHostAgentConfig(hostID string) HostAgentConfig {
hostID = strings.TrimSpace(hostID)
if hostID == "" {
return HostAgentConfig{}
}
cfg := HostAgentConfig{}
// 1. Load Host Metadata (CommandsEnabled) - this is already in-memory
if m.hostMetadataStore != nil {
if meta := m.hostMetadataStore.Get(hostID); meta != nil {
cfg.CommandsEnabled = meta.CommandsEnabled
}
}
// 2. Load Profile Configuration from cache
if m.persistence != nil {
profiles, assignments := m.getAgentProfileCache()
var profileID string
for _, a := range assignments {
if a.AgentID == hostID {
profileID = a.ProfileID
break
}
}
if profileID != "" {
for _, p := range profiles {
if p.ID == profileID {
cfg.Settings = p.Config
break
}
}
}
}
return cfg
}
// getAgentProfileCache returns cached profiles and assignments, refreshing if stale.
func (m *Monitor) getAgentProfileCache() ([]models.AgentProfile, []models.AgentProfileAssignment) {
now := time.Now()
// Fast path: check if cache is valid
m.agentProfileCacheMu.RLock()
cache := m.agentProfileCache
if cache != nil && now.Sub(cache.loadedAt) < agentProfileCacheTTL {
profiles := cache.profiles
assignments := cache.assignments
m.agentProfileCacheMu.RUnlock()
return profiles, assignments
}
m.agentProfileCacheMu.RUnlock()
// Slow path: reload from disk
m.agentProfileCacheMu.Lock()
defer m.agentProfileCacheMu.Unlock()
// Double-check after acquiring write lock
if m.agentProfileCache != nil && now.Sub(m.agentProfileCache.loadedAt) < agentProfileCacheTTL {
return m.agentProfileCache.profiles, m.agentProfileCache.assignments
}
var profiles []models.AgentProfile
var assignments []models.AgentProfileAssignment
if loadedAssignments, err := m.persistence.LoadAgentProfileAssignments(); err != nil {
log.Warn().Err(err).Msg("failed to load agent profile assignments for cache")
} else {
assignments = loadedAssignments
}
if loadedProfiles, err := m.persistence.LoadAgentProfiles(); err != nil {
log.Warn().Err(err).Msg("failed to load agent profiles for cache")
} else {
profiles = loadedProfiles
}
m.agentProfileCache = &agentProfileCacheEntry{
profiles: profiles,
assignments: assignments,
loadedAt: now,
}
return profiles, assignments
}
// InvalidateAgentProfileCache clears the agent profile cache, forcing a reload on next access.
// Call this when profiles or assignments are modified.
func (m *Monitor) InvalidateAgentProfileCache() {
m.agentProfileCacheMu.Lock()
m.agentProfileCache = nil
m.agentProfileCacheMu.Unlock()
}
// UpdateHostAgentConfig updates the server-side configuration for a host agent.
// This allows the UI to remotely enable/disable features on agents.
func (m *Monitor) UpdateHostAgentConfig(hostID string, commandsEnabled *bool) error {
hostID = strings.TrimSpace(hostID)
if hostID == "" {
return fmt.Errorf("host id is required")
}
if m.hostMetadataStore == nil {
return fmt.Errorf("host metadata store not initialized")
}
// Get existing metadata or create new
meta := m.hostMetadataStore.Get(hostID)
if meta == nil {
meta = &config.HostMetadata{ID: hostID}
}
meta.CommandsEnabled = commandsEnabled
if err := m.hostMetadataStore.Set(hostID, meta); err != nil {
return fmt.Errorf("failed to save host config: %w", err)
}
// Also update the Host model in state for immediate UI feedback
// The agent will confirm on its next report, but this provides instant feedback
if commandsEnabled != nil {
m.state.SetHostCommandsEnabled(hostID, *commandsEnabled)
}
log.Info().
Str("hostId", hostID).
Interface("commandsEnabled", commandsEnabled).
Msg("Host agent config updated")
return nil
}
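// Usage sketch (illustrative; not a call site in this file): an API handler
// disabling remote command execution for an agent could call
//
//	enabled := false
//	if err := monitor.UpdateHostAgentConfig(hostID, &enabled); err != nil {
//		// handle error
//	}
//
// where monitor and hostID are assumed to be in scope.
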
// HideDockerHost marks a docker host as hidden without removing it from state.
// Hidden hosts will not be shown in the frontend but will continue to accept updates.
func (m *Monitor) HideDockerHost(hostID string) (models.DockerHost, error) {
hostID = strings.TrimSpace(hostID)
if hostID == "" {
return models.DockerHost{}, fmt.Errorf("docker host id is required")
}
hostID = m.canonicalDockerHostID(hostID)
host, ok := m.state.SetDockerHostHidden(hostID, true)
if !ok {
return models.DockerHost{}, fmt.Errorf("docker host %q not found", hostID)
}
log.Info().
Str("dockerHost", host.Hostname).
Str("dockerHostID", hostID).
Msg("Docker host hidden from view")
return host, nil
}
// UnhideDockerHost marks a docker host as visible again.
func (m *Monitor) UnhideDockerHost(hostID string) (models.DockerHost, error) {
hostID = strings.TrimSpace(hostID)
if hostID == "" {
return models.DockerHost{}, fmt.Errorf("docker host id is required")
}
hostID = m.canonicalDockerHostID(hostID)
host, ok := m.state.SetDockerHostHidden(hostID, false)
if !ok {
return models.DockerHost{}, fmt.Errorf("docker host %q not found", hostID)
}
// Clear removal tracking if it was marked as removed
m.mu.Lock()
delete(m.removedDockerHosts, hostID)
m.mu.Unlock()
log.Info().
Str("dockerHost", host.Hostname).
Str("dockerHostID", hostID).
Msg("Docker host unhidden")
return host, nil
}
// MarkDockerHostPendingUninstall marks a docker host as pending uninstall.
// This is used when the user has run the uninstall command and is waiting for the host to go offline.
func (m *Monitor) MarkDockerHostPendingUninstall(hostID string) (models.DockerHost, error) {
hostID = strings.TrimSpace(hostID)
if hostID == "" {
return models.DockerHost{}, fmt.Errorf("docker host id is required")
}
hostID = m.canonicalDockerHostID(hostID)
host, ok := m.state.SetDockerHostPendingUninstall(hostID, true)
if !ok {
return models.DockerHost{}, fmt.Errorf("docker host %q not found", hostID)
}
log.Info().
Str("dockerHost", host.Hostname).
Str("dockerHostID", hostID).
Msg("Docker host marked as pending uninstall")
return host, nil
}
// SetDockerHostCustomDisplayName updates the custom display name for a docker host.
func (m *Monitor) SetDockerHostCustomDisplayName(hostID string, customName string) (models.DockerHost, error) {
hostID = strings.TrimSpace(hostID)
if hostID == "" {
return models.DockerHost{}, fmt.Errorf("docker host id is required")
}
hostID = m.canonicalDockerHostID(hostID)
customName = strings.TrimSpace(customName)
// Persist to Docker metadata store first
var hostMeta *config.DockerHostMetadata
if customName != "" {
hostMeta = &config.DockerHostMetadata{
CustomDisplayName: customName,
}
}
if err := m.dockerMetadataStore.SetHostMetadata(hostID, hostMeta); err != nil {
log.Error().Err(err).Str("hostID", hostID).Msg("failed to persist Docker host metadata")
return models.DockerHost{}, fmt.Errorf("failed to persist custom display name: %w", err)
}
// Update in-memory state
host, ok := m.state.SetDockerHostCustomDisplayName(hostID, customName)
if !ok {
return models.DockerHost{}, fmt.Errorf("docker host %q not found", hostID)
}
log.Info().
Str("dockerHost", host.Hostname).
Str("dockerHostID", hostID).
Str("customDisplayName", customName).
Msg("Docker host custom display name updated")
return host, nil
}
// AllowDockerHostReenroll removes a host ID from the removal blocklist so it can report again.
func (m *Monitor) AllowDockerHostReenroll(hostID string) error {
hostID = strings.TrimSpace(hostID)
if hostID == "" {
return fmt.Errorf("docker host id is required")
}
m.mu.Lock()
defer m.mu.Unlock()
if host, resolvedHostID, found := m.resolveDockerCommandHostLocked(hostID); found {
hostID = resolvedHostID
if _, exists := m.removedDockerHosts[hostID]; !exists {
event := log.Info().
Str("dockerHostID", hostID)
if hostname := strings.TrimSpace(host.Hostname()); hostname != "" {
event = event.Str("dockerHost", hostname)
}
event.Msg("allow re-enroll requested but host was not blocked; ignoring")
return nil
}
}
if _, exists := m.removedDockerHosts[hostID]; !exists {
event := log.Info().
Str("dockerHostID", hostID)
if host, found := m.stateDockerHostByIDLocked(hostID); found {
event = event.Str("dockerHost", host.Hostname)
}
event.Msg("allow re-enroll requested but host was not blocked; ignoring")
return nil
}
delete(m.removedDockerHosts, hostID)
if cmd, exists := m.dockerCommands[hostID]; exists {
delete(m.dockerCommandIndex, cmd.status.ID)
delete(m.dockerCommands, hostID)
}
m.state.SetDockerHostCommand(hostID, nil)
m.state.RemoveRemovedDockerHost(hostID)
log.Info().
Str("dockerHostID", hostID).
Msg("Docker host removal block cleared; host may report again")
return nil
}
// GetDockerHost retrieves a docker host by identifier if present in state.
func (m *Monitor) GetDockerHost(hostID string) (models.DockerHost, bool) {
hostID = strings.TrimSpace(hostID)
if hostID == "" {
return models.DockerHost{}, false
}
hostID = m.canonicalDockerHostID(hostID)
hosts := m.state.GetDockerHosts()
for _, host := range hosts {
if host.ID == hostID {
return host, true
}
}
return models.DockerHost{}, false
}
// GetDockerHosts returns a point-in-time snapshot of all Docker hosts Pulse knows about.
func (m *Monitor) GetDockerHosts() []models.DockerHost {
if m == nil || m.state == nil {
return nil
}
return m.state.GetDockerHosts()
}
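// canonicalDockerHostID normalizes a Docker host identifier and, when the host
// is known to the unified read state, resolves it to its canonical source ID;
// otherwise the normalized input is returned unchanged.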
func (m *Monitor) canonicalDockerHostID(hostID string) string {
hostID = normalizeDockerHostID(hostID)
if hostID == "" {
return ""
}
if _, resolvedHostID, found := m.resolveDockerHostView(hostID); found {
return resolvedHostID
}
return hostID
}
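// resolveDockerHostView looks up a Docker host view by ID or host source ID in
// the unified read state, returning the view, its canonical source ID, and
// whether a match was found.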
func (m *Monitor) resolveDockerHostView(hostID string) (*unifiedresources.DockerHostView, string, bool) {
hostID = normalizeDockerHostID(hostID)
if hostID == "" {
return nil, "", false
}
readState := m.GetUnifiedReadStateOrSnapshot()
if readState == nil {
return nil, "", false
}
for _, host := range readState.DockerHosts() {
if host == nil {
continue
}
candidateID := normalizeDockerHostID(host.ID())
sourceID := normalizeDockerHostID(host.HostSourceID())
if hostID != candidateID && hostID != sourceID {
continue
}
if sourceID == "" {
sourceID = candidateID
}
if sourceID == "" {
return nil, "", false
}
return host, sourceID, true
}
return nil, "", false
}
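// snapshotBackedUnifiedReadState builds a unified-resources read state from the
// current state snapshot, independent of the live registry.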
func (m *Monitor) snapshotBackedUnifiedReadState() unifiedresources.ReadState {
if m == nil || m.state == nil {
return nil
}
registry := unifiedresources.NewRegistry(nil)
registry.IngestSnapshot(m.state.GetSnapshot())
return unifiedresources.NewMonitorAdapter(registry)
}
// RebuildTokenBindings reconstructs agent-to-token binding maps from the current
// state of Docker hosts and host agents. This should be called after API tokens
// are reloaded from disk to ensure bindings remain consistent with the new token set.
// It preserves bindings for tokens that still exist and removes orphaned entries.
func (m *Monitor) RebuildTokenBindings() {
if m == nil || m.config == nil {
return
}
readState := m.GetUnifiedReadStateOrSnapshot()
if readState == nil {
return
}
// Build a set of valid token IDs from the current config
validTokens := make(map[string]struct{})
for _, token := range m.config.APITokens {
if token.ID != "" {
validTokens[token.ID] = struct{}{}
}
}
// Rebuild Docker token bindings
newDockerBindings := make(map[string]string)
for _, host := range readState.DockerHosts() {
if host == nil {
continue
}
tokenID := strings.TrimSpace(host.TokenID())
if tokenID == "" {
continue
}
// Only keep bindings for tokens that still exist in config
if _, valid := validTokens[tokenID]; !valid {
continue
}
// Use AgentID if available, otherwise fall back to host ID
agentID := strings.TrimSpace(host.AgentID())
if agentID == "" {
agentID = strings.TrimSpace(host.HostSourceID())
}
if agentID == "" {
agentID = strings.TrimSpace(host.ID())
}
if agentID != "" {
newDockerBindings[tokenID] = agentID
}
}
// Rebuild Host agent token bindings
newHostBindings := make(map[string]string)
for _, host := range readState.Hosts() {
if host == nil {
continue
}
tokenID := strings.TrimSpace(host.TokenID())
if tokenID == "" {
continue
}
// Only keep bindings for tokens that still exist in config
if _, valid := validTokens[tokenID]; !valid {
continue
}
hostname := strings.TrimSpace(host.Hostname())
agentID := strings.TrimSpace(host.AgentID())
if hostname == "" || agentID == "" {
continue
}
newHostBindings[fmt.Sprintf("%s:%s", tokenID, hostname)] = agentID
}
// Log what changed
m.mu.Lock()
defer m.mu.Unlock()
oldDockerCount := len(m.dockerTokenBindings)
oldHostCount := len(m.hostTokenBindings)
m.dockerTokenBindings = newDockerBindings
m.hostTokenBindings = newHostBindings
log.Info().
Int("dockerBindings", len(newDockerBindings)).
Int("hostBindings", len(newHostBindings)).
Int("previousDockerBindings", oldDockerCount).
Int("previousHostBindings", oldHostCount).
Int("validTokens", len(validTokens)).
Msg("Rebuilt agent token bindings after API token reload")
}
// ClearUnauthenticatedAgents removes all host agents and docker hosts from the state.
// This should be called when security is first configured to clear any agents that
// connected during the brief unauthenticated window before credentials were set up.
// This prevents stale/unauthorized agent data from appearing in the UI.
func (m *Monitor) ClearUnauthenticatedAgents() (int, int) {
if m == nil || m.state == nil {
return 0, 0
}
// Clear all hosts
hostCount := m.state.ClearAllHosts()
// Clear all docker hosts
dockerCount := m.state.ClearAllDockerHosts()
// Clear any token bindings since the tokens used by the old agents are invalid
m.mu.Lock()
m.dockerTokenBindings = make(map[string]string)
m.hostTokenBindings = make(map[string]string)
m.mu.Unlock()
if hostCount > 0 || dockerCount > 0 {
log.Info().
Int("hostsCleared", hostCount).
Int("dockerHostsCleared", dockerCount).
Msg("Cleared unauthenticated agents after security setup")
}
return hostCount, dockerCount
}
// QueueDockerHostStop queues a stop command for the specified docker host.
func (m *Monitor) QueueDockerHostStop(hostID string) (models.DockerHostCommandStatus, error) {
return m.queueDockerStopCommand(hostID)
}
// FetchDockerCommandForHost retrieves the next command payload (if any) for the host.
func (m *Monitor) FetchDockerCommandForHost(hostID string) (map[string]any, *models.DockerHostCommandStatus) {
return m.getDockerCommandPayload(hostID)
}
// AcknowledgeDockerHostCommand updates the lifecycle status for a docker host command.
func (m *Monitor) AcknowledgeDockerHostCommand(commandID, hostID, status, message string) (models.DockerHostCommandStatus, string, bool, error) {
return m.acknowledgeDockerCommand(commandID, hostID, status, message)
}
// ApplyDockerReport ingests a docker agent report into the shared state.
func (m *Monitor) ApplyDockerReport(report agentsdocker.Report, tokenRecord *config.APITokenRecord) (models.DockerHost, error) {
readState := m.snapshotBackedUnifiedReadState()
var dockerHosts []*unifiedresources.DockerHostView
if readState != nil {
dockerHosts = readState.DockerHosts()
}
identifier, legacyIDs, previous, hasPrevious := resolveDockerHostIdentifier(report, tokenRecord, dockerHosts)
if strings.TrimSpace(identifier) == "" {
return models.DockerHost{}, fmt.Errorf("docker report missing agent identifier")
}
// Check if this host was deliberately removed - reject report to prevent resurrection
m.mu.RLock()
removedAt, wasRemoved := m.removedDockerHosts[identifier]
if !wasRemoved {
for _, legacyID := range legacyIDs {
if legacyID == "" || legacyID == identifier {
continue
}
if ts, ok := m.removedDockerHosts[legacyID]; ok {
removedAt = ts
wasRemoved = true
break
}
}
}
m.mu.RUnlock()
if wasRemoved {
log.Info().
Str("dockerHostID", identifier).
Time("removedAt", removedAt).
Msg("Rejecting report from deliberately removed Docker host")
return models.DockerHost{}, fmt.Errorf("docker host %q had monitoring stopped at %v and cannot report again. Use Allow reconnect in Settings -> Infrastructure or rerun the installer with a docker:manage token to clear this block", identifier, removedAt.Format(time.RFC3339))
}
// Enforce token uniqueness: each token can only be bound to one agent
if tokenRecord != nil && tokenRecord.ID != "" {
tokenID := strings.TrimSpace(tokenRecord.ID)
agentID := strings.TrimSpace(report.Agent.ID)
if agentID == "" {
agentID = identifier
}
m.mu.Lock()
if boundAgentID, exists := m.dockerTokenBindings[tokenID]; exists {
if boundAgentID != agentID {
m.mu.Unlock()
// Find the conflicting host to provide helpful error message
conflictingHostname := "unknown"
for _, host := range dockerHosts {
if host == nil {
continue
}
hostSourceID := strings.TrimSpace(host.HostSourceID())
if host.AgentID() == boundAgentID || hostSourceID == boundAgentID || host.ID() == boundAgentID {
conflictingHostname = strings.TrimSpace(host.Name())
if conflictingHostname == "" {
conflictingHostname = strings.TrimSpace(host.Hostname())
}
break
}
}
tokenHint := tokenHintFromRecord(tokenRecord)
if tokenHint != "" {
tokenHint = " (" + tokenHint + ")"
}
log.Warn().
Str("tokenID", tokenID).
Str("tokenHint", tokenHint).
Str("reportingAgentID", agentID).
Str("boundAgentID", boundAgentID).
Str("conflictingHost", conflictingHostname).
Msg("Rejecting Docker report: token already bound to different agent")
return models.DockerHost{}, fmt.Errorf("API token%s is already in use by agent %q (host: %s). Each Docker agent must use a unique API token. Generate a new token for this agent", tokenHint, boundAgentID, conflictingHostname)
}
} else {
// First time seeing this token - bind it to this agent
m.dockerTokenBindings[tokenID] = agentID
log.Debug().
Str("tokenID", tokenID).
Str("agentID", agentID).
Str("hostname", report.Host.Hostname).
Msg("Bound Docker agent token to agent identity")
}
m.mu.Unlock()
}
hostname := strings.TrimSpace(report.Host.Hostname)
if hostname == "" {
return models.DockerHost{}, fmt.Errorf("docker report missing hostname")
}
timestamp := report.Timestamp
if timestamp.IsZero() {
timestamp = time.Now()
}
agentID := strings.TrimSpace(report.Agent.ID)
if agentID == "" {
agentID = identifier
}
displayName := strings.TrimSpace(report.Host.Name)
if displayName == "" {
displayName = hostname
}
runtime := strings.ToLower(strings.TrimSpace(report.Host.Runtime))
switch runtime {
case "", "auto", "default":
runtime = "docker"
case "docker", "podman":
// supported runtimes
default:
runtime = "docker"
}
runtimeVersion := strings.TrimSpace(report.Host.RuntimeVersion)
dockerVersion := strings.TrimSpace(report.Host.DockerVersion)
if runtimeVersion == "" {
runtimeVersion = dockerVersion
}
if dockerVersion == "" {
dockerVersion = runtimeVersion
}
containers := make([]models.DockerContainer, 0, len(report.Containers))
for _, payload := range report.Containers {
container := models.DockerContainer{
ID: payload.ID,
Name: payload.Name,
Image: payload.Image,
ImageDigest: payload.ImageDigest,
State: payload.State,
Status: payload.Status,
Health: payload.Health,
CPUPercent: safeFloat(payload.CPUPercent),
MemoryUsage: payload.MemoryUsageBytes,
MemoryLimit: payload.MemoryLimitBytes,
MemoryPercent: safeFloat(payload.MemoryPercent),
UptimeSeconds: payload.UptimeSeconds,
RestartCount: payload.RestartCount,
ExitCode: payload.ExitCode,
CreatedAt: payload.CreatedAt,
StartedAt: payload.StartedAt,
FinishedAt: payload.FinishedAt,
NetworkRXBytes: payload.NetworkRXBytes,
NetworkTXBytes: payload.NetworkTXBytes,
}
// Copy update status if provided by agent
if payload.UpdateStatus != nil {
container.UpdateStatus = &models.DockerContainerUpdateStatus{
UpdateAvailable: payload.UpdateStatus.UpdateAvailable,
CurrentDigest: payload.UpdateStatus.CurrentDigest,
LatestDigest: payload.UpdateStatus.LatestDigest,
LastChecked: payload.UpdateStatus.LastChecked,
Error: payload.UpdateStatus.Error,
}
}
if len(payload.Ports) > 0 {
ports := make([]models.DockerContainerPort, len(payload.Ports))
for i, port := range payload.Ports {
ports[i] = models.DockerContainerPort{
PrivatePort: port.PrivatePort,
PublicPort: port.PublicPort,
Protocol: port.Protocol,
IP: port.IP,
}
}
container.Ports = ports
}
if len(payload.Labels) > 0 {
labels := make(map[string]string, len(payload.Labels))
for k, v := range payload.Labels {
labels[k] = v
}
container.Labels = labels
}
if len(payload.Networks) > 0 {
networks := make([]models.DockerContainerNetworkLink, len(payload.Networks))
for i, net := range payload.Networks {
networks[i] = models.DockerContainerNetworkLink{
Name: net.Name,
IPv4: net.IPv4,
IPv6: net.IPv6,
}
}
container.Networks = networks
}
container.WritableLayerBytes = payload.WritableLayerBytes
container.RootFilesystemBytes = payload.RootFilesystemBytes
if payload.BlockIO != nil {
container.BlockIO = &models.DockerContainerBlockIO{
ReadBytes: payload.BlockIO.ReadBytes,
WriteBytes: payload.BlockIO.WriteBytes,
}
}
containerIdentifier := payload.ID
if strings.TrimSpace(containerIdentifier) == "" {
containerIdentifier = payload.Name
}
if strings.TrimSpace(containerIdentifier) != "" {
metrics := models.IOMetrics{
NetworkIn: clampToInt64(payload.NetworkRXBytes),
NetworkOut: clampToInt64(payload.NetworkTXBytes),
Timestamp: timestamp,
}
if payload.BlockIO != nil {
metrics.DiskRead = clampToInt64(payload.BlockIO.ReadBytes)
metrics.DiskWrite = clampToInt64(payload.BlockIO.WriteBytes)
}
readRate, writeRate, netInRate, netOutRate := m.rateTracker.CalculateRates(
fmt.Sprintf("docker:%s:%s", identifier, containerIdentifier),
metrics,
)
if container.BlockIO != nil && readRate >= 0 {
value := readRate
container.BlockIO.ReadRateBytesPerSecond = &value
}
if container.BlockIO != nil && writeRate >= 0 {
value := writeRate
container.BlockIO.WriteRateBytesPerSecond = &value
}
if netInRate >= 0 {
container.NetInRate = netInRate
}
if netOutRate >= 0 {
container.NetOutRate = netOutRate
}
}
if len(payload.Mounts) > 0 {
mounts := make([]models.DockerContainerMount, len(payload.Mounts))
for i, mount := range payload.Mounts {
mounts[i] = models.DockerContainerMount{
Type: mount.Type,
Source: mount.Source,
Destination: mount.Destination,
Mode: mount.Mode,
RW: mount.RW,
Propagation: mount.Propagation,
Name: mount.Name,
Driver: mount.Driver,
}
}
container.Mounts = mounts
}
containers = append(containers, container)
}
services := convertDockerServices(report.Services)
tasks := convertDockerTasks(report.Tasks)
swarmInfo := convertDockerSwarmInfo(report.Host.Swarm)
loadAverage := make([]float64, 0, len(report.Host.LoadAverage))
if len(report.Host.LoadAverage) > 0 {
loadAverage = append(loadAverage, report.Host.LoadAverage...)
}
var memory models.Memory
if report.Host.Memory.TotalBytes > 0 || report.Host.Memory.UsedBytes > 0 {
memory = models.Memory{
Total: report.Host.Memory.TotalBytes,
Used: report.Host.Memory.UsedBytes,
Free: report.Host.Memory.FreeBytes,
Usage: safeFloat(report.Host.Memory.Usage),
SwapTotal: report.Host.Memory.SwapTotal,
SwapUsed: report.Host.Memory.SwapUsed,
}
}
// Fallback: if gopsutil's memory reading failed but Docker's TotalMemoryBytes
// is valid (possibly already a fallback computed by the agent), use that for
// Total. This handles Docker-in-LXC scenarios where both Docker and gopsutil
// may fail to read memory stats but the agent-side fallback still supplies a
// usable total.
if memory.Total <= 0 && report.Host.TotalMemoryBytes > 0 {
memory.Total = report.Host.TotalMemoryBytes
}
// Additional fallback for Docker-in-LXC: gopsutil may read Total and Free
// correctly from cgroup limits but return 0 for Used. Calculate Used from
// Total - Free when this happens. This fixes the "0B / 7GB" display issue.
if memory.Used <= 0 && memory.Total > 0 && memory.Free > 0 {
memory.Used = memory.Total - memory.Free
if memory.Used < 0 {
memory.Used = 0
}
// Recalculate usage percentage
if memory.Total > 0 {
memory.Usage = safePercentage(float64(memory.Used), float64(memory.Total))
}
}
disks := make([]models.Disk, 0, len(report.Host.Disks))
for _, disk := range report.Host.Disks {
// Filter virtual/system filesystems (same as ApplyHostReport) to avoid
// inflated disk totals from tmpfs, overlayfs, etc.
if shouldSkip, _ := fsfilters.ShouldSkipFilesystem(disk.Type, disk.Mountpoint, uint64(disk.TotalBytes), uint64(disk.UsedBytes)); shouldSkip {
continue
}
disks = append(disks, models.Disk{
Total: disk.TotalBytes,
Used: disk.UsedBytes,
Free: disk.FreeBytes,
Usage: safeFloat(disk.Usage),
Mountpoint: disk.Mountpoint,
Type: disk.Type,
Device: disk.Device,
})
}
networkIfaces := make([]models.HostNetworkInterface, 0, len(report.Host.Network))
for _, iface := range report.Host.Network {
addresses := append([]string(nil), iface.Addresses...)
networkIfaces = append(networkIfaces, models.HostNetworkInterface{
Name: iface.Name,
MAC: iface.MAC,
Addresses: addresses,
RXBytes: iface.RXBytes,
TXBytes: iface.TXBytes,
SpeedMbps: iface.SpeedMbps,
})
}
agentVersion := normalizeAgentVersion(report.Agent.Version)
if agentVersion == "" && hasPrevious {
agentVersion = normalizeAgentVersion(previous.AgentVersion())
}
host := models.DockerHost{
ID: identifier,
AgentID: agentID,
Hostname: hostname,
DisplayName: displayName,
MachineID: strings.TrimSpace(report.Host.MachineID),
OS: report.Host.OS,
KernelVersion: report.Host.KernelVersion,
Architecture: report.Host.Architecture,
Runtime: runtime,
RuntimeVersion: runtimeVersion,
DockerVersion: dockerVersion,
CPUs: report.Host.TotalCPU,
TotalMemoryBytes: report.Host.TotalMemoryBytes,
UptimeSeconds: report.Host.UptimeSeconds,
CPUUsage: safeFloat(report.Host.CPUUsagePercent),
LoadAverage: loadAverage,
Memory: memory,
Disks: disks,
NetworkInterfaces: networkIfaces,
Status: "online",
LastSeen: timestamp,
IntervalSeconds: report.Agent.IntervalSeconds,
AgentVersion: agentVersion,
Containers: containers,
Services: services,
Tasks: tasks,
Swarm: swarmInfo,
IsLegacy: isLegacyAgent(report.Agent.Type),
}
if hasPrevious {
m.migrateDockerContainerMetadataForRecreatedContainers(identifier, previous.Containers(), host.Containers)
}
if tokenRecord != nil {
host.TokenID = tokenRecord.ID
host.TokenName = tokenRecord.Name
host.TokenHint = tokenHintFromRecord(tokenRecord)
if tokenRecord.LastUsedAt != nil {
t := tokenRecord.LastUsedAt.UTC()
host.TokenLastUsedAt = &t
} else {
t := time.Now().UTC()
host.TokenLastUsedAt = &t
}
} else if hasPrevious {
host.TokenID = previous.TokenID()
host.TokenName = previous.TokenName()
host.TokenHint = previous.TokenHint()
host.TokenLastUsedAt = previous.TokenLastUsedAt()
}
// Load custom display name from metadata store if not already set
if host.CustomDisplayName == "" {
if hostMeta := m.dockerMetadataStore.GetHostMetadata(identifier); hostMeta != nil {
host.CustomDisplayName = hostMeta.CustomDisplayName
}
}
m.state.UpsertDockerHost(host)
m.state.SetConnectionHealth(dockerConnectionPrefix+host.ID, true)
// Check if the host was previously hidden and is now visible again
if hasPrevious && previous.Hidden() && !host.Hidden {
log.Info().
Str("dockerHost", host.Hostname).
Str("dockerHostID", host.ID).
Msg("Docker host auto-unhidden after receiving report")
}
// Check if the host was pending uninstall - if so, log a warning that uninstall failed and clear the flag
if hasPrevious && previous.PendingUninstall() {
log.Warn().
Str("dockerHost", host.Hostname).
Str("dockerHostID", host.ID).
Msg("Docker host reporting again after pending uninstall - uninstall may have failed")
// Clear the pending uninstall flag since the host is clearly still active
m.state.SetDockerHostPendingUninstall(host.ID, false)
}
if m.alertManager != nil {
m.alertManager.CheckDockerHost(host)
}
// Record Docker HOST and CONTAINER metrics for sparkline charts unless the
// canonical mock sampler owns history continuity.
if !shouldSkipNativeMockStateMetricWrites() {
now := time.Now()
hostMetricKey := fmt.Sprintf("dockerHost:%s", host.ID)
// Record host Disk usage (use first disk or calculate total)
var hostDiskPercent float64
if len(host.Disks) > 0 {
hostDiskPercent = host.Disks[0].Usage
}
if m.metricsHistory != nil {
m.metricsHistory.AddGuestMetric(hostMetricKey, "cpu", host.CPUUsage, now)
m.metricsHistory.AddGuestMetric(hostMetricKey, "memory", host.Memory.Usage, now)
m.metricsHistory.AddGuestMetric(hostMetricKey, "disk", hostDiskPercent, now)
}
if m.metricsStore != nil {
m.metricsStore.Write("dockerHost", host.ID, "cpu", host.CPUUsage, now)
m.metricsStore.Write("dockerHost", host.ID, "memory", host.Memory.Usage, now)
m.metricsStore.Write("dockerHost", host.ID, "disk", hostDiskPercent, now)
}
// Use a prefixed key (docker:containerID) to distinguish from Proxmox containers.
for _, container := range containers {
if container.ID == "" {
continue
}
metricKey := fmt.Sprintf("docker:%s", container.ID)
var diskPercent float64
if container.RootFilesystemBytes > 0 && container.WritableLayerBytes > 0 {
diskPercent = float64(container.WritableLayerBytes) / float64(container.RootFilesystemBytes) * 100
if diskPercent > 100 {
diskPercent = 100
}
}
var diskReadRate float64
var diskWriteRate float64
if container.BlockIO != nil {
if container.BlockIO.ReadRateBytesPerSecond != nil {
diskReadRate = *container.BlockIO.ReadRateBytesPerSecond
}
if container.BlockIO.WriteRateBytesPerSecond != nil {
diskWriteRate = *container.BlockIO.WriteRateBytesPerSecond
}
}
if m.metricsHistory != nil {
m.metricsHistory.AddGuestMetric(metricKey, "cpu", container.CPUPercent, now)
m.metricsHistory.AddGuestMetric(metricKey, "memory", container.MemoryPercent, now)
m.metricsHistory.AddGuestMetric(metricKey, "disk", diskPercent, now)
if container.NetInRate >= 0 {
m.metricsHistory.AddGuestMetric(metricKey, "netin", container.NetInRate, now)
}
if container.NetOutRate >= 0 {
m.metricsHistory.AddGuestMetric(metricKey, "netout", container.NetOutRate, now)
}
if diskReadRate >= 0 {
m.metricsHistory.AddGuestMetric(metricKey, "diskread", diskReadRate, now)
}
if diskWriteRate >= 0 {
m.metricsHistory.AddGuestMetric(metricKey, "diskwrite", diskWriteRate, now)
}
}
if m.metricsStore != nil {
m.metricsStore.Write("dockerContainer", container.ID, "cpu", container.CPUPercent, now)
m.metricsStore.Write("dockerContainer", container.ID, "memory", container.MemoryPercent, now)
m.metricsStore.Write("dockerContainer", container.ID, "disk", diskPercent, now)
if container.NetInRate >= 0 {
m.metricsStore.Write("dockerContainer", container.ID, "netin", container.NetInRate, now)
}
if container.NetOutRate >= 0 {
m.metricsStore.Write("dockerContainer", container.ID, "netout", container.NetOutRate, now)
}
if diskReadRate >= 0 {
m.metricsStore.Write("dockerContainer", container.ID, "diskread", diskReadRate, now)
}
if diskWriteRate >= 0 {
m.metricsStore.Write("dockerContainer", container.ID, "diskwrite", diskWriteRate, now)
}
}
}
}
log.Debug().
Str("dockerHost", host.Hostname).
Int("containers", len(containers)).
Msg("Docker host report processed")
return host, nil
}
// ApplyHostReport ingests a host agent report into the shared state.
func (m *Monitor) ApplyHostReport(report agentshost.Report, tokenRecord *config.APITokenRecord) (models.Host, error) {
hostname := strings.TrimSpace(report.Host.Hostname)
if hostname == "" {
return models.Host{}, fmt.Errorf("host report missing hostname")
}
baseIdentifier := strings.TrimSpace(report.Host.ID)
if baseIdentifier != "" {
baseIdentifier = sanitizeDockerHostSuffix(baseIdentifier)
}
if baseIdentifier == "" {
if machine := sanitizeDockerHostSuffix(report.Host.MachineID); machine != "" {
baseIdentifier = machine
}
}
if baseIdentifier == "" {
if agentID := sanitizeDockerHostSuffix(report.Agent.ID); agentID != "" {
baseIdentifier = agentID
}
}
if baseIdentifier == "" {
if hostName := sanitizeDockerHostSuffix(hostname); hostName != "" {
baseIdentifier = hostName
}
}
if baseIdentifier == "" {
seedParts := uniqueNonEmptyStrings(
report.Host.MachineID,
report.Agent.ID,
report.Host.Hostname,
)
if len(seedParts) == 0 {
seedParts = []string{hostname}
}
seed := strings.Join(seedParts, "|")
sum := sha1.Sum([]byte(seed))
baseIdentifier = fmt.Sprintf("agent-%s", hex.EncodeToString(sum[:6]))
}
readState := m.snapshotBackedUnifiedReadState()
var existingHosts []*unifiedresources.HostView
if readState != nil {
existingHosts = readState.Hosts()
}
identifier := baseIdentifier
if tokenRecord != nil && strings.TrimSpace(tokenRecord.ID) != "" {
tokenID := strings.TrimSpace(tokenRecord.ID)
bindingKey := fmt.Sprintf("%s:%s", tokenID, hostname)
m.mu.Lock()
if m.hostTokenBindings == nil {
m.hostTokenBindings = make(map[string]string)
}
boundID := strings.TrimSpace(m.hostTokenBindings[bindingKey])
m.mu.Unlock()
// If we already have a binding for this token+hostname, use it to keep host IDs stable
// even if another colliding host disappears later.
if boundID != "" {
identifier = boundID
} else {
bindingID := baseIdentifier
for _, candidate := range existingHosts {
if candidate == nil || candidate.AgentID() != bindingID {
continue
}
if strings.TrimSpace(candidate.Hostname()) == hostname && strings.TrimSpace(candidate.TokenID()) == tokenID {
break
}
seed := strings.Join([]string{tokenID, hostname, bindingID}, "|")
sum := sha1.Sum([]byte(seed))
suffix := hex.EncodeToString(sum[:4])
base := bindingID
if base == "" {
base = "agent"
}
if len(base) > 40 {
base = base[:40]
}
bindingID = fmt.Sprintf("%s-%s", base, suffix)
break
}
m.mu.Lock()
if m.hostTokenBindings == nil {
m.hostTokenBindings = make(map[string]string)
}
if existing := strings.TrimSpace(m.hostTokenBindings[bindingKey]); existing != "" {
identifier = existing
} else {
m.hostTokenBindings[bindingKey] = bindingID
log.Debug().
Str("tokenID", tokenID).
Str("hostID", bindingID).
Str("hostname", hostname).
Msg("Bound host agent token to hostname")
identifier = bindingID
}
m.mu.Unlock()
}
}
removedAt, wasRemoved := m.lookupRemovedHostAgent(identifier, hostname)
if wasRemoved {
log.Info().
Str("hostID", identifier).
Time("removedAt", removedAt).
Msg("Rejecting report from deliberately removed host agent")
return models.Host{}, fmt.Errorf("host agent %q had monitoring stopped at %v and cannot report again. Use Allow reconnect in Settings -> Infrastructure before reconnecting this host", identifier, removedAt.Format(time.RFC3339))
}
var previous *unifiedresources.HostView
var hasPrevious bool
for _, candidate := range existingHosts {
if candidate != nil && candidate.AgentID() == identifier {
previous = candidate
hasPrevious = true
break
}
}
displayName := strings.TrimSpace(report.Host.DisplayName)
if displayName == "" {
displayName = hostname
}
timestamp := report.Timestamp
if timestamp.IsZero() {
timestamp = time.Now().UTC()
}
memory := models.Memory{
Total: report.Metrics.Memory.TotalBytes,
Used: report.Metrics.Memory.UsedBytes,
Free: report.Metrics.Memory.FreeBytes,
Usage: safeFloat(report.Metrics.Memory.Usage),
SwapTotal: report.Metrics.Memory.SwapTotal,
SwapUsed: report.Metrics.Memory.SwapUsed,
}
// Fallback for LXC environments: gopsutil may read Total and Free correctly
// from cgroup limits but return 0 for Used. Calculate Used from Total - Free.
if memory.Used <= 0 && memory.Total > 0 && memory.Free > 0 {
memory.Used = memory.Total - memory.Free
if memory.Used < 0 {
memory.Used = 0
}
}
if memory.Usage <= 0 && memory.Total > 0 {
memory.Usage = safePercentage(float64(memory.Used), float64(memory.Total))
}
disks := make([]models.Disk, 0, len(report.Disks))
for _, disk := range report.Disks {
// Filter virtual/system filesystems and read-only filesystems to avoid cluttering
// the UI with tmpfs, devtmpfs, /dev, /run, /sys, docker overlay mounts, snap mounts,
// immutable OS images, etc. (issues #505, #690, #790).
if shouldSkip, _ := fsfilters.ShouldSkipFilesystem(disk.Type, disk.Mountpoint, uint64(disk.TotalBytes), uint64(disk.UsedBytes)); shouldSkip {
continue
}
usage := safeFloat(disk.Usage)
if usage <= 0 && disk.TotalBytes > 0 {
usage = safePercentage(float64(disk.UsedBytes), float64(disk.TotalBytes))
}
disks = append(disks, models.Disk{
Total: disk.TotalBytes,
Used: disk.UsedBytes,
Free: disk.FreeBytes,
Usage: usage,
Mountpoint: disk.Mountpoint,
Type: disk.Type,
Device: disk.Device,
})
}
diskIO := make([]models.DiskIO, 0, len(report.DiskIO))
for _, io := range report.DiskIO {
diskIO = append(diskIO, models.DiskIO{
Device: io.Device,
ReadBytes: io.ReadBytes,
WriteBytes: io.WriteBytes,
ReadOps: io.ReadOps,
WriteOps: io.WriteOps,
ReadTime: io.ReadTime,
WriteTime: io.WriteTime,
IOTime: io.IOTime,
})
}
network := make([]models.HostNetworkInterface, 0, len(report.Network))
for _, nic := range report.Network {
network = append(network, models.HostNetworkInterface{
Name: nic.Name,
MAC: nic.MAC,
Addresses: append([]string(nil), nic.Addresses...),
RXBytes: nic.RXBytes,
TXBytes: nic.TXBytes,
SpeedMbps: nic.SpeedMbps,
})
}
raid := make([]models.HostRAIDArray, 0, len(report.RAID))
for _, array := range report.RAID {
devices := make([]models.HostRAIDDevice, 0, len(array.Devices))
for _, dev := range array.Devices {
devices = append(devices, models.HostRAIDDevice{
Device: dev.Device,
State: dev.State,
Slot: dev.Slot,
})
}
raid = append(raid, models.HostRAIDArray{
Device: array.Device,
Name: array.Name,
Level: array.Level,
State: array.State,
TotalDevices: array.TotalDevices,
ActiveDevices: array.ActiveDevices,
WorkingDevices: array.WorkingDevices,
FailedDevices: array.FailedDevices,
SpareDevices: array.SpareDevices,
UUID: array.UUID,
Devices: devices,
RebuildPercent: array.RebuildPercent,
RebuildSpeed: array.RebuildSpeed,
})
}
// Convert Ceph data from agent report
var cephData *models.HostCephCluster
if report.Ceph != nil {
cephData = convertAgentCephToModels(report.Ceph)
}
var unraidData *models.HostUnraidStorage
if report.Unraid != nil {
disks := make([]models.HostUnraidDisk, 0, len(report.Unraid.Disks))
for _, disk := range report.Unraid.Disks {
disks = append(disks, models.HostUnraidDisk{
Name: strings.TrimSpace(disk.Name),
Device: strings.TrimSpace(disk.Device),
Role: strings.TrimSpace(disk.Role),
Status: strings.TrimSpace(disk.Status),
RawStatus: strings.TrimSpace(disk.RawStatus),
Serial: strings.TrimSpace(disk.Serial),
Filesystem: strings.TrimSpace(disk.Filesystem),
SizeBytes: disk.SizeBytes,
Slot: disk.Slot,
})
}
unraidData = &models.HostUnraidStorage{
ArrayStarted: report.Unraid.ArrayStarted,
ArrayState: strings.TrimSpace(report.Unraid.ArrayState),
SyncAction: strings.TrimSpace(report.Unraid.SyncAction),
SyncProgress: report.Unraid.SyncProgress,
SyncErrors: report.Unraid.SyncErrors,
NumProtected: report.Unraid.NumProtected,
NumDisabled: report.Unraid.NumDisabled,
NumInvalid: report.Unraid.NumInvalid,
NumMissing: report.Unraid.NumMissing,
Disks: disks,
}
}
host := models.Host{
ID: identifier,
Hostname: hostname,
DisplayName: displayName,
Platform: strings.TrimSpace(strings.ToLower(report.Host.Platform)),
OSName: strings.TrimSpace(report.Host.OSName),
OSVersion: strings.TrimSpace(report.Host.OSVersion),
KernelVersion: strings.TrimSpace(report.Host.KernelVersion),
Architecture: strings.TrimSpace(report.Host.Architecture),
CPUCount: report.Host.CPUCount,
CPUUsage: safeFloat(report.Metrics.CPUUsagePercent),
LoadAverage: append([]float64(nil), report.Host.LoadAverage...),
Memory: memory,
Disks: disks,
DiskIO: diskIO,
NetworkInterfaces: network,
Sensors: models.HostSensorSummary{
TemperatureCelsius: cloneStringFloatMap(report.Sensors.TemperatureCelsius),
FanRPM: cloneStringFloatMap(report.Sensors.FanRPM),
Additional: cloneStringFloatMap(report.Sensors.Additional),
SMART: convertAgentSMARTToModels(report.Sensors.SMART),
},
RAID: raid,
Unraid: unraidData,
Ceph: cephData,
Status: "online",
UptimeSeconds: report.Host.UptimeSeconds,
IntervalSeconds: report.Agent.IntervalSeconds,
LastSeen: timestamp,
AgentVersion: strings.TrimSpace(report.Agent.Version),
MachineID: strings.TrimSpace(report.Host.MachineID),
CommandsEnabled: report.Agent.CommandsEnabled,
ReportIP: strings.TrimSpace(report.Host.ReportIP),
Tags: append([]string(nil), report.Tags...),
DiskExclude: append([]string(nil), report.Agent.DiskExclude...),
IsLegacy: isLegacyAgent(report.Agent.Type),
}
// Apply any pending commands-execution override from the server config.
// This keeps the UI stable when the user toggles the setting, even if the
// agent hasn't yet picked up the new config in this report cycle.
if cfg := m.GetHostAgentConfig(identifier); cfg.CommandsEnabled != nil {
host.CommandsEnabled = *cfg.CommandsEnabled
}
if len(host.LoadAverage) == 0 {
host.LoadAverage = nil
}
if len(host.Disks) == 0 {
host.Disks = nil
}
if len(host.DiskIO) == 0 {
host.DiskIO = nil
}
if len(host.NetworkInterfaces) == 0 {
host.NetworkInterfaces = nil
}
if len(host.RAID) == 0 {
host.RAID = nil
}
if tokenRecord != nil {
host.TokenID = tokenRecord.ID
host.TokenName = tokenRecord.Name
host.TokenHint = tokenHintFromRecord(tokenRecord)
if tokenRecord.LastUsedAt != nil {
t := tokenRecord.LastUsedAt.UTC()
host.TokenLastUsedAt = &t
} else {
now := time.Now().UTC()
host.TokenLastUsedAt = &now
}
} else if hasPrevious {
host.TokenID = previous.TokenID()
host.TokenName = previous.TokenName()
host.TokenHint = previous.TokenHint()
host.TokenLastUsedAt = previous.TokenLastUsedAt()
}
// Link host agent to matching PVE node/VM/container by hostname
// This prevents duplication when users install agents on PVE cluster nodes
linkedNodeID, linkedVMID, linkedContainerID := m.findLinkedProxmoxEntityWithHints(
hostname,
report.Host.ReportIP,
report.Network,
)
if linkedNodeID != "" {
host.LinkedNodeID = linkedNodeID
log.Debug().
Str("hostId", identifier).
Str("hostname", hostname).
Str("linkedNodeId", linkedNodeID).
Msg("Linked host agent to PVE node")
}
if linkedVMID != "" {
host.LinkedVMID = linkedVMID
log.Debug().
Str("hostId", identifier).
Str("hostname", hostname).
Str("linkedVmId", linkedVMID).
Msg("Linked host agent to VM")
}
if linkedContainerID != "" {
host.LinkedContainerID = linkedContainerID
log.Debug().
Str("hostId", identifier).
Str("hostname", hostname).
Str("linkedContainerId", linkedContainerID).
Msg("Linked host agent to container")
}
// Compute I/O rates from cumulative counters before adding to state.
// Network and disk bytes from the agent are cumulative totals since boot;
// the RateTracker converts them to bytes/second, just like VMs and containers.
now := time.Now()
var totalRXBytes, totalTXBytes uint64
for _, nic := range host.NetworkInterfaces {
totalRXBytes += nic.RXBytes
totalTXBytes += nic.TXBytes
}
var totalDiskReadBytes, totalDiskWriteBytes uint64
var totalDiskBusyMs uint64
for _, d := range host.DiskIO {
totalDiskReadBytes += d.ReadBytes
totalDiskWriteBytes += d.WriteBytes
totalDiskBusyMs += d.IOTime
}
hostRateKey := fmt.Sprintf("agent:%s", host.ID)
currentMetrics := IOMetrics{
DiskRead: int64(totalDiskReadBytes),
DiskWrite: int64(totalDiskWriteBytes),
DiskBusy: int64(totalDiskBusyMs),
NetworkIn: int64(totalRXBytes),
NetworkOut: int64(totalTXBytes),
Timestamp: now,
}
diskReadRate, diskWriteRate, netInRate, netOutRate := m.rateTracker.CalculateRates(hostRateKey, currentMetrics)
// Store computed rates on the host model so they flow through to unified resources
if netInRate >= 0 {
host.NetInRate = netInRate
}
if netOutRate >= 0 {
host.NetOutRate = netOutRate
}
if diskReadRate >= 0 {
host.DiskReadRate = diskReadRate
}
if diskWriteRate >= 0 {
host.DiskWriteRate = diskWriteRate
}
m.state.UpsertHost(host)
m.state.SetConnectionHealth(hostConnectionPrefix+host.ID, true)
// Update the linked PVE node to point back to this host agent
if host.LinkedNodeID != "" {
m.linkNodeToHostAgent(host.LinkedNodeID, host.ID)
}
// If host reports Ceph data, also update the global CephClusters state
if report.Ceph != nil {
cephCluster := convertAgentCephToGlobalCluster(report.Ceph, hostname, identifier, timestamp)
m.state.UpsertCephCluster(cephCluster)
log.Debug().
Str("hostId", identifier).
Str("hostname", hostname).
Str("fsid", cephCluster.FSID).
Str("health", cephCluster.Health).
Int("osds", cephCluster.NumOSDs).
Msg("Updated Ceph cluster from host agent")
}
if m.alertManager != nil {
m.alertManager.CheckHost(host)
}
// Record host-agent metrics for sparkline charts.
hostMetricKey := fmt.Sprintf("agent:%s", host.ID)
var hostDiskPercent float64
if len(host.Disks) > 0 {
hostDiskPercent = host.Disks[0].Usage
}
if !shouldSkipNativeMockStateMetricWrites() {
if m.metricsHistory != nil {
m.metricsHistory.AddGuestMetric(hostMetricKey, "cpu", host.CPUUsage, now)
m.metricsHistory.AddGuestMetric(hostMetricKey, "memory", host.Memory.Usage, now)
m.metricsHistory.AddGuestMetric(hostMetricKey, "disk", hostDiskPercent, now)
if netInRate >= 0 {
m.metricsHistory.AddGuestMetric(hostMetricKey, "netin", netInRate, now)
}
if netOutRate >= 0 {
m.metricsHistory.AddGuestMetric(hostMetricKey, "netout", netOutRate, now)
}
if diskReadRate >= 0 {
m.metricsHistory.AddGuestMetric(hostMetricKey, "diskread", diskReadRate, now)
}
if diskWriteRate >= 0 {
m.metricsHistory.AddGuestMetric(hostMetricKey, "diskwrite", diskWriteRate, now)
}
}
m.writeHostPhysicalDiskIOMetrics(host, now)
if m.metricsStore != nil {
m.metricsStore.Write("agent", host.ID, "cpu", host.CPUUsage, now)
m.metricsStore.Write("agent", host.ID, "memory", host.Memory.Usage, now)
m.metricsStore.Write("agent", host.ID, "disk", hostDiskPercent, now)
m.writeHostSMARTMetrics(host, now)
if netInRate >= 0 {
m.metricsStore.Write("agent", host.ID, "netin", netInRate, now)
}
if netOutRate >= 0 {
m.metricsStore.Write("agent", host.ID, "netout", netOutRate, now)
}
if diskReadRate >= 0 {
m.metricsStore.Write("agent", host.ID, "diskread", diskReadRate, now)
}
if diskWriteRate >= 0 {
m.metricsStore.Write("agent", host.ID, "diskwrite", diskWriteRate, now)
}
}
}
// Store cluster peer sensor data if present and evict stale entries
m.applyClusterSensors(report.ClusterSensors, timestamp)
return host, nil
}
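// writeHostSMARTMetrics records per-disk SMART readings (temperature plus wear
// and error-counter attributes) to the metrics store, keyed by each disk's
// unified resource ID.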
func (m *Monitor) writeHostSMARTMetrics(host models.Host, now time.Time) {
if shouldSkipNativeMockStateMetricWrites() || m.metricsStore == nil {
return
}
for _, disk := range host.Sensors.SMART {
resourceID := unifiedresources.HostSMARTDiskSourceID(host, disk)
if resourceID == "" {
continue
}
if disk.Temperature > 0 {
m.metricsStore.Write("disk", resourceID, "smart_temp", float64(disk.Temperature), now)
}
attrs := disk.Attributes
if attrs == nil {
continue
}
if attrs.PowerOnHours != nil {
m.metricsStore.Write("disk", resourceID, "smart_power_on_hours", float64(*attrs.PowerOnHours), now)
}
if attrs.PowerCycles != nil {
m.metricsStore.Write("disk", resourceID, "smart_power_cycles", float64(*attrs.PowerCycles), now)
}
if attrs.ReallocatedSectors != nil {
m.metricsStore.Write("disk", resourceID, "smart_reallocated_sectors", float64(*attrs.ReallocatedSectors), now)
}
if attrs.PendingSectors != nil {
m.metricsStore.Write("disk", resourceID, "smart_pending_sectors", float64(*attrs.PendingSectors), now)
}
if attrs.OfflineUncorrectable != nil {
m.metricsStore.Write("disk", resourceID, "smart_offline_uncorrectable", float64(*attrs.OfflineUncorrectable), now)
}
if attrs.UDMACRCErrors != nil {
m.metricsStore.Write("disk", resourceID, "smart_crc_errors", float64(*attrs.UDMACRCErrors), now)
}
if attrs.PercentageUsed != nil {
m.metricsStore.Write("disk", resourceID, "smart_percentage_used", float64(*attrs.PercentageUsed), now)
}
if attrs.AvailableSpare != nil {
m.metricsStore.Write("disk", resourceID, "smart_available_spare", float64(*attrs.AvailableSpare), now)
}
if attrs.MediaErrors != nil {
m.metricsStore.Write("disk", resourceID, "smart_media_errors", float64(*attrs.MediaErrors), now)
}
if attrs.UnsafeShutdowns != nil {
m.metricsStore.Write("disk", resourceID, "smart_unsafe_shutdowns", float64(*attrs.UnsafeShutdowns), now)
}
}
}
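// normalizeHostDiskDevice strips surrounding whitespace and a leading "/dev/"
// prefix so device names from different sources compare consistently.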
func normalizeHostDiskDevice(device string) string {
return strings.TrimSpace(strings.TrimPrefix(strings.TrimSpace(device), "/dev/"))
}
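// hostDiskIOMetricResourceID maps a disk I/O sample to the unified resource ID
// of the matching SMART disk, skipping disks in standby. It returns "" when no
// match is found.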
func hostDiskIOMetricResourceID(host models.Host, io models.DiskIO) string {
device := normalizeHostDiskDevice(io.Device)
if device == "" {
return ""
}
for _, disk := range host.Sensors.SMART {
if disk.Standby {
continue
}
if strings.EqualFold(normalizeHostDiskDevice(disk.Device), device) {
return unifiedresources.HostSMARTDiskSourceID(host, disk)
}
}
return ""
}
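// writeHostPhysicalDiskIOMetrics converts cumulative per-device I/O counters
// into read, write, and busy rates via the rate tracker and records them
// against the matching SMART disk resource, deduplicating by resource ID.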
func (m *Monitor) writeHostPhysicalDiskIOMetrics(host models.Host, now time.Time) {
if shouldSkipNativeMockStateMetricWrites() || (m.metricsHistory == nil && m.metricsStore == nil) {
return
}
if len(host.DiskIO) == 0 || len(host.Sensors.SMART) == 0 {
return
}
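// Deduplicate by resource ID so two device entries that resolve to the same
// SMART disk only produce one sample per cycle.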
seenResourceIDs := make(map[string]struct{}, len(host.DiskIO))
for _, io := range host.DiskIO {
resourceID := hostDiskIOMetricResourceID(host, io)
if resourceID == "" {
continue
}
if _, seen := seenResourceIDs[resourceID]; seen {
continue
}
seenResourceIDs[resourceID] = struct{}{}
trackerKey := fmt.Sprintf("disk:%s:%s", host.ID, normalizeHostDiskDevice(io.Device))
current := IOMetrics{
DiskRead: int64(io.ReadBytes),
DiskWrite: int64(io.WriteBytes),
DiskBusy: int64(io.IOTime),
Timestamp: now,
}
readRate, writeRate, busyPct, _, _ := m.rateTracker.CalculateRatesWithBusy(trackerKey, current)
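// Negative rates mean the tracker has no usable delta yet (typically the
// first sample for this key); skip those writes.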
if readRate >= 0 {
if m.metricsHistory != nil {
m.metricsHistory.AddDiskMetric(resourceID, "diskread", readRate, now)
}
if m.metricsStore != nil {
m.metricsStore.Write("disk", resourceID, "diskread", readRate, now)
}
}
if writeRate >= 0 {
if m.metricsHistory != nil {
m.metricsHistory.AddDiskMetric(resourceID, "diskwrite", writeRate, now)
}
if m.metricsStore != nil {
m.metricsStore.Write("disk", resourceID, "diskwrite", writeRate, now)
}
}
if busyPct >= 0 {
if m.metricsHistory != nil {
m.metricsHistory.AddDiskMetric(resourceID, "disk", busyPct, now)
}
if m.metricsStore != nil {
m.metricsStore.Write("disk", resourceID, "disk", busyPct, now)
}
}
}
}
// applyClusterSensors stores temperature data collected from Proxmox cluster
// siblings via SSH. Each entry is keyed by lowercase node name so that
// getHostAgentTemperatureByID can use it as a fallback.
func (m *Monitor) applyClusterSensors(entries []agentshost.ClusterNodeSensors, reportTime time.Time) {
// Fast path: when there is nothing to add and the cache is already empty, skip the write lock entirely.
if len(entries) == 0 {
m.clusterSensorsMu.RLock()
empty := len(m.clusterSensorsCache) == 0
m.clusterSensorsMu.RUnlock()
if empty {
return
}
}
m.clusterSensorsMu.Lock()
defer m.clusterSensorsMu.Unlock()
for _, entry := range entries {
nodeName := strings.ToLower(strings.TrimSpace(entry.NodeName))
if nodeName == "" {
continue
}
if len(entry.Sensors.TemperatureCelsius) == 0 {
continue
}
m.clusterSensorsCache[nodeName] = clusterSensorsCacheEntry{
sensors: models.HostSensorSummary{
TemperatureCelsius: cloneStringFloatMap(entry.Sensors.TemperatureCelsius),
FanRPM: cloneStringFloatMap(entry.Sensors.FanRPM),
Additional: cloneStringFloatMap(entry.Sensors.Additional),
},
updatedAt: reportTime,
}
}
// Evict stale entries to prevent unbounded cache growth.
// Cluster sizes are small (3-16 nodes) so this is cheap.
const staleThreshold = 5 * time.Minute
now := time.Now()
for key, entry := range m.clusterSensorsCache {
if now.Sub(entry.updatedAt) > staleThreshold {
delete(m.clusterSensorsCache, key)
}
}
}
// findLinkedProxmoxEntity searches for a PVE node, VM, or container with a matching hostname.
// At most one of the returned IDs is non-empty; all three are empty when nothing matches.
// When multiple entities match the same hostname (e.g., two PVE instances both have a node
// named "pve"), this function returns empty strings to avoid incorrect linking. Users should
// manually link agents to nodes via the UI in such cases.
func (m *Monitor) findLinkedProxmoxEntity(hostname string) (nodeID, vmID, containerID string) {
return m.findLinkedProxmoxEntityWithHints(hostname, "", nil)
}
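// collectReportedHostIPs builds a set of normalized IP addresses reported by the
// host agent: the primary report IP plus every address on every reported network
// interface. The set is used to match configured PVE endpoints against the
// reporting host.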
func collectReportedHostIPs(
reportIP string,
network []agentshost.NetworkInterface,
) map[string]struct{} {
ips := make(map[string]struct{})
if normalized := unifiedresources.NormalizeIP(reportIP); normalized != "" {
ips[normalized] = struct{}{}
}
for _, nic := range network {
for _, address := range nic.Addresses {
if normalized := unifiedresources.NormalizeIP(address); normalized != "" {
ips[normalized] = struct{}{}
}
}
}
return ips
}
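// endpointHostMatchesReportedHints reports whether a configured endpoint host
// refers to the reporting agent. IP endpoints are matched against the set of
// reported addresses; hostname endpoints are compared case-insensitively against
// the reported hostname.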
func endpointHostMatchesReportedHints(
endpointHost string,
reportedHostname string,
reportedIPs map[string]struct{},
) bool {
normalizedEndpointHost := strings.TrimSpace(strings.ToLower(endpointHost))
if normalizedEndpointHost == "" {
return false
}
if normalizedReportedIP := unifiedresources.NormalizeIP(normalizedEndpointHost); normalizedReportedIP != "" {
_, ok := reportedIPs[normalizedReportedIP]
return ok
}
normalizedReportedHostname := strings.TrimSpace(strings.ToLower(reportedHostname))
return normalizedReportedHostname != "" && normalizedEndpointHost == normalizedReportedHostname
}
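// findLinkedProxmoxEntityWithHints resolves the PVE node, VM, or container that
// corresponds to a reporting host agent. Endpoint/IP hints are checked first
// because they can disambiguate clustered nodes that share a short hostname;
// hostname matching (full and domain-stripped) is the fallback. Any ambiguous
// match logs a warning and returns empty IDs so the user can link the agent
// manually via the UI.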
func (m *Monitor) findLinkedProxmoxEntityWithHints(
hostname string,
reportIP string,
network []agentshost.NetworkInterface,
) (nodeID, vmID, containerID string) {
if hostname == "" {
return "", "", ""
}
// Normalize hostname for comparison (lowercase, strip domain)
normalizedHostname := strings.ToLower(hostname)
shortHostname := normalizedHostname
if idx := strings.Index(normalizedHostname, "."); idx > 0 {
shortHostname = normalizedHostname[:idx]
}
matchHostname := func(name string) bool {
normalized := strings.ToLower(name)
if normalized == normalizedHostname || normalized == shortHostname {
return true
}
// Also check short version of the candidate
if idx := strings.Index(normalized, "."); idx > 0 {
if normalized[:idx] == shortHostname {
return true
}
}
return false
}
readState := m.GetUnifiedReadStateOrSnapshot()
if readState == nil {
return "", "", ""
}
type linkedEntityMatch struct {
id string
instance string
}
reportedIPs := collectReportedHostIPs(reportIP, network)
// First, try to match the configured PVE node endpoint against the host report.
// This is stronger than node-name matching and can disambiguate clustered nodes
// that share the same short hostname but have different management addresses.
var endpointMatchedNodes []linkedEntityMatch
for _, node := range readState.Nodes() {
if endpointHostMatchesReportedHints(extractHostname(node.HostURL()), hostname, reportedIPs) {
endpointMatchedNodes = append(endpointMatchedNodes, linkedEntityMatch{
id: node.SourceID(),
instance: node.Instance(),
})
}
}
if len(endpointMatchedNodes) == 1 {
return endpointMatchedNodes[0].id, "", ""
}
if len(endpointMatchedNodes) > 1 {
log.Warn().
Str("hostname", hostname).
Str("reportIP", strings.TrimSpace(reportIP)).
Int("matchCount", len(endpointMatchedNodes)).
Msg("Multiple PVE node endpoints match host report hints - cannot auto-link host agent. Manual linking required via UI.")
return "", "", ""
}
// Check PVE nodes first - but detect ambiguity when multiple nodes match
var matchingNodes []linkedEntityMatch
for _, node := range readState.Nodes() {
if matchHostname(node.Name()) {
matchingNodes = append(matchingNodes, linkedEntityMatch{
id: node.SourceID(),
instance: node.Instance(),
})
}
}
if len(matchingNodes) == 1 {
return matchingNodes[0].id, "", ""
}
if len(matchingNodes) > 1 {
// Multiple nodes with the same hostname - can't auto-link, would cause data mixing
log.Warn().
Str("hostname", hostname).
Int("matchCount", len(matchingNodes)).
Strs("instances", func() []string {
instances := make([]string, len(matchingNodes))
for i, n := range matchingNodes {
instances[i] = n.instance
}
return instances
}()).
Msg("Multiple PVE nodes match hostname - cannot auto-link host agent. Manual linking required via UI.")
return "", "", ""
}
// Check VMs - same pattern for ambiguity detection
var matchingVMs []linkedEntityMatch
for _, vm := range readState.VMs() {
if matchHostname(vm.Name()) {
matchingVMs = append(matchingVMs, linkedEntityMatch{
id: vm.SourceID(),
})
}
}
if len(matchingVMs) == 1 {
return "", matchingVMs[0].id, ""
}
if len(matchingVMs) > 1 {
log.Warn().
Str("hostname", hostname).
Int("matchCount", len(matchingVMs)).
Msg("Multiple VMs match hostname - cannot auto-link host agent. Manual linking required via UI.")
return "", "", ""
}
// Check containers - same pattern
var matchingCTs []linkedEntityMatch
for _, ct := range readState.Containers() {
if matchHostname(ct.Name()) {
matchingCTs = append(matchingCTs, linkedEntityMatch{
id: ct.SourceID(),
})
}
}
if len(matchingCTs) == 1 {
return "", "", matchingCTs[0].id
}
if len(matchingCTs) > 1 {
log.Warn().
Str("hostname", hostname).
Int("matchCount", len(matchingCTs)).
Msg("Multiple containers match hostname - cannot auto-link host agent. Manual linking required via UI.")
return "", "", ""
}
return "", "", ""
}
// linkNodeToHostAgent updates a PVE node to link to its host agent.
func (m *Monitor) linkNodeToHostAgent(nodeID, hostAgentID string) {
m.state.LinkNodeToHostAgent(nodeID, hostAgentID)
}
const (
removedDockerHostsTTL = 24 * time.Hour // Clean up removed hosts tracking after 24 hours
removedHostAgentsTTL = 24 * time.Hour // Clean up removed host agent tracking after 24 hours
)
// recoverFromPanic recovers from panics in monitoring goroutines and logs them.
// This prevents a panic in one component from crashing the entire monitoring system.
func recoverFromPanic(goroutineName string) {
if r := recover(); r != nil {
log.Error().
Str("goroutine", goroutineName).
Interface("panic", r).
Stack().
Msg("Recovered from panic in monitoring goroutine")
}
}
// cleanupRemovedDockerHosts removes entries from the removed hosts map that are older than 24 hours.
func (m *Monitor) cleanupRemovedDockerHosts(now time.Time) {
// Collect IDs to remove first to avoid holding lock during state update
var toRemove []string
m.mu.Lock()
for hostID, removedAt := range m.removedDockerHosts {
if now.Sub(removedAt) > removedDockerHostsTTL {
toRemove = append(toRemove, hostID)
}
}
m.mu.Unlock()
// Remove from state and map without holding both locks
for _, hostID := range toRemove {
m.state.RemoveRemovedDockerHost(hostID)
m.mu.Lock()
removedAt := m.removedDockerHosts[hostID]
delete(m.removedDockerHosts, hostID)
m.mu.Unlock()
log.Debug().
Str("dockerHostID", hostID).
Time("removedAt", removedAt).
Msg("Cleaned up old removed Docker host entry")
}
}
// cleanupRemovedHostAgents removes entries from the removed host-agent map that are older than 24 hours.
func (m *Monitor) cleanupRemovedHostAgents(now time.Time) {
var toRemove []string
m.mu.Lock()
for hostID, removedAt := range m.removedHostAgents {
if now.Sub(removedAt) > removedHostAgentsTTL {
toRemove = append(toRemove, hostID)
}
}
m.mu.Unlock()
for _, hostID := range toRemove {
m.state.RemoveRemovedHostAgent(hostID)
m.mu.Lock()
removedAt := m.removedHostAgents[hostID]
delete(m.removedHostAgents, hostID)
m.mu.Unlock()
log.Debug().
Str("hostID", hostID).
Time("removedAt", removedAt).
Msg("Cleaned up old removed host agent entry")
}
}
// cleanupGuestMetadataCache removes stale guest metadata cache and limiter entries.
// Entries older than 2x the cache TTL (10 minutes) are removed to prevent unbounded growth
// when VMs are deleted or moved.
func (m *Monitor) cleanupGuestMetadataCache(now time.Time) {
const maxAge = 2 * guestMetadataCacheTTL // 10 minutes
m.guestMetadataMu.Lock()
for key, entry := range m.guestMetadataCache {
if now.Sub(entry.fetchedAt) > maxAge {
delete(m.guestMetadataCache, key)
log.Debug().
Str("key", key).
Time("fetchedAt", entry.fetchedAt).
Msg("Cleaned up stale guest metadata cache entry")
}
}
m.guestMetadataMu.Unlock()
m.guestMetadataLimiterMu.Lock()
defer m.guestMetadataLimiterMu.Unlock()
for key, nextAllowed := range m.guestMetadataLimiter {
// Keep near-term limiter state; remove long-idle keys.
if now.Sub(nextAllowed) > maxAge {
delete(m.guestMetadataLimiter, key)
log.Debug().
Str("key", key).
Time("nextAllowed", nextAllowed).
Msg("Cleaned up stale guest metadata limiter entry")
}
}
}
// cleanupTrackingMaps removes stale entries from various tracking maps to prevent unbounded memory growth.
// This cleans up auth tracking, polling timestamps, and circuit breaker state for resources
// that haven't been accessed in over 24 hours.
func (m *Monitor) cleanupTrackingMaps(now time.Time) {
const staleThreshold = 24 * time.Hour
cutoff := now.Add(-staleThreshold)
cleaned := 0
activeKeys := m.activeSchedulerKeys()
m.mu.Lock()
defer m.mu.Unlock()
// Clean up auth tracking maps - entries older than 24 hours
for nodeID, ts := range m.lastAuthAttempt {
if ts.Before(cutoff) {
delete(m.lastAuthAttempt, nodeID)
delete(m.authFailures, nodeID)
cleaned++
}
}
// Clean up last cluster check timestamps
for instanceID, ts := range m.lastClusterCheck {
if ts.Before(cutoff) {
delete(m.lastClusterCheck, instanceID)
cleaned++
}
}
// Clean up last physical disk poll timestamps
for instanceID, ts := range m.lastPhysicalDiskPoll {
if ts.Before(cutoff) {
delete(m.lastPhysicalDiskPoll, instanceID)
cleaned++
}
}
// Clean up last PVE backup poll timestamps
for instanceID, ts := range m.lastPVEBackupPoll {
if ts.Before(cutoff) {
delete(m.lastPVEBackupPoll, instanceID)
cleaned++
}
}
// Clean up last PBS backup poll timestamps
for instanceID, ts := range m.lastPBSBackupPoll {
if ts.Before(cutoff) {
delete(m.lastPBSBackupPoll, instanceID)
cleaned++
}
}
// Only clean up circuit breakers for inactive keys that have been idle
// for longer than the stale threshold
for key, breaker := range m.circuitBreakers {
if _, active := activeKeys[key]; !active {
// Key is not in active clients - check if breaker is stale
if breaker != nil {
_, _, _, _, lastTransition := breaker.stateDetails()
if now.Sub(lastTransition) > staleThreshold {
delete(m.circuitBreakers, key)
delete(m.failureCounts, key)
delete(m.lastOutcome, key)
cleaned++
}
}
}
}
if cleaned > 0 {
log.Debug().
Int("entriesCleaned", cleaned).
Msg("Cleaned stale entries from monitor tracking maps")
}
}
// cleanupDiagnosticSnapshots removes stale diagnostic snapshots.
// Snapshots older than 1 hour are removed to prevent unbounded growth
// when nodes/VMs are deleted or reconfigured.
func (m *Monitor) cleanupDiagnosticSnapshots(now time.Time) {
const maxAge = 1 * time.Hour
m.diagMu.Lock()
defer m.diagMu.Unlock()
for key, snapshot := range m.nodeSnapshots {
if now.Sub(snapshot.RetrievedAt) > maxAge {
delete(m.nodeSnapshots, key)
log.Debug().
Str("key", key).
Time("retrievedAt", snapshot.RetrievedAt).
Msg("Cleaned up stale node snapshot")
}
}
for key, snapshot := range m.guestSnapshots {
if now.Sub(snapshot.RetrievedAt) > maxAge {
delete(m.guestSnapshots, key)
log.Debug().
Str("key", key).
Time("retrievedAt", snapshot.RetrievedAt).
Msg("Cleaned up stale guest snapshot")
}
}
}
// cleanupRRDCache removes stale RRD memory cache entries.
// Entries that have outlived their short cache windows (roughly twice the
// underlying TTL) are evicted to prevent unbounded growth when nodes or guests
// disappear from the poll set.
func (m *Monitor) cleanupRRDCache(now time.Time) {
const maxAge = 2 * nodeRRDCacheTTL // 1 minute
m.rrdCacheMu.Lock()
defer m.rrdCacheMu.Unlock()
for key, entry := range m.nodeRRDMemCache {
if now.Sub(entry.fetchedAt) > maxAge {
delete(m.nodeRRDMemCache, key)
log.Debug().
Str("node", key).
Time("fetchedAt", entry.fetchedAt).
Msg("Cleaned up stale RRD cache entry")
}
}
for key, entry := range m.vmRRDMemCache {
if now.Sub(entry.fetchedAt) > maxAge {
delete(m.vmRRDMemCache, key)
}
}
for key, entry := range m.vmAgentMemCache {
if now.Sub(entry.fetchedAt) > vmAgentMemCleanupMaxAge {
delete(m.vmAgentMemCache, key)
}
}
}
// cleanupMetricsHistory removes stale entries from the metrics history.
// This prevents unbounded memory growth when containers/VMs are deleted.
func (m *Monitor) cleanupMetricsHistory() {
if m.metricsHistory != nil {
m.metricsHistory.Cleanup()
}
}
// cleanupRateTracker removes stale entries from the rate tracker.
// Entries older than 24 hours are removed to prevent unbounded memory growth.
func (m *Monitor) cleanupRateTracker(now time.Time) {
const staleThreshold = 24 * time.Hour
cutoff := now.Add(-staleThreshold)
if m.rateTracker != nil {
if removed := m.rateTracker.Cleanup(cutoff); removed > 0 {
log.Debug().
Int("entriesRemoved", removed).
Msg("Cleaned up stale rate tracker entries")
}
}
}
// evaluateDockerAgents updates health for Docker hosts based on last report time.
func (m *Monitor) evaluateDockerAgents(now time.Time) {
hosts := m.state.GetDockerHosts()
for _, host := range hosts {
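// Derive the offline-detection window from the agent's report interval,
// clamped to the configured minimum/maximum health windows.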
interval := host.IntervalSeconds
if interval <= 0 {
interval = int(dockerMinimumHealthWindow / time.Second)
}
window := time.Duration(interval) * time.Second * dockerOfflineGraceMultiplier
if window < dockerMinimumHealthWindow {
window = dockerMinimumHealthWindow
} else if window > dockerMaximumHealthWindow {
window = dockerMaximumHealthWindow
}
healthy := !host.LastSeen.IsZero() && now.Sub(host.LastSeen) <= window
key := dockerConnectionPrefix + host.ID
m.state.SetConnectionHealth(key, healthy)
hostCopy := host
if healthy {
hostCopy.Status = "online"
m.state.SetDockerHostStatus(host.ID, "online")
if m.alertManager != nil {
m.alertManager.HandleDockerHostOnline(hostCopy)
}
} else {
hostCopy.Status = "offline"
m.state.SetDockerHostStatus(host.ID, "offline")
if m.alertManager != nil {
m.alertManager.HandleDockerHostOffline(hostCopy)
}
}
}
}
// evaluateHostAgents updates health for host agents based on last report time.
func (m *Monitor) evaluateHostAgents(now time.Time) {
hosts := m.state.GetHosts()
for _, host := range hosts {
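// Derive the offline-detection window from the agent's report interval,
// clamped between the host minimum and maximum health windows.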
interval := host.IntervalSeconds
if interval <= 0 {
interval = int(hostMinimumHealthWindow / time.Second)
}
window := time.Duration(interval) * time.Second * hostOfflineGraceMultiplier
if window < hostMinimumHealthWindow {
window = hostMinimumHealthWindow
} else if window > hostMaximumHealthWindow {
window = hostMaximumHealthWindow
}
age := now.Sub(host.LastSeen)
healthy := !host.LastSeen.IsZero() && age <= window
key := hostConnectionPrefix + host.ID
m.state.SetConnectionHealth(key, healthy)
hostCopy := host
if healthy {
hostCopy.Status = "online"
// Log status transition from offline to online
if host.Status == "offline" {
log.Debug().
Str("hostID", host.ID).
Str("hostname", host.Hostname).
Dur("age", age).
Dur("window", window).
Msg("Host agent back online")
}
m.state.SetHostStatus(host.ID, "online")
if m.alertManager != nil {
m.alertManager.HandleHostOnline(hostCopy)
}
} else {
hostCopy.Status = "offline"
// Log status transition from online to offline with diagnostic info
if host.Status == "online" || host.Status == "" {
log.Debug().
Str("hostID", host.ID).
Str("hostname", host.Hostname).
Time("lastSeen", host.LastSeen).
Dur("age", age).
Dur("window", window).
Int("intervalSeconds", host.IntervalSeconds).
Bool("lastSeenZero", host.LastSeen.IsZero()).
Msg("Host agent appears offline")
}
m.state.SetHostStatus(host.ID, "offline")
if m.alertManager != nil {
m.alertManager.HandleHostOffline(hostCopy)
}
}
}
}
// sortContent sorts comma-separated content values for consistent display