Pulse/internal/monitoring/kubernetes_agents.go
rcourtman 2e0da42a81 chore: reliability and maintenance improvements
Host agent:
- Add SHA256 checksum verification for downloaded binaries
- Verify checksum file matches expected bundle filename

WebSocket:
- Add write failure tracking with graceful disconnection
- Increase write deadline to 30s for large state payloads
- Better handling for slow clients (Raspberry Pi, slow networks)

Monitoring:
- Remove unused temperature proxy imports
- Add monitor polling improvements
- Expand test coverage

Other:
- Update package.json dependencies
- Fix generate-release-notes.sh path handling
- Minor reporting engine cleanup
2026-01-22 00:45:04 +00:00

389 lines
13 KiB
Go

package monitoring
import (
"crypto/sha256"
"encoding/hex"
"fmt"
"strings"
"time"
"github.com/rcourtman/pulse-go-rewrite/internal/config"
"github.com/rcourtman/pulse-go-rewrite/internal/logging"
"github.com/rcourtman/pulse-go-rewrite/internal/models"
agentsk8s "github.com/rcourtman/pulse-go-rewrite/pkg/agents/kubernetes"
"github.com/rs/zerolog"
"github.com/rs/zerolog/log"
)
const (
removedKubernetesClustersTTL = 24 * time.Hour
)
// normalizeKubernetesClusterIdentifier derives a stable identifier for the
// reporting cluster. Preference order: the cluster's own ID, then the agent
// ID, then a SHA-256 digest of "server|context|name". Returns "" when no
// component yields a usable identity.
func normalizeKubernetesClusterIdentifier(report agentsk8s.Report) string {
	if id := strings.TrimSpace(report.Cluster.ID); id != "" {
		return id
	}
	if id := strings.TrimSpace(report.Agent.ID); id != "" {
		return id
	}
	parts := []string{
		strings.TrimSpace(report.Cluster.Server),
		strings.TrimSpace(report.Cluster.Context),
		strings.TrimSpace(report.Cluster.Name),
	}
	key := strings.TrimSpace(strings.Join(parts, "|"))
	// All three components empty ("||" after join, or "" entirely) means
	// there is nothing stable to hash.
	if key == "" || key == "||" {
		return ""
	}
	digest := sha256.Sum256([]byte(key))
	return hex.EncodeToString(digest[:])
}
// ApplyKubernetesReport ingests a Kubernetes agent report into state.
//
// Processing order:
//  1. Resolve a stable cluster identifier (reject if none can be derived).
//  2. Reject reports from clusters the operator deliberately removed, so a
//     still-running agent cannot resurrect them.
//  3. Enforce one-token-per-agent binding when a token record is supplied.
//  4. Normalize nodes/pods/deployments into model types and upsert the
//     cluster into shared state, marking its connection healthy.
//
// tokenRecord may be nil (no token metadata available). On success the
// stored cluster snapshot is returned; on rejection the zero value plus a
// descriptive error.
func (m *Monitor) ApplyKubernetesReport(report agentsk8s.Report, tokenRecord *config.APITokenRecord) (models.KubernetesCluster, error) {
	identifier := normalizeKubernetesClusterIdentifier(report)
	if strings.TrimSpace(identifier) == "" {
		return models.KubernetesCluster{}, fmt.Errorf("kubernetes report missing cluster identifier")
	}
	// Check if this cluster was deliberately removed - reject report to prevent resurrection.
	// Read lock only: the block list is consulted, not modified, here.
	m.mu.RLock()
	removedAt, wasRemoved := m.removedKubernetesClusters[identifier]
	m.mu.RUnlock()
	if wasRemoved {
		log.Info().
			Str("k8sClusterID", identifier).
			Time("removedAt", removedAt).
			Msg("Rejecting report from deliberately removed Kubernetes cluster")
		return models.KubernetesCluster{}, fmt.Errorf("kubernetes cluster %q was removed at %v and cannot report again. Use Allow re-enroll in Settings -> Agents -> Removed Kubernetes Clusters or rerun the installer with a kubernetes:manage token to clear this block", identifier, removedAt.Format(time.RFC3339))
	}
	// Enforce token uniqueness: each token can only be bound to one cluster agent.
	// First report with a given token wins the binding; later reports with a
	// different agent identity are rejected until the token is rotated.
	if tokenRecord != nil && tokenRecord.ID != "" {
		tokenID := strings.TrimSpace(tokenRecord.ID)
		agentID := strings.TrimSpace(report.Agent.ID)
		if agentID == "" {
			// Fall back to the cluster identifier when the agent did not
			// report its own ID.
			agentID = identifier
		}
		// Write lock: check-and-bind must be atomic to avoid two concurrent
		// reports racing the same token to different agents.
		m.mu.Lock()
		if boundAgentID, exists := m.kubernetesTokenBindings[tokenID]; exists {
			if boundAgentID != agentID {
				m.mu.Unlock()
				tokenHint := tokenHintFromRecord(tokenRecord)
				if tokenHint != "" {
					tokenHint = " (" + tokenHint + ")"
				}
				log.Warn().
					Str("tokenID", tokenID).
					Str("tokenHint", tokenHint).
					Str("reportingAgentID", agentID).
					Str("boundAgentID", boundAgentID).
					Msg("Rejecting Kubernetes report: token already bound to different agent")
				return models.KubernetesCluster{}, fmt.Errorf("API token%s is already in use by agent %q. Each Kubernetes agent must use a unique API token. Generate a new token for this agent", tokenHint, boundAgentID)
			}
		} else {
			m.kubernetesTokenBindings[tokenID] = agentID
			log.Debug().
				Str("tokenID", tokenID).
				Str("agentID", agentID).
				Str("clusterID", identifier).
				Msg("Bound Kubernetes agent token to agent identity")
		}
		m.mu.Unlock()
	}
	// Use the report's own timestamp when present; otherwise stamp receipt time.
	timestamp := report.Timestamp
	if timestamp.IsZero() {
		timestamp = time.Now()
	}
	agentID := strings.TrimSpace(report.Agent.ID)
	if agentID == "" {
		agentID = identifier
	}
	name := strings.TrimSpace(report.Cluster.Name)
	displayName := name
	if displayName == "" {
		displayName = identifier
	}
	// Normalize node records, copying the roles slice so the model does not
	// alias the report's backing array.
	nodes := make([]models.KubernetesNode, 0, len(report.Nodes))
	for _, n := range report.Nodes {
		roles := append([]string(nil), n.Roles...)
		nodes = append(nodes, models.KubernetesNode{
			UID:                     strings.TrimSpace(n.UID),
			Name:                    strings.TrimSpace(n.Name),
			Ready:                   n.Ready,
			Unschedulable:           n.Unschedulable,
			KubeletVersion:          strings.TrimSpace(n.KubeletVersion),
			ContainerRuntimeVersion: strings.TrimSpace(n.ContainerRuntimeVersion),
			OSImage:                 strings.TrimSpace(n.OSImage),
			KernelVersion:           strings.TrimSpace(n.KernelVersion),
			Architecture:            strings.TrimSpace(n.Architecture),
			CapacityCPU:             n.Capacity.CPUCores,
			CapacityMemoryBytes:     n.Capacity.MemoryBytes,
			CapacityPods:            n.Capacity.Pods,
			AllocCPU:                n.Allocatable.CPUCores,
			AllocMemoryBytes:        n.Allocatable.MemoryBytes,
			AllocPods:               n.Allocatable.Pods,
			Roles:                   roles,
		})
	}
	// Normalize pod records; labels and containers are deep-copied for the
	// same aliasing reason as node roles above.
	pods := make([]models.KubernetesPod, 0, len(report.Pods))
	for _, p := range report.Pods {
		labels := make(map[string]string, len(p.Labels))
		for k, v := range p.Labels {
			labels[k] = v
		}
		containers := make([]models.KubernetesPodContainer, 0, len(p.Containers))
		for _, c := range p.Containers {
			containers = append(containers, models.KubernetesPodContainer{
				Name:         strings.TrimSpace(c.Name),
				Image:        strings.TrimSpace(c.Image),
				Ready:        c.Ready,
				RestartCount: c.RestartCount,
				State:        strings.TrimSpace(c.State),
				Reason:       strings.TrimSpace(c.Reason),
				Message:      strings.TrimSpace(c.Message),
			})
		}
		pods = append(pods, models.KubernetesPod{
			UID:        strings.TrimSpace(p.UID),
			Name:       strings.TrimSpace(p.Name),
			Namespace:  strings.TrimSpace(p.Namespace),
			NodeName:   strings.TrimSpace(p.NodeName),
			Phase:      strings.TrimSpace(p.Phase),
			Reason:     strings.TrimSpace(p.Reason),
			Message:    strings.TrimSpace(p.Message),
			QoSClass:   strings.TrimSpace(p.QoSClass),
			CreatedAt:  p.CreatedAt,
			StartTime:  p.StartTime,
			Restarts:   p.Restarts,
			Labels:     labels,
			OwnerKind:  strings.TrimSpace(p.OwnerKind),
			OwnerName:  strings.TrimSpace(p.OwnerName),
			Containers: containers,
		})
	}
	// Normalize deployment records.
	deployments := make([]models.KubernetesDeployment, 0, len(report.Deployments))
	for _, d := range report.Deployments {
		labels := make(map[string]string, len(d.Labels))
		for k, v := range d.Labels {
			labels[k] = v
		}
		deployments = append(deployments, models.KubernetesDeployment{
			UID:               strings.TrimSpace(d.UID),
			Name:              strings.TrimSpace(d.Name),
			Namespace:         strings.TrimSpace(d.Namespace),
			DesiredReplicas:   d.DesiredReplicas,
			UpdatedReplicas:   d.UpdatedReplicas,
			ReadyReplicas:     d.ReadyReplicas,
			AvailableReplicas: d.AvailableReplicas,
			Labels:            labels,
		})
	}
	agentVersion := normalizeAgentVersion(report.Agent.Version)
	// Assemble the cluster model; a successfully applied report always marks
	// the cluster online (evaluateKubernetesAgents downgrades stale ones).
	cluster := models.KubernetesCluster{
		ID:              identifier,
		AgentID:         agentID,
		Name:            name,
		DisplayName:     displayName,
		Server:          strings.TrimSpace(report.Cluster.Server),
		Context:         strings.TrimSpace(report.Cluster.Context),
		Version:         strings.TrimSpace(report.Cluster.Version),
		Status:          "online",
		LastSeen:        timestamp,
		IntervalSeconds: report.Agent.IntervalSeconds,
		AgentVersion:    agentVersion,
		Nodes:           nodes,
		Pods:            pods,
		Deployments:     deployments,
	}
	// Attach token metadata when available so the UI can show which token
	// the agent authenticated with.
	if tokenRecord != nil {
		cluster.TokenID = strings.TrimSpace(tokenRecord.ID)
		cluster.TokenName = strings.TrimSpace(tokenRecord.Name)
		cluster.TokenHint = tokenHintFromRecord(tokenRecord)
		cluster.TokenLastUsedAt = tokenRecord.LastUsedAt
	}
	m.state.UpsertKubernetesCluster(cluster)
	m.state.SetConnectionHealth(kubernetesConnectionPrefix+identifier, true)
	return cluster, nil
}
// RemoveKubernetesCluster removes a kubernetes cluster from the shared state and clears related data.
//
// Beyond deleting the cluster from state, this: revokes and persists removal
// of the cluster's bound API token, records the cluster in the removal block
// list (so ApplyKubernetesReport rejects further reports until re-enrollment
// is allowed or the TTL expires), unbinds the token, and drops connection
// health. Removal proceeds even when the cluster is not currently in state,
// so the block-list entry is still written (idempotent removal).
func (m *Monitor) RemoveKubernetesCluster(clusterID string) (models.KubernetesCluster, error) {
	clusterID = strings.TrimSpace(clusterID)
	if clusterID == "" {
		return models.KubernetesCluster{}, fmt.Errorf("kubernetes cluster id is required")
	}
	cluster, removed := m.state.RemoveKubernetesCluster(clusterID)
	if !removed {
		if logging.IsLevelEnabled(zerolog.DebugLevel) {
			log.Debug().Str("k8sClusterID", clusterID).Msg("Kubernetes cluster not present in state during removal; proceeding")
		}
		// Synthesize a minimal record so callers and the removed-clusters
		// list still get sensible name/display fields.
		cluster = models.KubernetesCluster{
			ID:          clusterID,
			Name:        clusterID,
			DisplayName: clusterID,
		}
	}
	// Revoke the API token associated with this Kubernetes cluster.
	// Persistence failure is logged but does not abort the removal.
	if cluster.TokenID != "" {
		tokenRemoved := m.config.RemoveAPIToken(cluster.TokenID)
		if tokenRemoved != nil {
			m.config.SortAPITokens()
			if m.persistence != nil {
				if err := m.persistence.SaveAPITokens(m.config.APITokens); err != nil {
					log.Warn().Err(err).Str("tokenID", cluster.TokenID).Msg("Failed to persist API token revocation after Kubernetes cluster removal")
				} else {
					log.Info().Str("tokenID", cluster.TokenID).Str("tokenName", cluster.TokenName).Msg("API token revoked for removed Kubernetes cluster")
				}
			}
		}
	}
	removedAt := time.Now()
	// Block future reports from this cluster and release its token binding
	// under one write lock.
	m.mu.Lock()
	m.removedKubernetesClusters[clusterID] = removedAt
	if cluster.TokenID != "" {
		delete(m.kubernetesTokenBindings, cluster.TokenID)
		log.Debug().
			Str("tokenID", cluster.TokenID).
			Str("k8sClusterID", clusterID).
			Msg("Unbound Kubernetes agent token from removed cluster")
	}
	m.mu.Unlock()
	// Surface the removal in state so the UI can list it and offer re-enroll.
	m.state.AddRemovedKubernetesCluster(models.RemovedKubernetesCluster{
		ID:          clusterID,
		Name:        cluster.Name,
		DisplayName: cluster.DisplayName,
		RemovedAt:   removedAt,
	})
	m.state.RemoveConnectionHealth(kubernetesConnectionPrefix + clusterID)
	log.Info().
		Str("k8sClusterID", clusterID).
		Bool("removed", removed).
		Msg("Kubernetes cluster removed")
	return cluster, nil
}
// AllowKubernetesClusterReenroll clears the removal block for a kubernetes
// cluster so its agent may report again. A no-op (nil) when the cluster was
// not blocked in the first place.
func (m *Monitor) AllowKubernetesClusterReenroll(clusterID string) error {
	clusterID = strings.TrimSpace(clusterID)
	if clusterID == "" {
		return fmt.Errorf("kubernetes cluster id is required")
	}
	m.mu.Lock()
	if _, blocked := m.removedKubernetesClusters[clusterID]; !blocked {
		m.mu.Unlock()
		return nil
	}
	delete(m.removedKubernetesClusters, clusterID)
	// Release the lock before touching state to keep the critical section
	// limited to the block-list map, as elsewhere in this file.
	m.mu.Unlock()
	m.state.RemoveRemovedKubernetesCluster(clusterID)
	return nil
}
// UnhideKubernetesCluster clears the hidden flag on the given cluster.
// Returns the updated cluster, or an error when the id is blank or unknown.
func (m *Monitor) UnhideKubernetesCluster(clusterID string) (models.KubernetesCluster, error) {
	id := strings.TrimSpace(clusterID)
	if id == "" {
		return models.KubernetesCluster{}, fmt.Errorf("kubernetes cluster id is required")
	}
	if cluster, found := m.state.SetKubernetesClusterHidden(id, false); found {
		return cluster, nil
	}
	return models.KubernetesCluster{}, fmt.Errorf("kubernetes cluster %q not found", id)
}
// MarkKubernetesClusterPendingUninstall sets the pending uninstall flag on a
// cluster. Returns the updated cluster, or an error when the id is blank or
// the cluster is unknown.
func (m *Monitor) MarkKubernetesClusterPendingUninstall(clusterID string) (models.KubernetesCluster, error) {
	id := strings.TrimSpace(clusterID)
	if id == "" {
		return models.KubernetesCluster{}, fmt.Errorf("kubernetes cluster id is required")
	}
	if cluster, found := m.state.SetKubernetesClusterPendingUninstall(id, true); found {
		return cluster, nil
	}
	return models.KubernetesCluster{}, fmt.Errorf("kubernetes cluster %q not found", id)
}
// SetKubernetesClusterCustomDisplayName updates the custom display name for a
// cluster. customName is passed through untrimmed; clearing semantics are up
// to the state layer. Returns the updated cluster, or an error when the id
// is blank or the cluster is unknown.
func (m *Monitor) SetKubernetesClusterCustomDisplayName(clusterID, customName string) (models.KubernetesCluster, error) {
	id := strings.TrimSpace(clusterID)
	if id == "" {
		return models.KubernetesCluster{}, fmt.Errorf("kubernetes cluster id is required")
	}
	if cluster, found := m.state.SetKubernetesClusterCustomDisplayName(id, customName); found {
		return cluster, nil
	}
	return models.KubernetesCluster{}, fmt.Errorf("kubernetes cluster %q not found", id)
}
// evaluateKubernetesAgents recomputes connection health and online/offline
// status for every known cluster. A cluster is healthy when its last report
// arrived within a grace window derived from its reporting interval, clamped
// to [kubernetesMinimumHealthWindow, kubernetesMaximumHealthWindow].
func (m *Monitor) evaluateKubernetesAgents(now time.Time) {
	for _, cluster := range m.state.GetKubernetesClusters() {
		intervalSecs := cluster.IntervalSeconds
		if intervalSecs <= 0 {
			// Missing/invalid interval: fall back to the minimum window.
			intervalSecs = int(kubernetesMinimumHealthWindow / time.Second)
		}
		window := time.Duration(intervalSecs) * time.Second * kubernetesOfflineGraceMultiplier
		switch {
		case window < kubernetesMinimumHealthWindow:
			window = kubernetesMinimumHealthWindow
		case window > kubernetesMaximumHealthWindow:
			window = kubernetesMaximumHealthWindow
		}
		healthy := !cluster.LastSeen.IsZero() && now.Sub(cluster.LastSeen) <= window
		m.state.SetConnectionHealth(kubernetesConnectionPrefix+cluster.ID, healthy)
		status := "offline"
		if healthy {
			status = "online"
		}
		m.state.SetKubernetesClusterStatus(cluster.ID, status)
	}
}
// cleanupRemovedKubernetesClusters expires removal-block entries older than
// removedKubernetesClustersTTL so a long-gone cluster can eventually
// re-enroll without operator intervention.
func (m *Monitor) cleanupRemovedKubernetesClusters(now time.Time) {
	m.mu.Lock()
	defer m.mu.Unlock()
	// Deleting map entries while ranging is safe in Go.
	for id, blockedAt := range m.removedKubernetesClusters {
		if now.Sub(blockedAt) <= removedKubernetesClustersTTL {
			continue
		}
		delete(m.removedKubernetesClusters, id)
		m.state.RemoveRemovedKubernetesCluster(id)
		if logging.IsLevelEnabled(zerolog.DebugLevel) {
			log.Debug().
				Str("k8sClusterID", id).
				Time("removedAt", blockedAt).
				Msg("Cleaned up stale removed Kubernetes cluster block")
		}
	}
}