mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-04-28 19:41:17 +00:00
Host agent: - Add SHA256 checksum verification for downloaded binaries - Verify checksum file matches expected bundle filename WebSocket: - Add write failure tracking with graceful disconnection - Increase write deadline to 30s for large state payloads - Better handling for slow clients (Raspberry Pi, slow networks) Monitoring: - Remove unused temperature proxy imports - Add monitor polling improvements - Expand test coverage Other: - Update package.json dependencies - Fix generate-release-notes.sh path handling - Minor reporting engine cleanup
389 lines
13 KiB
Go
389 lines
13 KiB
Go
package monitoring
|
|
|
|
import (
|
|
"crypto/sha256"
|
|
"encoding/hex"
|
|
"fmt"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/config"
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/logging"
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/models"
|
|
agentsk8s "github.com/rcourtman/pulse-go-rewrite/pkg/agents/kubernetes"
|
|
"github.com/rs/zerolog"
|
|
"github.com/rs/zerolog/log"
|
|
)
|
|
|
|
const (
|
|
removedKubernetesClustersTTL = 24 * time.Hour
|
|
)
|
|
|
|
func normalizeKubernetesClusterIdentifier(report agentsk8s.Report) string {
|
|
if v := strings.TrimSpace(report.Cluster.ID); v != "" {
|
|
return v
|
|
}
|
|
if v := strings.TrimSpace(report.Agent.ID); v != "" {
|
|
return v
|
|
}
|
|
|
|
stableKey := strings.TrimSpace(report.Cluster.Server) + "|" + strings.TrimSpace(report.Cluster.Context) + "|" + strings.TrimSpace(report.Cluster.Name)
|
|
stableKey = strings.TrimSpace(stableKey)
|
|
if stableKey == "||" || stableKey == "" {
|
|
return ""
|
|
}
|
|
|
|
sum := sha256.Sum256([]byte(stableKey))
|
|
return hex.EncodeToString(sum[:])
|
|
}
|
|
|
|
// ApplyKubernetesReport ingests a Kubernetes agent report into state.
|
|
func (m *Monitor) ApplyKubernetesReport(report agentsk8s.Report, tokenRecord *config.APITokenRecord) (models.KubernetesCluster, error) {
|
|
identifier := normalizeKubernetesClusterIdentifier(report)
|
|
if strings.TrimSpace(identifier) == "" {
|
|
return models.KubernetesCluster{}, fmt.Errorf("kubernetes report missing cluster identifier")
|
|
}
|
|
|
|
// Check if this cluster was deliberately removed - reject report to prevent resurrection
|
|
m.mu.RLock()
|
|
removedAt, wasRemoved := m.removedKubernetesClusters[identifier]
|
|
m.mu.RUnlock()
|
|
if wasRemoved {
|
|
log.Info().
|
|
Str("k8sClusterID", identifier).
|
|
Time("removedAt", removedAt).
|
|
Msg("Rejecting report from deliberately removed Kubernetes cluster")
|
|
return models.KubernetesCluster{}, fmt.Errorf("kubernetes cluster %q was removed at %v and cannot report again. Use Allow re-enroll in Settings -> Agents -> Removed Kubernetes Clusters or rerun the installer with a kubernetes:manage token to clear this block", identifier, removedAt.Format(time.RFC3339))
|
|
}
|
|
|
|
// Enforce token uniqueness: each token can only be bound to one cluster agent
|
|
if tokenRecord != nil && tokenRecord.ID != "" {
|
|
tokenID := strings.TrimSpace(tokenRecord.ID)
|
|
agentID := strings.TrimSpace(report.Agent.ID)
|
|
if agentID == "" {
|
|
agentID = identifier
|
|
}
|
|
|
|
m.mu.Lock()
|
|
if boundAgentID, exists := m.kubernetesTokenBindings[tokenID]; exists {
|
|
if boundAgentID != agentID {
|
|
m.mu.Unlock()
|
|
tokenHint := tokenHintFromRecord(tokenRecord)
|
|
if tokenHint != "" {
|
|
tokenHint = " (" + tokenHint + ")"
|
|
}
|
|
log.Warn().
|
|
Str("tokenID", tokenID).
|
|
Str("tokenHint", tokenHint).
|
|
Str("reportingAgentID", agentID).
|
|
Str("boundAgentID", boundAgentID).
|
|
Msg("Rejecting Kubernetes report: token already bound to different agent")
|
|
return models.KubernetesCluster{}, fmt.Errorf("API token%s is already in use by agent %q. Each Kubernetes agent must use a unique API token. Generate a new token for this agent", tokenHint, boundAgentID)
|
|
}
|
|
} else {
|
|
m.kubernetesTokenBindings[tokenID] = agentID
|
|
log.Debug().
|
|
Str("tokenID", tokenID).
|
|
Str("agentID", agentID).
|
|
Str("clusterID", identifier).
|
|
Msg("Bound Kubernetes agent token to agent identity")
|
|
}
|
|
m.mu.Unlock()
|
|
}
|
|
|
|
timestamp := report.Timestamp
|
|
if timestamp.IsZero() {
|
|
timestamp = time.Now()
|
|
}
|
|
|
|
agentID := strings.TrimSpace(report.Agent.ID)
|
|
if agentID == "" {
|
|
agentID = identifier
|
|
}
|
|
|
|
name := strings.TrimSpace(report.Cluster.Name)
|
|
displayName := name
|
|
if displayName == "" {
|
|
displayName = identifier
|
|
}
|
|
|
|
nodes := make([]models.KubernetesNode, 0, len(report.Nodes))
|
|
for _, n := range report.Nodes {
|
|
roles := append([]string(nil), n.Roles...)
|
|
nodes = append(nodes, models.KubernetesNode{
|
|
UID: strings.TrimSpace(n.UID),
|
|
Name: strings.TrimSpace(n.Name),
|
|
Ready: n.Ready,
|
|
Unschedulable: n.Unschedulable,
|
|
KubeletVersion: strings.TrimSpace(n.KubeletVersion),
|
|
ContainerRuntimeVersion: strings.TrimSpace(n.ContainerRuntimeVersion),
|
|
OSImage: strings.TrimSpace(n.OSImage),
|
|
KernelVersion: strings.TrimSpace(n.KernelVersion),
|
|
Architecture: strings.TrimSpace(n.Architecture),
|
|
CapacityCPU: n.Capacity.CPUCores,
|
|
CapacityMemoryBytes: n.Capacity.MemoryBytes,
|
|
CapacityPods: n.Capacity.Pods,
|
|
AllocCPU: n.Allocatable.CPUCores,
|
|
AllocMemoryBytes: n.Allocatable.MemoryBytes,
|
|
AllocPods: n.Allocatable.Pods,
|
|
Roles: roles,
|
|
})
|
|
}
|
|
|
|
pods := make([]models.KubernetesPod, 0, len(report.Pods))
|
|
for _, p := range report.Pods {
|
|
labels := make(map[string]string, len(p.Labels))
|
|
for k, v := range p.Labels {
|
|
labels[k] = v
|
|
}
|
|
containers := make([]models.KubernetesPodContainer, 0, len(p.Containers))
|
|
for _, c := range p.Containers {
|
|
containers = append(containers, models.KubernetesPodContainer{
|
|
Name: strings.TrimSpace(c.Name),
|
|
Image: strings.TrimSpace(c.Image),
|
|
Ready: c.Ready,
|
|
RestartCount: c.RestartCount,
|
|
State: strings.TrimSpace(c.State),
|
|
Reason: strings.TrimSpace(c.Reason),
|
|
Message: strings.TrimSpace(c.Message),
|
|
})
|
|
}
|
|
pods = append(pods, models.KubernetesPod{
|
|
UID: strings.TrimSpace(p.UID),
|
|
Name: strings.TrimSpace(p.Name),
|
|
Namespace: strings.TrimSpace(p.Namespace),
|
|
NodeName: strings.TrimSpace(p.NodeName),
|
|
Phase: strings.TrimSpace(p.Phase),
|
|
Reason: strings.TrimSpace(p.Reason),
|
|
Message: strings.TrimSpace(p.Message),
|
|
QoSClass: strings.TrimSpace(p.QoSClass),
|
|
CreatedAt: p.CreatedAt,
|
|
StartTime: p.StartTime,
|
|
Restarts: p.Restarts,
|
|
Labels: labels,
|
|
OwnerKind: strings.TrimSpace(p.OwnerKind),
|
|
OwnerName: strings.TrimSpace(p.OwnerName),
|
|
Containers: containers,
|
|
})
|
|
}
|
|
|
|
deployments := make([]models.KubernetesDeployment, 0, len(report.Deployments))
|
|
for _, d := range report.Deployments {
|
|
labels := make(map[string]string, len(d.Labels))
|
|
for k, v := range d.Labels {
|
|
labels[k] = v
|
|
}
|
|
deployments = append(deployments, models.KubernetesDeployment{
|
|
UID: strings.TrimSpace(d.UID),
|
|
Name: strings.TrimSpace(d.Name),
|
|
Namespace: strings.TrimSpace(d.Namespace),
|
|
DesiredReplicas: d.DesiredReplicas,
|
|
UpdatedReplicas: d.UpdatedReplicas,
|
|
ReadyReplicas: d.ReadyReplicas,
|
|
AvailableReplicas: d.AvailableReplicas,
|
|
Labels: labels,
|
|
})
|
|
}
|
|
|
|
agentVersion := normalizeAgentVersion(report.Agent.Version)
|
|
|
|
cluster := models.KubernetesCluster{
|
|
ID: identifier,
|
|
AgentID: agentID,
|
|
Name: name,
|
|
DisplayName: displayName,
|
|
Server: strings.TrimSpace(report.Cluster.Server),
|
|
Context: strings.TrimSpace(report.Cluster.Context),
|
|
Version: strings.TrimSpace(report.Cluster.Version),
|
|
Status: "online",
|
|
LastSeen: timestamp,
|
|
IntervalSeconds: report.Agent.IntervalSeconds,
|
|
AgentVersion: agentVersion,
|
|
Nodes: nodes,
|
|
Pods: pods,
|
|
Deployments: deployments,
|
|
}
|
|
|
|
if tokenRecord != nil {
|
|
cluster.TokenID = strings.TrimSpace(tokenRecord.ID)
|
|
cluster.TokenName = strings.TrimSpace(tokenRecord.Name)
|
|
cluster.TokenHint = tokenHintFromRecord(tokenRecord)
|
|
cluster.TokenLastUsedAt = tokenRecord.LastUsedAt
|
|
}
|
|
|
|
m.state.UpsertKubernetesCluster(cluster)
|
|
m.state.SetConnectionHealth(kubernetesConnectionPrefix+identifier, true)
|
|
|
|
return cluster, nil
|
|
}
|
|
|
|
// RemoveKubernetesCluster removes a kubernetes cluster from the shared state and clears related data.
|
|
func (m *Monitor) RemoveKubernetesCluster(clusterID string) (models.KubernetesCluster, error) {
|
|
clusterID = strings.TrimSpace(clusterID)
|
|
if clusterID == "" {
|
|
return models.KubernetesCluster{}, fmt.Errorf("kubernetes cluster id is required")
|
|
}
|
|
|
|
cluster, removed := m.state.RemoveKubernetesCluster(clusterID)
|
|
if !removed {
|
|
if logging.IsLevelEnabled(zerolog.DebugLevel) {
|
|
log.Debug().Str("k8sClusterID", clusterID).Msg("Kubernetes cluster not present in state during removal; proceeding")
|
|
}
|
|
cluster = models.KubernetesCluster{
|
|
ID: clusterID,
|
|
Name: clusterID,
|
|
DisplayName: clusterID,
|
|
}
|
|
}
|
|
|
|
// Revoke the API token associated with this Kubernetes cluster
|
|
if cluster.TokenID != "" {
|
|
tokenRemoved := m.config.RemoveAPIToken(cluster.TokenID)
|
|
if tokenRemoved != nil {
|
|
m.config.SortAPITokens()
|
|
|
|
if m.persistence != nil {
|
|
if err := m.persistence.SaveAPITokens(m.config.APITokens); err != nil {
|
|
log.Warn().Err(err).Str("tokenID", cluster.TokenID).Msg("Failed to persist API token revocation after Kubernetes cluster removal")
|
|
} else {
|
|
log.Info().Str("tokenID", cluster.TokenID).Str("tokenName", cluster.TokenName).Msg("API token revoked for removed Kubernetes cluster")
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
removedAt := time.Now()
|
|
|
|
m.mu.Lock()
|
|
m.removedKubernetesClusters[clusterID] = removedAt
|
|
if cluster.TokenID != "" {
|
|
delete(m.kubernetesTokenBindings, cluster.TokenID)
|
|
log.Debug().
|
|
Str("tokenID", cluster.TokenID).
|
|
Str("k8sClusterID", clusterID).
|
|
Msg("Unbound Kubernetes agent token from removed cluster")
|
|
}
|
|
m.mu.Unlock()
|
|
|
|
m.state.AddRemovedKubernetesCluster(models.RemovedKubernetesCluster{
|
|
ID: clusterID,
|
|
Name: cluster.Name,
|
|
DisplayName: cluster.DisplayName,
|
|
RemovedAt: removedAt,
|
|
})
|
|
|
|
m.state.RemoveConnectionHealth(kubernetesConnectionPrefix + clusterID)
|
|
|
|
log.Info().
|
|
Str("k8sClusterID", clusterID).
|
|
Bool("removed", removed).
|
|
Msg("Kubernetes cluster removed")
|
|
|
|
return cluster, nil
|
|
}
|
|
|
|
// AllowKubernetesClusterReenroll clears the removal block for a kubernetes cluster.
|
|
func (m *Monitor) AllowKubernetesClusterReenroll(clusterID string) error {
|
|
clusterID = strings.TrimSpace(clusterID)
|
|
if clusterID == "" {
|
|
return fmt.Errorf("kubernetes cluster id is required")
|
|
}
|
|
|
|
m.mu.Lock()
|
|
_, exists := m.removedKubernetesClusters[clusterID]
|
|
if !exists {
|
|
m.mu.Unlock()
|
|
return nil
|
|
}
|
|
delete(m.removedKubernetesClusters, clusterID)
|
|
m.mu.Unlock()
|
|
|
|
m.state.RemoveRemovedKubernetesCluster(clusterID)
|
|
return nil
|
|
}
|
|
|
|
// UnhideKubernetesCluster clears the hidden flag.
|
|
func (m *Monitor) UnhideKubernetesCluster(clusterID string) (models.KubernetesCluster, error) {
|
|
clusterID = strings.TrimSpace(clusterID)
|
|
if clusterID == "" {
|
|
return models.KubernetesCluster{}, fmt.Errorf("kubernetes cluster id is required")
|
|
}
|
|
|
|
cluster, ok := m.state.SetKubernetesClusterHidden(clusterID, false)
|
|
if !ok {
|
|
return models.KubernetesCluster{}, fmt.Errorf("kubernetes cluster %q not found", clusterID)
|
|
}
|
|
return cluster, nil
|
|
}
|
|
|
|
// MarkKubernetesClusterPendingUninstall sets the pending uninstall flag on a cluster.
|
|
func (m *Monitor) MarkKubernetesClusterPendingUninstall(clusterID string) (models.KubernetesCluster, error) {
|
|
clusterID = strings.TrimSpace(clusterID)
|
|
if clusterID == "" {
|
|
return models.KubernetesCluster{}, fmt.Errorf("kubernetes cluster id is required")
|
|
}
|
|
|
|
cluster, ok := m.state.SetKubernetesClusterPendingUninstall(clusterID, true)
|
|
if !ok {
|
|
return models.KubernetesCluster{}, fmt.Errorf("kubernetes cluster %q not found", clusterID)
|
|
}
|
|
return cluster, nil
|
|
}
|
|
|
|
// SetKubernetesClusterCustomDisplayName updates the custom display name for a cluster.
|
|
func (m *Monitor) SetKubernetesClusterCustomDisplayName(clusterID, customName string) (models.KubernetesCluster, error) {
|
|
clusterID = strings.TrimSpace(clusterID)
|
|
if clusterID == "" {
|
|
return models.KubernetesCluster{}, fmt.Errorf("kubernetes cluster id is required")
|
|
}
|
|
|
|
cluster, ok := m.state.SetKubernetesClusterCustomDisplayName(clusterID, customName)
|
|
if !ok {
|
|
return models.KubernetesCluster{}, fmt.Errorf("kubernetes cluster %q not found", clusterID)
|
|
}
|
|
return cluster, nil
|
|
}
|
|
|
|
func (m *Monitor) evaluateKubernetesAgents(now time.Time) {
|
|
clusters := m.state.GetKubernetesClusters()
|
|
for _, cluster := range clusters {
|
|
interval := cluster.IntervalSeconds
|
|
if interval <= 0 {
|
|
interval = int(kubernetesMinimumHealthWindow / time.Second)
|
|
}
|
|
|
|
window := time.Duration(interval) * time.Second * kubernetesOfflineGraceMultiplier
|
|
if window < kubernetesMinimumHealthWindow {
|
|
window = kubernetesMinimumHealthWindow
|
|
} else if window > kubernetesMaximumHealthWindow {
|
|
window = kubernetesMaximumHealthWindow
|
|
}
|
|
|
|
healthy := !cluster.LastSeen.IsZero() && now.Sub(cluster.LastSeen) <= window
|
|
key := kubernetesConnectionPrefix + cluster.ID
|
|
m.state.SetConnectionHealth(key, healthy)
|
|
if healthy {
|
|
m.state.SetKubernetesClusterStatus(cluster.ID, "online")
|
|
} else {
|
|
m.state.SetKubernetesClusterStatus(cluster.ID, "offline")
|
|
}
|
|
}
|
|
}
|
|
|
|
func (m *Monitor) cleanupRemovedKubernetesClusters(now time.Time) {
|
|
m.mu.Lock()
|
|
defer m.mu.Unlock()
|
|
|
|
for clusterID, removedAt := range m.removedKubernetesClusters {
|
|
if now.Sub(removedAt) > removedKubernetesClustersTTL {
|
|
delete(m.removedKubernetesClusters, clusterID)
|
|
m.state.RemoveRemovedKubernetesCluster(clusterID)
|
|
if logging.IsLevelEnabled(zerolog.DebugLevel) {
|
|
log.Debug().
|
|
Str("k8sClusterID", clusterID).
|
|
Time("removedAt", removedAt).
|
|
Msg("Cleaned up stale removed Kubernetes cluster block")
|
|
}
|
|
}
|
|
}
|
|
}
|