Pulse/cmd/pulse-agent/main.go
rcourtman 1d64b4c31a fix: show Removed Docker Hosts section in UI for re-enrollment
The 'Removed Docker Hosts' section was not appearing in Settings -> Agents
even when hosts were blocked from re-enrolling. This prevented users from
using the 'Allow re-enroll' button to unblock their Docker agents.

Root cause: The WebSocket store was missing:
1. The 'removedDockerHosts' property in its initial state
2. A handler to process removedDockerHosts data from WebSocket messages

This meant the backend was correctly sending the data, but the frontend
was completely ignoring it.

Changes:
- Add removedDockerHosts to WebSocket store initial state and message handler
- Add removedDockerHosts to App.tsx fallback state for consistency
- Add missing BroadcastState call after AllowDockerHostReenroll succeeds

Also includes previous fixes from this session:
- Add PULSE_AGENT_URL as alias for PULSE_AGENT_CONNECT_URL (config.go)
- Add runtime Docker/Podman auto-detection in pulse-agent (main.go)

Fixes issue reported by darthrater78 in discussion #845
2025-12-19 17:57:04 +00:00

690 lines
20 KiB
Go

package main
import (
"context"
"flag"
"fmt"
"net/http"
"os"
"os/exec"
"os/signal"
"strconv"
"strings"
"sync/atomic"
"syscall"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/rcourtman/pulse-go-rewrite/internal/agentupdate"
"github.com/rcourtman/pulse-go-rewrite/internal/dockeragent"
"github.com/rcourtman/pulse-go-rewrite/internal/hostagent"
"github.com/rcourtman/pulse-go-rewrite/internal/kubernetesagent"
"github.com/rcourtman/pulse-go-rewrite/internal/utils"
"github.com/rs/zerolog"
"golang.org/x/sync/errgroup"
)
var (
Version = "dev"
// Prometheus metrics
agentInfo = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "pulse_agent_info",
Help: "Information about the Pulse agent",
}, []string{"version", "host_enabled", "docker_enabled", "kubernetes_enabled"})
agentUp = promauto.NewGauge(prometheus.GaugeOpts{
Name: "pulse_agent_up",
Help: "Whether the Pulse agent is running (1 = up, 0 = down)",
})
)
type multiValue []string
func (m *multiValue) String() string {
return strings.Join(*m, ",")
}
func (m *multiValue) Set(value string) error {
*m = append(*m, value)
return nil
}
func main() {
// 1. Parse Configuration
cfg := loadConfig()
// 2. Setup Logging
zerolog.SetGlobalLevel(cfg.LogLevel)
logger := zerolog.New(os.Stdout).Level(cfg.LogLevel).With().Timestamp().Logger()
cfg.Logger = &logger
// 3. Check if running as Windows service
ranAsService, err := runAsWindowsService(cfg, logger)
if err != nil {
logger.Fatal().Err(err).Msg("Windows service failed")
}
if ranAsService {
return
}
// 4. Setup Context & Signal Handling
ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
defer cancel()
g, ctx := errgroup.WithContext(ctx)
logger.Info().
Str("version", Version).
Str("pulse_url", cfg.PulseURL).
Bool("host_agent", cfg.EnableHost).
Bool("docker_agent", cfg.EnableDocker).
Bool("kubernetes_agent", cfg.EnableKubernetes).
Bool("proxmox_mode", cfg.EnableProxmox).
Bool("auto_update", !cfg.DisableAutoUpdate).
Msg("Starting Pulse Unified Agent")
// 5. Set prometheus info metric
agentInfo.WithLabelValues(
Version,
fmt.Sprintf("%t", cfg.EnableHost),
fmt.Sprintf("%t", cfg.EnableDocker),
fmt.Sprintf("%t", cfg.EnableKubernetes),
).Set(1)
agentUp.Set(1)
// 6. Start Health/Metrics Server
var ready atomic.Bool
if cfg.HealthAddr != "" {
startHealthServer(ctx, cfg.HealthAddr, &ready, &logger)
}
// 7. Start Auto-Updater
updater := agentupdate.New(agentupdate.Config{
PulseURL: cfg.PulseURL,
APIToken: cfg.APIToken,
AgentName: "pulse-agent",
CurrentVersion: Version,
CheckInterval: 1 * time.Hour,
InsecureSkipVerify: cfg.InsecureSkipVerify,
Logger: &logger,
Disabled: cfg.DisableAutoUpdate,
})
g.Go(func() error {
updater.RunLoop(ctx)
return nil
})
// 8. Start Host Agent (if enabled)
if cfg.EnableHost {
hostCfg := hostagent.Config{
PulseURL: cfg.PulseURL,
APIToken: cfg.APIToken,
Interval: cfg.Interval,
HostnameOverride: cfg.HostnameOverride,
AgentID: cfg.AgentID,
AgentType: "unified",
AgentVersion: Version,
Tags: cfg.Tags,
InsecureSkipVerify: cfg.InsecureSkipVerify,
LogLevel: cfg.LogLevel,
Logger: &logger,
EnableProxmox: cfg.EnableProxmox,
ProxmoxType: cfg.ProxmoxType,
}
agent, err := hostagent.New(hostCfg)
if err != nil {
logger.Fatal().Err(err).Msg("Failed to initialize host agent")
}
g.Go(func() error {
logger.Info().Msg("Host agent module started")
return agent.Run(ctx)
})
}
// Auto-detect Docker/Podman if not explicitly configured
if !cfg.EnableDocker && !cfg.DockerConfigured {
// Check for docker binary
if _, err := exec.LookPath("docker"); err == nil {
logger.Info().Msg("Auto-detected Docker binary, enabling Docker monitoring")
cfg.EnableDocker = true
} else if _, err := exec.LookPath("podman"); err == nil {
logger.Info().Msg("Auto-detected Podman binary, enabling Docker monitoring")
cfg.EnableDocker = true
} else {
logger.Debug().Msg("Docker/Podman not found, skipping Docker monitoring")
}
}
// 9. Start Docker Agent (if enabled)
var dockerAgent *dockeragent.Agent
if cfg.EnableDocker {
dockerCfg := dockeragent.Config{
PulseURL: cfg.PulseURL,
APIToken: cfg.APIToken,
Interval: cfg.Interval,
HostnameOverride: cfg.HostnameOverride,
AgentID: cfg.AgentID,
AgentType: "unified",
AgentVersion: Version,
InsecureSkipVerify: cfg.InsecureSkipVerify,
DisableAutoUpdate: true,
LogLevel: cfg.LogLevel,
Logger: &logger,
SwarmScope: "node",
IncludeContainers: true,
IncludeServices: true,
IncludeTasks: true,
CollectDiskMetrics: true,
}
dockerAgent, err = dockeragent.New(dockerCfg)
if err != nil {
// Docker isn't available yet - start retry loop in background
logger.Warn().Err(err).Msg("Docker not available, will retry with exponential backoff")
g.Go(func() error {
agent := initDockerWithRetry(ctx, dockerCfg, &logger)
if agent != nil {
dockerAgent = agent
logger.Info().Msg("Docker agent module started (after retry)")
return agent.Run(ctx)
}
// Docker never became available, continue without it
return nil
})
} else {
g.Go(func() error {
logger.Info().Msg("Docker agent module started")
return dockerAgent.Run(ctx)
})
}
}
// 10. Start Kubernetes Agent (if enabled)
if cfg.EnableKubernetes {
kubeCfg := kubernetesagent.Config{
PulseURL: cfg.PulseURL,
APIToken: cfg.APIToken,
Interval: cfg.Interval,
AgentID: cfg.AgentID,
AgentType: "unified",
AgentVersion: Version,
InsecureSkipVerify: cfg.InsecureSkipVerify,
LogLevel: cfg.LogLevel,
Logger: &logger,
KubeconfigPath: cfg.KubeconfigPath,
KubeContext: cfg.KubeContext,
IncludeNamespaces: cfg.KubeIncludeNamespaces,
ExcludeNamespaces: cfg.KubeExcludeNamespaces,
IncludeAllPods: cfg.KubeIncludeAllPods,
IncludeAllDeployments: cfg.KubeIncludeAllDeployments,
MaxPods: cfg.KubeMaxPods,
}
agent, err := kubernetesagent.New(kubeCfg)
if err != nil {
logger.Warn().Err(err).Msg("Kubernetes not available, will retry with exponential backoff")
g.Go(func() error {
retried := initKubernetesWithRetry(ctx, kubeCfg, &logger)
if retried != nil {
logger.Info().Msg("Kubernetes agent module started (after retry)")
return retried.Run(ctx)
}
return nil
})
} else {
g.Go(func() error {
logger.Info().Msg("Kubernetes agent module started")
return agent.Run(ctx)
})
}
}
// Mark as ready after all agents started
ready.Store(true)
// 11. Wait for all agents to exit
if err := g.Wait(); err != nil && err != context.Canceled {
logger.Error().Err(err).Msg("Agent terminated with error")
agentUp.Set(0)
cleanupDockerAgent(dockerAgent, &logger)
os.Exit(1)
}
// 12. Cleanup
agentUp.Set(0)
cleanupDockerAgent(dockerAgent, &logger)
logger.Info().Msg("Pulse Unified Agent stopped")
}
func cleanupDockerAgent(agent *dockeragent.Agent, logger *zerolog.Logger) {
if agent == nil {
return
}
if err := agent.Close(); err != nil {
logger.Warn().Err(err).Msg("Failed to close docker agent")
}
}
func healthHandler(ready *atomic.Bool) http.Handler {
mux := http.NewServeMux()
// Liveness probe - always returns 200 if server is running
mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
w.Write([]byte("ok"))
})
// Readiness probe - returns 200 only when agents are initialized
mux.HandleFunc("/readyz", func(w http.ResponseWriter, r *http.Request) {
if ready.Load() {
w.WriteHeader(http.StatusOK)
w.Write([]byte("ok"))
} else {
w.WriteHeader(http.StatusServiceUnavailable)
w.Write([]byte("not ready"))
}
})
// Prometheus metrics
mux.Handle("/metrics", promhttp.Handler())
return mux
}
func startHealthServer(ctx context.Context, addr string, ready *atomic.Bool, logger *zerolog.Logger) {
srv := &http.Server{
Addr: addr,
Handler: healthHandler(ready),
ReadTimeout: 5 * time.Second,
WriteTimeout: 10 * time.Second,
IdleTimeout: 30 * time.Second,
}
go func() {
<-ctx.Done()
shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
if err := srv.Shutdown(shutdownCtx); err != nil && err != http.ErrServerClosed {
logger.Warn().Err(err).Msg("Failed to shut down health server")
}
}()
go func() {
logger.Info().Str("addr", addr).Msg("Health/metrics server listening")
if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
logger.Warn().Err(err).Msg("Health server stopped unexpectedly")
}
}()
}
type Config struct {
PulseURL string
APIToken string
Interval time.Duration
HostnameOverride string
AgentID string
Tags []string
InsecureSkipVerify bool
LogLevel zerolog.Level
Logger *zerolog.Logger
// Module flags
EnableHost bool
EnableDocker bool
DockerConfigured bool
EnableKubernetes bool
EnableProxmox bool
ProxmoxType string // "pve", "pbs", or "" for auto-detect
// Auto-update
DisableAutoUpdate bool
// Health/metrics server
HealthAddr string
// Kubernetes
KubeconfigPath string
KubeContext string
KubeIncludeNamespaces []string
KubeExcludeNamespaces []string
KubeIncludeAllPods bool
KubeIncludeAllDeployments bool
KubeMaxPods int
}
func loadConfig() Config {
// Environment Variables
envURL := utils.GetenvTrim("PULSE_URL")
envToken := utils.GetenvTrim("PULSE_TOKEN")
envInterval := utils.GetenvTrim("PULSE_INTERVAL")
envHostname := utils.GetenvTrim("PULSE_HOSTNAME")
envAgentID := utils.GetenvTrim("PULSE_AGENT_ID")
envInsecure := utils.GetenvTrim("PULSE_INSECURE_SKIP_VERIFY")
envTags := utils.GetenvTrim("PULSE_TAGS")
envLogLevel := utils.GetenvTrim("LOG_LEVEL")
envEnableHost := utils.GetenvTrim("PULSE_ENABLE_HOST")
envEnableDocker := utils.GetenvTrim("PULSE_ENABLE_DOCKER")
envEnableKubernetes := utils.GetenvTrim("PULSE_ENABLE_KUBERNETES")
envEnableProxmox := utils.GetenvTrim("PULSE_ENABLE_PROXMOX")
envProxmoxType := utils.GetenvTrim("PULSE_PROXMOX_TYPE")
envDisableAutoUpdate := utils.GetenvTrim("PULSE_DISABLE_AUTO_UPDATE")
envHealthAddr := utils.GetenvTrim("PULSE_HEALTH_ADDR")
envKubeconfig := utils.GetenvTrim("PULSE_KUBECONFIG")
envKubeContext := utils.GetenvTrim("PULSE_KUBE_CONTEXT")
envKubeIncludeNamespaces := utils.GetenvTrim("PULSE_KUBE_INCLUDE_NAMESPACES")
envKubeExcludeNamespaces := utils.GetenvTrim("PULSE_KUBE_EXCLUDE_NAMESPACES")
envKubeIncludeAllPods := utils.GetenvTrim("PULSE_KUBE_INCLUDE_ALL_PODS")
envKubeIncludeAllDeployments := utils.GetenvTrim("PULSE_KUBE_INCLUDE_ALL_DEPLOYMENTS")
envKubeMaxPods := utils.GetenvTrim("PULSE_KUBE_MAX_PODS")
// Defaults
defaultInterval := 30 * time.Second
if envInterval != "" {
if parsed, err := time.ParseDuration(envInterval); err == nil {
defaultInterval = parsed
}
}
defaultEnableHost := true
if envEnableHost != "" {
defaultEnableHost = utils.ParseBool(envEnableHost)
}
defaultEnableDocker := false
if envEnableDocker != "" {
defaultEnableDocker = utils.ParseBool(envEnableDocker)
}
defaultEnableKubernetes := false
if envEnableKubernetes != "" {
defaultEnableKubernetes = utils.ParseBool(envEnableKubernetes)
}
defaultEnableProxmox := false
if envEnableProxmox != "" {
defaultEnableProxmox = utils.ParseBool(envEnableProxmox)
}
defaultHealthAddr := envHealthAddr
if defaultHealthAddr == "" {
defaultHealthAddr = ":9191"
}
// Flags
urlFlag := flag.String("url", envURL, "Pulse server URL")
tokenFlag := flag.String("token", envToken, "Pulse API token")
intervalFlag := flag.Duration("interval", defaultInterval, "Reporting interval")
hostnameFlag := flag.String("hostname", envHostname, "Override hostname")
agentIDFlag := flag.String("agent-id", envAgentID, "Override agent identifier")
insecureFlag := flag.Bool("insecure", utils.ParseBool(envInsecure), "Skip TLS verification")
logLevelFlag := flag.String("log-level", defaultLogLevel(envLogLevel), "Log level")
enableHostFlag := flag.Bool("enable-host", defaultEnableHost, "Enable Host Agent module")
enableDockerFlag := flag.Bool("enable-docker", defaultEnableDocker, "Enable Docker Agent module")
enableKubernetesFlag := flag.Bool("enable-kubernetes", defaultEnableKubernetes, "Enable Kubernetes Agent module")
enableProxmoxFlag := flag.Bool("enable-proxmox", defaultEnableProxmox, "Enable Proxmox mode (creates API token, registers node)")
proxmoxTypeFlag := flag.String("proxmox-type", envProxmoxType, "Proxmox type: pve or pbs (auto-detected if not specified)")
disableAutoUpdateFlag := flag.Bool("disable-auto-update", utils.ParseBool(envDisableAutoUpdate), "Disable automatic updates")
healthAddrFlag := flag.String("health-addr", defaultHealthAddr, "Health/metrics server address (empty to disable)")
kubeconfigFlag := flag.String("kubeconfig", envKubeconfig, "Path to kubeconfig (optional; uses in-cluster config if available)")
kubeContextFlag := flag.String("kube-context", envKubeContext, "Kubeconfig context (optional)")
kubeIncludeAllPodsFlag := flag.Bool("kube-include-all-pods", utils.ParseBool(envKubeIncludeAllPods), "Include all non-succeeded pods (may be large)")
kubeIncludeAllDeploymentsFlag := flag.Bool("kube-include-all-deployments", utils.ParseBool(envKubeIncludeAllDeployments), "Include all deployments, not just problem ones")
kubeMaxPodsFlag := flag.Int("kube-max-pods", defaultInt(envKubeMaxPods, 200), "Max pods included in report")
showVersion := flag.Bool("version", false, "Print the agent version and exit")
var tagFlags multiValue
flag.Var(&tagFlags, "tag", "Tag to apply (repeatable)")
var kubeIncludeNamespaceFlags multiValue
flag.Var(&kubeIncludeNamespaceFlags, "kube-include-namespace", "Namespace to include (repeatable; default is all)")
var kubeExcludeNamespaceFlags multiValue
flag.Var(&kubeExcludeNamespaceFlags, "kube-exclude-namespace", "Namespace to exclude (repeatable)")
flag.Parse()
if *showVersion {
fmt.Println(Version)
os.Exit(0)
}
// Validation
pulseURL := strings.TrimSpace(*urlFlag)
if pulseURL == "" {
pulseURL = "http://localhost:7655"
}
token := strings.TrimSpace(*tokenFlag)
if token == "" {
fmt.Fprintln(os.Stderr, "error: Pulse API token is required")
os.Exit(1)
}
logLevel, err := parseLogLevel(*logLevelFlag)
if err != nil {
logLevel = zerolog.InfoLevel
}
tags := gatherTags(envTags, tagFlags)
kubeIncludeNamespaces := gatherCSV(envKubeIncludeNamespaces, kubeIncludeNamespaceFlags)
kubeExcludeNamespaces := gatherCSV(envKubeExcludeNamespaces, kubeExcludeNamespaceFlags)
// Check if Docker was explicitly configured via flag or env
dockerConfigured := envEnableDocker != ""
if !dockerConfigured {
flag.Visit(func(f *flag.Flag) {
if f.Name == "enable-docker" {
dockerConfigured = true
}
})
}
return Config{
PulseURL: pulseURL,
APIToken: token,
Interval: *intervalFlag,
HostnameOverride: strings.TrimSpace(*hostnameFlag),
AgentID: strings.TrimSpace(*agentIDFlag),
Tags: tags,
InsecureSkipVerify: *insecureFlag,
LogLevel: logLevel,
EnableHost: *enableHostFlag,
EnableDocker: *enableDockerFlag,
DockerConfigured: dockerConfigured,
EnableKubernetes: *enableKubernetesFlag,
EnableProxmox: *enableProxmoxFlag,
ProxmoxType: strings.TrimSpace(*proxmoxTypeFlag),
DisableAutoUpdate: *disableAutoUpdateFlag,
HealthAddr: strings.TrimSpace(*healthAddrFlag),
KubeconfigPath: strings.TrimSpace(*kubeconfigFlag),
KubeContext: strings.TrimSpace(*kubeContextFlag),
KubeIncludeNamespaces: kubeIncludeNamespaces,
KubeExcludeNamespaces: kubeExcludeNamespaces,
KubeIncludeAllPods: *kubeIncludeAllPodsFlag,
KubeIncludeAllDeployments: *kubeIncludeAllDeploymentsFlag,
KubeMaxPods: *kubeMaxPodsFlag,
}
}
func gatherTags(env string, flags []string) []string {
tags := make([]string, 0)
if env != "" {
for _, tag := range strings.Split(env, ",") {
tag = strings.TrimSpace(tag)
if tag != "" {
tags = append(tags, tag)
}
}
}
for _, tag := range flags {
tag = strings.TrimSpace(tag)
if tag != "" {
tags = append(tags, tag)
}
}
return tags
}
func gatherCSV(env string, flags []string) []string {
values := make([]string, 0)
if env != "" {
for _, value := range strings.Split(env, ",") {
value = strings.TrimSpace(value)
if value != "" {
values = append(values, value)
}
}
}
for _, value := range flags {
value = strings.TrimSpace(value)
if value != "" {
values = append(values, value)
}
}
return values
}
func defaultInt(value string, fallback int) int {
value = strings.TrimSpace(value)
if value == "" {
return fallback
}
parsed, err := strconv.Atoi(value)
if err != nil {
return fallback
}
return parsed
}
func parseLogLevel(value string) (zerolog.Level, error) {
normalized := strings.ToLower(strings.TrimSpace(value))
if normalized == "" {
return zerolog.InfoLevel, nil
}
return zerolog.ParseLevel(normalized)
}
func defaultLogLevel(envValue string) string {
if strings.TrimSpace(envValue) == "" {
return "info"
}
return envValue
}
// initDockerWithRetry attempts to initialize the Docker agent with exponential backoff.
// It returns the agent when Docker becomes available, or nil if the context is cancelled.
// Retry intervals: 5s, 10s, 20s, 40s, 80s, 160s, then cap at 5 minutes.
func initDockerWithRetry(ctx context.Context, cfg dockeragent.Config, logger *zerolog.Logger) *dockeragent.Agent {
const (
initialDelay = 5 * time.Second
maxDelay = 5 * time.Minute
multiplier = 2.0
)
delay := initialDelay
attempt := 0
for {
select {
case <-ctx.Done():
logger.Info().Msg("Docker retry cancelled, context done")
return nil
default:
}
attempt++
logger.Info().
Int("attempt", attempt).
Str("next_retry", delay.String()).
Msg("Waiting to retry Docker connection")
select {
case <-ctx.Done():
logger.Info().Msg("Docker retry cancelled during wait")
return nil
case <-time.After(delay):
}
agent, err := dockeragent.New(cfg)
if err == nil {
logger.Info().
Int("attempts", attempt).
Msg("Successfully connected to Docker after retry")
return agent
}
logger.Warn().
Err(err).
Int("attempt", attempt).
Str("next_retry", (time.Duration(float64(delay) * multiplier)).String()).
Msg("Docker still not available, will retry")
// Calculate next delay with exponential backoff, capped at maxDelay
delay = time.Duration(float64(delay) * multiplier)
if delay > maxDelay {
delay = maxDelay
}
}
}
// initKubernetesWithRetry attempts to initialize the Kubernetes agent with exponential backoff.
// It returns the agent when Kubernetes becomes available, or nil if the context is cancelled.
// Retry intervals: 5s, 10s, 20s, 40s, 80s, 160s, then cap at 5 minutes.
func initKubernetesWithRetry(ctx context.Context, cfg kubernetesagent.Config, logger *zerolog.Logger) *kubernetesagent.Agent {
const (
initialDelay = 5 * time.Second
maxDelay = 5 * time.Minute
multiplier = 2.0
)
delay := initialDelay
attempt := 0
for {
select {
case <-ctx.Done():
logger.Info().Msg("Kubernetes retry cancelled, context done")
return nil
default:
}
attempt++
logger.Info().
Int("attempt", attempt).
Str("next_retry", delay.String()).
Msg("Waiting to retry Kubernetes connection")
select {
case <-ctx.Done():
logger.Info().Msg("Kubernetes retry cancelled during wait")
return nil
case <-time.After(delay):
}
agent, err := kubernetesagent.New(cfg)
if err == nil {
logger.Info().
Int("attempts", attempt).
Msg("Successfully connected to Kubernetes after retry")
return agent
}
logger.Warn().
Err(err).
Int("attempt", attempt).
Str("next_retry", (time.Duration(float64(delay) * multiplier)).String()).
Msg("Kubernetes still not available, will retry")
delay = time.Duration(float64(delay) * multiplier)
if delay > maxDelay {
delay = maxDelay
}
}
}