Pulse/internal/cloudcp/health_monitor.go
2026-03-18 16:06:30 +00:00

137 lines
3.6 KiB
Go

package cloudcp
import (
"context"
"time"
"github.com/rcourtman/pulse-go-rewrite/internal/cloudcp/cpmetrics"
"github.com/rcourtman/pulse-go-rewrite/internal/cloudcp/docker"
"github.com/rcourtman/pulse-go-rewrite/internal/cloudcp/registry"
"github.com/rs/zerolog/log"
)
// MonitorConfig carries the tunable settings of the health monitor.
type MonitorConfig struct {
	// Interval is how often to check (default 60s).
	Interval time.Duration

	// RestartOnFail enables restarting unhealthy containers.
	RestartOnFail bool

	// FailThreshold is the number of consecutive failures before a restart
	// (default 3).
	FailThreshold int
}
// Monitor health-checks the containers of active tenants on a fixed interval
// and, when configured to, restarts the ones that stay unhealthy.
type Monitor struct {
	// registry is the tenant store: active tenants are listed from it and
	// health results are written back to it.
	registry *registry.TenantRegistry

	// docker performs the container health checks and stops.
	docker *docker.Manager

	// cfg holds the monitor settings.
	cfg MonitorConfig

	// failures counts consecutive failed health checks, keyed by container ID.
	failures map[string]int
}
// NewMonitor creates a health monitor that lists tenants from reg and checks
// their containers through mgr.
//
// Non-positive settings in cfg are treated as unset and replaced by defaults:
// Interval becomes 60s and FailThreshold becomes 3. Normalizing negative values
// (not just zero) matters because Run passes Interval to time.NewTicker, which
// panics on a non-positive duration, and a negative FailThreshold would be
// satisfied by any failure count.
func NewMonitor(reg *registry.TenantRegistry, mgr *docker.Manager, cfg MonitorConfig) *Monitor {
	if cfg.Interval <= 0 {
		cfg.Interval = 60 * time.Second
	}
	if cfg.FailThreshold <= 0 {
		cfg.FailThreshold = 3
	}
	return &Monitor{
		registry: reg,
		docker:   mgr,
		cfg:      cfg,
		failures: make(map[string]int),
	}
}
// Run drives the health check loop: every configured interval it performs one
// pass over the active tenants. The first pass happens after one full interval
// has elapsed. Run blocks and returns only once ctx is cancelled.
func (m *Monitor) Run(ctx context.Context) {
	started := log.Info().
		Dur("interval", m.cfg.Interval).
		Bool("restart_on_fail", m.cfg.RestartOnFail)
	started.Msg("Health monitor started")

	tick := time.NewTicker(m.cfg.Interval)
	defer tick.Stop()

	done := ctx.Done()
	for {
		select {
		case <-tick.C:
			m.checkAll(ctx)
		case <-done:
			log.Info().Msg("Health monitor stopped")
			return
		}
	}
}
// checkAll runs one health check pass over every active tenant.
//
// For each active tenant that has a container ID it runs the Docker health
// check, records the outcome on the tenant (LastHealthCheck, HealthCheckOK),
// increments the HealthCheckResults metric, and persists the tenant through the
// registry. When RestartOnFail is set and the consecutive-failure count has
// reached FailThreshold, the container is stopped. At the end of the pass,
// failure counters for containers that no longer belong to an active tenant are
// dropped.
//
// The pass is abandoned as soon as ctx is cancelled. A health check that was
// interrupted by cancellation is discarded rather than recorded, so shutting
// the monitor down never marks a tenant unhealthy or triggers a restart.
func (m *Monitor) checkAll(ctx context.Context) {
	tenants, err := m.registry.ListByState(registry.TenantStateActive)
	if err != nil {
		log.Error().Err(err).Msg("Health monitor: failed to list active tenants")
		return
	}

	activeContainerIDs := make(map[string]struct{})
	for _, tenant := range tenants {
		if ctx.Err() != nil {
			return
		}
		containerID := tenant.ContainerID
		if containerID == "" {
			continue
		}
		activeContainerIDs[containerID] = struct{}{}

		healthy, err := m.docker.HealthCheck(ctx, containerID)
		if ctx.Err() != nil {
			// Cancelled while the check was in flight: the result says nothing
			// about the container, so do not count or persist it.
			return
		}
		if err != nil {
			// A check error is logged and then handled like any other result.
			log.Warn().Err(err).
				Str("tenant_id", tenant.ID).
				Str("container_id", containerID).
				Msg("Health check error")
		}

		now := time.Now().UTC()
		tenant.LastHealthCheck = &now
		tenant.HealthCheckOK = healthy

		if healthy {
			cpmetrics.HealthCheckResults.WithLabelValues("healthy").Inc()
			m.failures[containerID] = 0
		} else {
			cpmetrics.HealthCheckResults.WithLabelValues("unhealthy").Inc()
			m.failures[containerID]++
		}

		if err := m.registry.Update(tenant); err != nil {
			log.Error().
				Err(err).
				Str("tenant_id", tenant.ID).
				Str("container_id", containerID).
				Msg("Failed to update health status")
			// The restart decision is skipped for this pass; the failure count
			// is kept, so the next pass re-evaluates it.
			continue
		}

		failures := m.failures[containerID]
		if healthy || !m.cfg.RestartOnFail || failures < m.cfg.FailThreshold {
			continue
		}

		log.Warn().
			Str("tenant_id", tenant.ID).
			Str("container_id", containerID).
			Int("consecutive_failures", failures).
			Int("fail_threshold", m.cfg.FailThreshold).
			Msg("Container unhealthy, attempting restart")
		if err := m.docker.Stop(ctx, containerID); err != nil {
			log.Error().
				Err(err).
				Str("tenant_id", tenant.ID).
				Str("container_id", containerID).
				Msg("Failed to stop unhealthy container")
		}
		// The intent is that the container's Docker restart policy
		// (unless-stopped) brings it back up after the stop.
		// NOTE(review): Docker documents that an explicitly stopped container is
		// not restarted by its restart policy — confirm that docker.Manager.Stop
		// or another component actually starts it again.
		//
		// The counter is reset whether or not the stop succeeded, so another
		// attempt needs FailThreshold further consecutive failures.
		m.failures[containerID] = 0
	}

	// Forget counters for containers that are no longer tied to an active tenant.
	for id := range m.failures {
		if _, ok := activeContainerIDs[id]; !ok {
			delete(m.failures, id)
		}
	}
}