mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-05-07 00:37:36 +00:00
137 lines
3.6 KiB
Go
137 lines
3.6 KiB
Go
package cloudcp
|
|
|
|
import (
|
|
"context"
|
|
"time"
|
|
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/cloudcp/cpmetrics"
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/cloudcp/docker"
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/cloudcp/registry"
|
|
"github.com/rs/zerolog/log"
|
|
)
|
|
|
|
// MonitorConfig holds health monitor settings.
|
|
type MonitorConfig struct {
|
|
Interval time.Duration // how often to check (default 60s)
|
|
RestartOnFail bool // restart unhealthy containers
|
|
FailThreshold int // consecutive failures before restart (default 3)
|
|
}
|
|
|
|
// Monitor periodically health-checks active tenant containers and optionally
|
|
// restarts unhealthy ones.
|
|
type Monitor struct {
|
|
registry *registry.TenantRegistry
|
|
docker *docker.Manager
|
|
cfg MonitorConfig
|
|
failures map[string]int
|
|
}
|
|
|
|
// NewMonitor creates a health monitor.
|
|
func NewMonitor(reg *registry.TenantRegistry, mgr *docker.Manager, cfg MonitorConfig) *Monitor {
|
|
if cfg.Interval == 0 {
|
|
cfg.Interval = 60 * time.Second
|
|
}
|
|
if cfg.FailThreshold == 0 {
|
|
cfg.FailThreshold = 3
|
|
}
|
|
return &Monitor{
|
|
registry: reg,
|
|
docker: mgr,
|
|
cfg: cfg,
|
|
failures: make(map[string]int),
|
|
}
|
|
}
|
|
|
|
// Run starts the health check loop. It blocks until ctx is cancelled.
|
|
func (m *Monitor) Run(ctx context.Context) {
|
|
log.Info().
|
|
Dur("interval", m.cfg.Interval).
|
|
Bool("restart_on_fail", m.cfg.RestartOnFail).
|
|
Msg("Health monitor started")
|
|
|
|
ticker := time.NewTicker(m.cfg.Interval)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
log.Info().Msg("Health monitor stopped")
|
|
return
|
|
case <-ticker.C:
|
|
m.checkAll(ctx)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (m *Monitor) checkAll(ctx context.Context) {
|
|
activeContainerIDs := make(map[string]struct{})
|
|
tenants, err := m.registry.ListByState(registry.TenantStateActive)
|
|
if err != nil {
|
|
log.Error().Err(err).Msg("Health monitor: failed to list active tenants")
|
|
return
|
|
}
|
|
|
|
for _, tenant := range tenants {
|
|
if ctx.Err() != nil {
|
|
return
|
|
}
|
|
if tenant.ContainerID == "" {
|
|
continue
|
|
}
|
|
activeContainerIDs[tenant.ContainerID] = struct{}{}
|
|
|
|
healthy, err := m.docker.HealthCheck(ctx, tenant.ContainerID)
|
|
if err != nil {
|
|
log.Warn().Err(err).
|
|
Str("tenant_id", tenant.ID).
|
|
Str("container_id", tenant.ContainerID).
|
|
Msg("Health check error")
|
|
}
|
|
|
|
now := time.Now().UTC()
|
|
tenant.LastHealthCheck = &now
|
|
tenant.HealthCheckOK = healthy
|
|
|
|
if healthy {
|
|
cpmetrics.HealthCheckResults.WithLabelValues("healthy").Inc()
|
|
m.failures[tenant.ContainerID] = 0
|
|
} else {
|
|
cpmetrics.HealthCheckResults.WithLabelValues("unhealthy").Inc()
|
|
m.failures[tenant.ContainerID] = m.failures[tenant.ContainerID] + 1
|
|
}
|
|
|
|
if err := m.registry.Update(tenant); err != nil {
|
|
log.Error().
|
|
Err(err).
|
|
Str("tenant_id", tenant.ID).
|
|
Str("container_id", tenant.ContainerID).
|
|
Msg("Failed to update health status")
|
|
continue
|
|
}
|
|
|
|
if !healthy && m.cfg.RestartOnFail && m.failures[tenant.ContainerID] >= m.cfg.FailThreshold {
|
|
log.Warn().
|
|
Str("tenant_id", tenant.ID).
|
|
Str("container_id", tenant.ContainerID).
|
|
Int("consecutive_failures", m.failures[tenant.ContainerID]).
|
|
Int("fail_threshold", m.cfg.FailThreshold).
|
|
Msg("Container unhealthy, attempting restart")
|
|
|
|
if err := m.docker.Stop(ctx, tenant.ContainerID); err != nil {
|
|
log.Error().
|
|
Err(err).
|
|
Str("tenant_id", tenant.ID).
|
|
Str("container_id", tenant.ContainerID).
|
|
Msg("Failed to stop unhealthy container")
|
|
}
|
|
// Docker restart policy (unless-stopped) will restart the container
|
|
m.failures[tenant.ContainerID] = 0
|
|
}
|
|
}
|
|
|
|
for containerID := range m.failures {
|
|
if _, ok := activeContainerIDs[containerID]; !ok {
|
|
delete(m.failures, containerID)
|
|
}
|
|
}
|
|
}
|