Pulse/internal/monitoring/scheduler.go
rcourtman 9b1709a05b feat: enhance scheduler health API with rich instance metadata
Add comprehensive instance-level diagnostics to /api/monitoring/scheduler/health

**New Response Structure:**

Enhanced "instances" array with per-instance details:
- Instance metadata: displayName, type, connection URL
- Poll status: last success/error timestamps, error messages, error category
- Circuit breaker: state, timestamps, failure counts, retry windows
- Dead letter: present flag, reason, attempt history, retry schedule

**Implementation:**

Data structures:
- instanceInfo: cache of display names, URLs, types
- pollStatus: tracks successes/errors with timestamps and categories
- dlqInsight: DLQ entry metadata (reason, attempts, schedule)
- circuitBreaker: enhanced with stateSince, lastTransition

Tracking logic:
- buildInstanceInfoCache: populate metadata from config on startup
- recordTaskResult: track poll outcomes, error details, categories
- sendToDeadLetter: capture DLQ insights (reason, timestamps)
- circuitBreaker: record state transitions with timestamps

**Backward Compatible:**
- Existing fields (deadLetter, breakers, staleness) unchanged
- New "instances" array is additive
- Old clients can ignore new fields

**Testing:**
- Unit test: TestSchedulerHealth_EnhancedResponse validates all fields
- Integration tests: still passing (55s)
- All error tracking and breaker history verified

**Operator Benefits:**
- Diagnose issues without log digging
- See error messages directly in API
- Understand breaker states and retry schedules
- Track DLQ entries with full context
- Single API call for complete instance health view

Example: Quickly identify a "401 unauthorized" error on a specific PBS instance,
see that it is in the DLQ after 5 retries, and know when the next retry is scheduled.

Part of Phase 2 follow-up work to improve observability.
2025-10-20 15:13:38 +00:00

404 lines
9.7 KiB
Go

package monitoring
import (
"context"
"math/rand"
"sort"
"sync"
"time"
"github.com/rs/zerolog/log"
)
// InstanceType represents a polling target category.
// It is a string-backed type; in this file its values feed scheduler keys and
// are emitted as the "type" field of log entries.
type InstanceType string
// Instance type identifiers declared for the scheduler.
// NOTE(review): the names presumably refer to Proxmox VE, Proxmox Backup
// Server and Proxmox Mail Gateway respectively — confirm against the wider project.
const (
InstanceTypePVE InstanceType = "pve"
InstanceTypePBS InstanceType = "pbs"
InstanceTypePMG InstanceType = "pmg"
)
// StalenessSource provides normalized freshness hints for an instance.
type StalenessSource interface {
// StalenessScore returns the score for the given instance and whether a score
// is available. BuildPlan substitutes 0 when the boolean is false; the
// adaptive interval selector clamps the score to [0, 1] and maps higher
// scores to shorter target intervals.
StalenessScore(instanceType InstanceType, instanceName string) (float64, bool)
}
// IntervalSelector chooses the next polling cadence for an instance.
type IntervalSelector interface {
// SelectInterval returns the interval to wait before the next poll of the
// instance described by req. BuildPlan replaces a non-positive result with
// the configured BaseInterval and clamps the value to [MinInterval, MaxInterval].
SelectInterval(req IntervalRequest) time.Duration
}
// TaskEnqueuer receives scheduled tasks for downstream execution.
type TaskEnqueuer interface {
// Enqueue accepts a single due task. DispatchDue logs a warning when an
// error is returned and continues with the remaining tasks.
Enqueue(ctx context.Context, task ScheduledTask) error
}
// IntervalRequest bundles the context required to compute the next polling interval.
// BuildPlan fills one request per instance before calling the IntervalSelector.
type IntervalRequest struct {
// Now is the planning timestamp passed to BuildPlan.
Now time.Time
// BaseInterval is the scheduler's configured default interval.
BaseInterval time.Duration
// MinInterval is the scheduler's configured lower bound.
MinInterval time.Duration
// MaxInterval is the scheduler's configured upper bound.
MaxInterval time.Duration
// LastInterval is the descriptor's last interval, falling back to the cached
// plan's interval and finally to BaseInterval when unset.
LastInterval time.Duration
// LastSuccess is copied from the instance descriptor.
LastSuccess time.Time
// LastScheduled is the descriptor's last scheduled time, falling back to the
// NextRun of the cached plan entry when the descriptor's value is zero.
LastScheduled time.Time
// StalenessScore is the value reported by the StalenessSource (0 when unavailable).
StalenessScore float64
// ErrorCount is copied from the instance descriptor.
ErrorCount int
// QueueDepth is the caller-supplied queue depth plus the number of tasks
// already planned earlier in the same BuildPlan pass.
QueueDepth int
// InstanceKey is the scheduler key derived from the instance type and name.
InstanceKey string
// InstanceType is the category of the instance being scheduled.
InstanceType InstanceType
}
// InstanceDescriptor describes a monitored endpoint for scheduling purposes.
// It is the per-instance input to BuildPlan.
type InstanceDescriptor struct {
// Name identifies the instance; together with Type it forms the scheduler key.
Name string
// Type is the instance category.
Type InstanceType
// LastSuccess is used as the base for the next run when no last scheduled
// time is known.
LastSuccess time.Time
// LastFailure is carried on the descriptor but not consulted by BuildPlan.
LastFailure time.Time
// LastScheduled, when non-zero, is the base from which the next run is computed.
LastScheduled time.Time
// LastInterval is the previously used interval; zero means "unknown".
LastInterval time.Duration
// ErrorCount is forwarded to the interval selector.
ErrorCount int
// Metadata is passed through to the resulting ScheduledTask without copying.
Metadata map[string]any
}
// ScheduledTask represents a single polling opportunity planned by the scheduler.
type ScheduledTask struct {
// InstanceName is the name of the instance to poll.
InstanceName string
// InstanceType is the category of the instance to poll.
InstanceType InstanceType
// NextRun is the earliest time the task should execute; BuildPlan never
// sets it earlier than the planning timestamp.
NextRun time.Time
// Interval is the polling interval selected for this task.
Interval time.Duration
// Priority is set from the staleness score; among tasks with equal NextRun
// the higher priority sorts first.
Priority float64
// Metadata is the descriptor's metadata map, shared rather than copied.
Metadata map[string]any
}
// SchedulerConfig contains tunables for the adaptive scheduler.
// NewAdaptiveScheduler replaces non-positive values with the defaults.
type SchedulerConfig struct {
// BaseInterval is the fallback interval used when no better value is known.
BaseInterval time.Duration
// MinInterval is the shortest interval BuildPlan will schedule.
MinInterval time.Duration
// MaxInterval is the longest interval BuildPlan will schedule.
MaxInterval time.Duration
}
// DefaultSchedulerConfig returns conservative defaults that preserve current
// behaviour: a 10s base interval bounded by a 5s minimum and a 5m maximum.
func DefaultSchedulerConfig() SchedulerConfig {
	var defaults SchedulerConfig
	defaults.BaseInterval = 10 * time.Second
	defaults.MinInterval = 5 * time.Second
	defaults.MaxInterval = 5 * time.Minute
	return defaults
}
// AdaptiveScheduler orchestrates poll execution plans using pluggable scoring strategies.
type AdaptiveScheduler struct {
// cfg holds the normalized interval bounds.
cfg SchedulerConfig
// staleness supplies per-instance freshness scores.
staleness StalenessSource
// interval picks the next polling interval for each instance.
interval IntervalSelector
// enqueuer receives due tasks from DispatchDue.
enqueuer TaskEnqueuer
// mu guards lastPlan: BuildPlan takes the write lock, LastScheduled the read lock.
mu sync.RWMutex
// lastPlan remembers the most recent task planned per scheduler key.
lastPlan map[string]ScheduledTask
}
// NewAdaptiveScheduler constructs a scheduler with safe defaults.
//
// Non-positive intervals in cfg are replaced by the values from
// DefaultSchedulerConfig. A MaxInterval that is unset or below MinInterval is
// replaced by the default maximum; if that is still below MinInterval (the
// caller asked for a minimum larger than the default maximum), MaxInterval is
// raised to MinInterval so the bounds are never inverted.
//
// Nil collaborators are replaced as follows: staleness by a source that never
// reports a score, interval by the adaptive interval selector, and enqueuer by
// a sink that discards tasks.
func NewAdaptiveScheduler(cfg SchedulerConfig, staleness StalenessSource, interval IntervalSelector, enqueuer TaskEnqueuer) *AdaptiveScheduler {
	defaults := DefaultSchedulerConfig()

	if cfg.BaseInterval <= 0 {
		cfg.BaseInterval = defaults.BaseInterval
	}
	if cfg.MinInterval <= 0 {
		cfg.MinInterval = defaults.MinInterval
	}
	if cfg.MaxInterval <= 0 || cfg.MaxInterval < cfg.MinInterval {
		cfg.MaxInterval = defaults.MaxInterval
	}
	// The default maximum can itself be smaller than a caller-supplied minimum.
	// Without this guard BuildPlan's clamping would emit intervals below
	// MinInterval.
	if cfg.MaxInterval < cfg.MinInterval {
		cfg.MaxInterval = cfg.MinInterval
	}

	if staleness == nil {
		staleness = noopStalenessSource{}
	}
	if interval == nil {
		interval = newAdaptiveIntervalSelector(cfg)
	}
	if enqueuer == nil {
		enqueuer = noopTaskEnqueuer{}
	}

	return &AdaptiveScheduler{
		cfg:       cfg,
		staleness: staleness,
		interval:  interval,
		enqueuer:  enqueuer,
		lastPlan:  make(map[string]ScheduledTask),
	}
}
// BuildPlan produces an ordered set of scheduled tasks for the supplied inventory.
//
// For every instance it asks the interval selector for the next polling
// interval (falling back to BaseInterval for non-positive results and clamping
// to [MinInterval, MaxInterval]), derives the next run time from the last
// scheduled run or, failing that, the last success, and records the task so
// that later plans and LastScheduled can consult it. A next run in the past is
// moved up to now.
//
// queueDepth is the number of tasks already waiting; each task planned in this
// pass increases the depth reported to the selector by one.
//
// A nil scheduler or an empty inventory yields nil. The result is ordered by
// NextRun (earliest first), then descending Priority, then instance name and
// finally instance type, so the order is fully deterministic.
func (s *AdaptiveScheduler) BuildPlan(now time.Time, inventory []InstanceDescriptor, queueDepth int) []ScheduledTask {
	// Match the nil-receiver tolerance of DispatchDue and LastScheduled.
	if s == nil || len(inventory) == 0 {
		return nil
	}

	s.mu.Lock()
	defer s.mu.Unlock()

	tasks := make([]ScheduledTask, 0, len(inventory))
	for _, inst := range inventory {
		score, ok := s.staleness.StalenessScore(inst.Type, inst.Name)
		if !ok {
			score = 0
		}

		// Computed once; used for the cache lookup, the request and the cache store.
		key := schedulerKey(inst.Type, inst.Name)

		// Prefer the descriptor's own history, then the previous plan.
		lastScheduled := inst.LastScheduled
		lastInterval := inst.LastInterval
		if cached, exists := s.lastPlan[key]; exists {
			if lastScheduled.IsZero() {
				lastScheduled = cached.NextRun
			}
			if lastInterval == 0 {
				lastInterval = cached.Interval
			}
		}
		if lastInterval == 0 {
			lastInterval = s.cfg.BaseInterval
		}

		req := IntervalRequest{
			Now:            now,
			BaseInterval:   s.cfg.BaseInterval,
			MinInterval:    s.cfg.MinInterval,
			MaxInterval:    s.cfg.MaxInterval,
			LastInterval:   lastInterval,
			LastSuccess:    inst.LastSuccess,
			LastScheduled:  lastScheduled,
			StalenessScore: score,
			ErrorCount:     inst.ErrorCount,
			QueueDepth:     queueDepth + len(tasks),
			InstanceKey:    key,
			InstanceType:   inst.Type,
		}

		nextInterval := s.interval.SelectInterval(req)
		if nextInterval <= 0 {
			nextInterval = s.cfg.BaseInterval
		}
		if nextInterval < s.cfg.MinInterval {
			nextInterval = s.cfg.MinInterval
		}
		if nextInterval > s.cfg.MaxInterval {
			nextInterval = s.cfg.MaxInterval
		}

		nextRun := now
		if !lastScheduled.IsZero() {
			nextRun = lastScheduled.Add(nextInterval)
		} else if !inst.LastSuccess.IsZero() {
			nextRun = inst.LastSuccess.Add(nextInterval)
		}
		// Overdue instances run immediately rather than in the past.
		if nextRun.Before(now) {
			nextRun = now
		}

		task := ScheduledTask{
			InstanceName: inst.Name,
			InstanceType: inst.Type,
			NextRun:      nextRun,
			Interval:     nextInterval,
			Priority:     score,
			Metadata:     inst.Metadata,
		}
		s.lastPlan[key] = task
		tasks = append(tasks, task)
	}

	sort.Slice(tasks, func(i, j int) bool {
		a, b := &tasks[i], &tasks[j]
		if !a.NextRun.Equal(b.NextRun) {
			return a.NextRun.Before(b.NextRun)
		}
		if a.Priority != b.Priority {
			return a.Priority > b.Priority
		}
		if a.InstanceName != b.InstanceName {
			return a.InstanceName < b.InstanceName
		}
		// sort.Slice is not stable; the type breaks ties between same-named
		// instances of different categories.
		return a.InstanceType < b.InstanceType
	})
	return tasks
}
// FilterDue returns the tasks whose NextRun is at or before now, in their
// original order. An empty input yields nil.
func (s *AdaptiveScheduler) FilterDue(now time.Time, tasks []ScheduledTask) []ScheduledTask {
	count := len(tasks)
	if count == 0 {
		return nil
	}

	due := make([]ScheduledTask, 0, count)
	for i := range tasks {
		if tasks[i].NextRun.After(now) {
			continue // not due yet
		}
		due = append(due, tasks[i])
	}
	return due
}
// DispatchDue enqueues due tasks using the configured sink for tracking purposes.
//
// It returns the tasks that were due at now. Enqueue failures are logged as
// warnings and do not remove the task from the result. A nil scheduler
// performs no filtering and returns tasks unchanged.
func (s *AdaptiveScheduler) DispatchDue(ctx context.Context, now time.Time, tasks []ScheduledTask) []ScheduledTask {
	if s == nil {
		return tasks
	}

	due := s.FilterDue(now, tasks)
	for i := range due {
		task := due[i]
		err := s.enqueuer.Enqueue(ctx, task)
		if err == nil {
			continue
		}
		log.Warn().
			Err(err).
			Str("instance", task.InstanceName).
			Str("type", string(task.InstanceType)).
			Msg("Failed to enqueue scheduled task")
	}
	return due
}
// LastScheduled returns the last recorded task for the given instance, if any.
// The boolean is false when the scheduler is nil or no plan has been recorded
// for the instance.
func (s *AdaptiveScheduler) LastScheduled(instanceType InstanceType, instanceName string) (ScheduledTask, bool) {
	if s == nil {
		return ScheduledTask{}, false
	}

	key := schedulerKey(instanceType, instanceName)

	s.mu.RLock()
	planned, found := s.lastPlan[key]
	s.mu.RUnlock()

	return planned, found
}
// noopStalenessSource is the StalenessSource used when NewAdaptiveScheduler is
// given a nil source.
type noopStalenessSource struct{}
// StalenessScore always reports that no score is available (0, false).
func (noopStalenessSource) StalenessScore(instanceType InstanceType, instanceName string) (float64, bool) {
return 0, false
}
// fixedIntervalSelector is an IntervalSelector that ignores adaptive inputs
// and always answers with one configured interval.
type fixedIntervalSelector struct {
	// interval is the fixed answer; non-positive means "use the request's base".
	interval time.Duration
}

// SelectInterval returns the configured interval, or req.BaseInterval when no
// positive interval has been configured.
func (f *fixedIntervalSelector) SelectInterval(req IntervalRequest) time.Duration {
	if f.interval <= 0 {
		return req.BaseInterval
	}
	return f.interval
}
// adaptiveIntervalSelector is an IntervalSelector that derives a target
// interval from the staleness score, error count and queue depth, smooths it
// against the previous value for the same key, and applies random jitter.
type adaptiveIntervalSelector struct {
// mu is held by SelectInterval while it reads and writes state and draws from rng.
mu sync.Mutex
// state stores the last smoothed (pre-jitter) interval per instance key.
state map[string]time.Duration
// rng is the random source for jitter.
rng *rand.Rand
// alpha is the weight given to the new target when smoothing; the previous
// value receives 1-alpha.
alpha float64
// jitterFraction is the maximum relative jitter applied in either direction.
jitterFraction float64
// queueStretch is the relative lengthening applied per queued task beyond the first.
queueStretch float64
// errorPenalty scales how strongly each recorded error divides the target interval.
errorPenalty float64
}
// newAdaptiveIntervalSelector builds an adaptive selector with its built-in
// tuning constants and a time-seeded random source. cfg is currently unused.
func newAdaptiveIntervalSelector(cfg SchedulerConfig) *adaptiveIntervalSelector {
	seed := time.Now().UnixNano()

	sel := new(adaptiveIntervalSelector)
	sel.state = make(map[string]time.Duration)
	sel.rng = rand.New(rand.NewSource(seed))
	sel.alpha = 0.6
	sel.jitterFraction = 0.05
	sel.queueStretch = 0.1
	sel.errorPenalty = 0.6
	return sel
}
// SelectInterval computes the next polling interval for the instance in req.
//
// The staleness score (clamped to [0, 1]) picks a target between the maximum
// interval (score 0) and the minimum interval (score 1). Recorded errors
// divide the target, and a queue depth above one stretches it. The target is
// then blended with the previous smoothed value for the same key (or the last
// or base interval on first sight), stored, jittered and clamped to
// [MinInterval, MaxInterval]. A MaxInterval that is non-positive or below
// MinInterval is treated as equal to MinInterval.
func (a *adaptiveIntervalSelector) SelectInterval(req IntervalRequest) time.Duration {
	lo, hi := req.MinInterval, req.MaxInterval
	if hi <= 0 || hi < lo {
		hi = lo
	}

	// bound confines d to [lo, hi].
	bound := func(d time.Duration) time.Duration {
		if d < lo {
			d = lo
		}
		if d > hi {
			d = hi
		}
		return d
	}

	score := clampFloat(req.StalenessScore, 0, 1)
	span := float64(hi - lo)
	target := bound(time.Duration(float64(lo) + span*(1-score)))

	if req.ErrorCount > 0 {
		if penalty := 1 + a.errorPenalty*float64(req.ErrorCount); penalty > 0 {
			target = time.Duration(float64(target) / penalty)
			if target < lo {
				target = lo
			}
		}
	}

	if req.QueueDepth > 1 {
		stretch := 1 + a.queueStretch*float64(req.QueueDepth-1)
		target = time.Duration(float64(target) * stretch)
		if target > hi {
			target = hi
		}
	}

	// Starting point for smoothing when this key has no history yet.
	fallback := req.LastInterval
	if fallback <= 0 {
		fallback = req.BaseInterval
	}

	key := req.InstanceKey
	if key == "" {
		key = string(req.InstanceType)
	}

	a.mu.Lock()
	base, seen := a.state[key]
	if !seen {
		base = fallback
	}
	smoothed := bound(time.Duration(a.alpha*float64(target) + (1-a.alpha)*float64(base)))
	// The pre-jitter value is what gets remembered for the next call.
	a.state[key] = smoothed
	jitter := 0.0
	if a.jitterFraction > 0 && smoothed > 0 {
		jitter = (a.rng.Float64()*2 - 1) * a.jitterFraction
	}
	a.mu.Unlock()

	if jitter == 0 {
		return smoothed
	}
	return bound(time.Duration(float64(smoothed) * (1 + jitter)))
}
// clampFloat limits v to the range [min, max]. The lower bound is checked
// first, so a value below min yields min even when min exceeds max.
func clampFloat(v, min, max float64) float64 {
	switch {
	case v < min:
		return min
	case v > max:
		return max
	default:
		return v
	}
}
// noopTaskEnqueuer is the TaskEnqueuer used when NewAdaptiveScheduler is given
// a nil enqueuer.
type noopTaskEnqueuer struct{}
// Enqueue discards the task and always returns nil.
func (noopTaskEnqueuer) Enqueue(ctx context.Context, task ScheduledTask) error {
return nil
}