Pulse/internal/monitoring/circuit_breaker.go
rcourtman 9b1709a05b feat: enhance scheduler health API with rich instance metadata
Add comprehensive instance-level diagnostics to /api/monitoring/scheduler/health

**New Response Structure:**

Enhanced "instances" array with per-instance details:
- Instance metadata: displayName, type, connection URL
- Poll status: last success/error timestamps, error messages, error category
- Circuit breaker: state, timestamps, failure counts, retry windows
- Dead letter: present flag, reason, attempt history, retry schedule

**Implementation:**

Data structures:
- instanceInfo: cache of display names, URLs, types
- pollStatus: tracks successes/errors with timestamps and categories
- dlqInsight: DLQ entry metadata (reason, attempts, schedule)
- circuitBreaker: enhanced with stateSince, lastTransition

Tracking logic:
- buildInstanceInfoCache: populate metadata from config on startup
- recordTaskResult: track poll outcomes, error details, categories
- sendToDeadLetter: capture DLQ insights (reason, timestamps)
- circuitBreaker: record state transitions with timestamps

**Backward Compatible:**
- Existing fields (deadLetter, breakers, staleness) unchanged
- New "instances" array is additive
- Old clients can ignore new fields

**Testing:**
- Unit test: TestSchedulerHealth_EnhancedResponse validates all fields
- Integration tests: still passing (55s)
- All error tracking and breaker history verified

**Operator Benefits:**
- Diagnose issues without log digging
- See error messages directly in API
- Understand breaker states and retry schedules
- Track DLQ entries with full context
- Single API call for complete instance health view

Example: Quickly identify a "401 unauthorized" error on a specific PBS instance,
see that it is in the DLQ after 5 retries, and know when the next retry is scheduled.

Part of Phase 2 follow-up work to improve observability.
2025-10-20 15:13:38 +00:00

155 lines
3.3 KiB
Go

package monitoring
import (
"sync"
"time"
)
// breakerState identifies which phase of its lifecycle a circuitBreaker is in.
type breakerState int

// Lifecycle phases of a circuitBreaker. breakerClosed is deliberately the
// zero value, so a zero breakerState reads as "closed".
const (
	// breakerClosed: allow admits every attempt.
	breakerClosed breakerState = iota
	// breakerOpen: allow rejects attempts until retryInterval has elapsed
	// since the breaker was opened.
	breakerOpen
	// breakerHalfOpen: allow admits at most one attempt per halfOpenWindow
	// while the outcome of a probe is awaited.
	breakerHalfOpen
)
// circuitBreaker holds the state machine (closed / open / half-open) and the
// backoff bookkeeping used to decide whether an attempt may proceed.
// The mutex makes it unsafe to copy; every method uses a pointer receiver.
type circuitBreaker struct {
// mu is locked by allow, recordSuccess, recordFailure and stateDetails
// before they read or write any of the fields below.
mu sync.Mutex
// state is the current lifecycle phase.
state breakerState
// failureCount is incremented by every recordFailure call; recordSuccess is
// the only place it is set back to zero. trip also uses it as the backoff
// shift amount.
failureCount int
// openedAt is the time passed to the most recent trip call.
openedAt time.Time
// lastAttempt is updated by recordFailure and by allow whenever it admits a
// probe out of the open or half-open state.
lastAttempt time.Time
// retryInterval is how long allow keeps rejecting after openedAt while open.
// trip overwrites it with the backed-off delay each time the breaker opens.
retryInterval time.Duration
// maxDelay is the upper bound trip applies to retryInterval.
maxDelay time.Duration
// openThreshold is the failureCount at which recordFailure trips a closed
// breaker.
openThreshold int
// halfOpenWindow is the minimum spacing between attempts that allow admits
// while half-open.
halfOpenWindow time.Duration
// stateSince is stamped on construction and on every state change.
stateSince time.Time
// lastTransition is stamped at the same points as stateSince.
// NOTE(review): in the code shown the two fields are always written
// together with the same value — confirm whether both are needed.
lastTransition time.Time
}
// newCircuitBreaker builds a breaker that starts in the closed state.
//
// Any argument that is zero or negative is replaced by its default:
// openThreshold 3, retryInterval 5s, maxDelay 5m, halfOpenWindow 30s.
// stateSince and lastTransition are both initialised to the creation time.
func newCircuitBreaker(openThreshold int, retryInterval, maxDelay, halfOpenWindow time.Duration) *circuitBreaker {
	const (
		defaultOpenThreshold  = 3
		defaultRetryInterval  = 5 * time.Second
		defaultMaxDelay       = 5 * time.Minute
		defaultHalfOpenWindow = 30 * time.Second
	)

	if openThreshold < 1 {
		openThreshold = defaultOpenThreshold
	}
	if retryInterval < 1 {
		retryInterval = defaultRetryInterval
	}
	if maxDelay < 1 {
		maxDelay = defaultMaxDelay
	}
	if halfOpenWindow < 1 {
		halfOpenWindow = defaultHalfOpenWindow
	}

	created := time.Now()
	cb := new(circuitBreaker)
	cb.state = breakerClosed
	cb.openThreshold = openThreshold
	cb.retryInterval = retryInterval
	cb.maxDelay = maxDelay
	cb.halfOpenWindow = halfOpenWindow
	cb.stateSince = created
	cb.lastTransition = created
	return cb
}
// allow reports whether an attempt may be made at time now.
//
//   - open: rejected until retryInterval has elapsed since openedAt; the first
//     attempt after that moves the breaker to half-open and is admitted.
//   - half-open: admitted only if halfOpenWindow has elapsed since the last
//     attempt; an admitted attempt becomes the new lastAttempt.
//   - closed (or any unrecognised state): always admitted.
func (b *circuitBreaker) allow(now time.Time) bool {
	b.mu.Lock()
	defer b.mu.Unlock()

	if b.state == breakerOpen {
		if now.Sub(b.openedAt) < b.retryInterval {
			return false
		}
		// Cool-down is over: let this caller through as the probe.
		b.state = breakerHalfOpen
		b.lastAttempt, b.stateSince, b.lastTransition = now, now, now
		return true
	}

	if b.state == breakerHalfOpen {
		if now.Sub(b.lastAttempt) < b.halfOpenWindow {
			return false
		}
		b.lastAttempt = now
		return true
	}

	return true
}
// recordSuccess registers a successful attempt.
//
// The failure counter is cleared on every success, including while the
// breaker is already closed, so that openThreshold is measured against
// consecutive failures. Without this, isolated failures separated by
// successful attempts would accumulate indefinitely and eventually open the
// breaker for a healthy target, and the stale count would be reported by
// stateDetails.
//
// If the breaker was open or half-open it is closed, and stateSince and
// lastTransition are stamped with the current time. A success on an already
// closed breaker is not a transition and leaves both timestamps untouched.
//
// NOTE(review): retryInterval is not restored here, so the backoff applied by
// trip carries over into the next outage — confirm whether that is intended.
func (b *circuitBreaker) recordSuccess() {
	b.mu.Lock()
	defer b.mu.Unlock()

	b.failureCount = 0

	if b.state == breakerClosed {
		return
	}

	now := time.Now()
	b.state = breakerClosed
	b.stateSince = now
	b.lastTransition = now
}
// recordFailure registers a failed attempt made at time now.
//
// The failure counter and lastAttempt are always updated. The breaker is then
// tripped when the failure happened during a half-open probe, or when a closed
// breaker has reached openThreshold failures. A failure reported while the
// breaker is already open is counted but does not re-trip it.
func (b *circuitBreaker) recordFailure(now time.Time) {
	b.mu.Lock()
	defer b.mu.Unlock()

	b.failureCount++
	b.lastAttempt = now

	probeFailed := b.state == breakerHalfOpen
	thresholdHit := b.state == breakerClosed && b.failureCount >= b.openThreshold
	if probeFailed || thresholdHit {
		b.trip(now)
	}
}
// trip moves the breaker to the open state at time now and computes the next
// retry interval as retryInterval * 2^failureCount, capped at maxDelay.
//
// It does not take the mutex itself; recordFailure calls it with b.mu held.
//
// The cap is checked before shifting. failureCount grows without bound while
// a target keeps failing, and a plain `retryInterval << failureCount` silently
// wraps int64: the result becomes negative or zero, slips under the maxDelay
// clamp, and leaves the breaker with no backoff at all. Comparing against
// maxDelay >> shift is arithmetically equivalent to comparing the shifted
// value against maxDelay, but cannot overflow.
//
// A non-positive retryInterval is left as is (nothing sensible to scale).
func (b *circuitBreaker) trip(now time.Time) {
	b.state = breakerOpen

	delay := b.retryInterval
	if delay > 0 {
		shift := uint(b.failureCount)
		// shift >= 63 always overflows a positive int64; the second test
		// catches every smaller shift whose result would exceed maxDelay.
		if shift >= 63 || delay > b.maxDelay>>shift {
			delay = b.maxDelay
		} else {
			delay <<= shift
		}
	}

	b.retryInterval = delay
	b.openedAt = now
	b.stateSince = now
	b.lastTransition = now
}
// BreakerSnapshot represents the current state of a circuit breaker in a form
// that can be serialised to JSON.
//
// The code that fills this struct is not in this part of the file; the field
// notes below are inferred from the JSON tags and from circuitBreaker.State.
type BreakerSnapshot struct {
// Instance and Type identify which breaker the snapshot belongs to.
// NOTE(review): exact contents are set by the caller — confirm there.
Instance string `json:"instance"`
Type string `json:"type"`
// State is presumably one of the labels produced by stateDetails
// ("closed", "open", "half_open", "unknown") — confirm against the caller.
State string `json:"state"`
// Failures is presumably the breaker's failure count — confirm.
Failures int `json:"failures"`
// NOTE(review): encoding/json's omitempty never omits struct values, so a
// zero RetryAt is still emitted as "0001-01-01T00:00:00Z". Dropping it
// would need *time.Time or the omitzero option (Go 1.24+).
RetryAt time.Time `json:"retryAt,omitempty"`
}
// State reports the breaker's state label, its failure count and the time at
// which the next attempt will be admitted (zero when the breaker is closed).
// It is a reduced view of stateDetails for API exposure.
func (b *circuitBreaker) State() (state string, failures int, retryAt time.Time) {
	label, count, next, _, _ := b.stateDetails()
	return label, count, next
}
// stateDetails returns a consistent view of the breaker taken under the lock:
//
//   - state: "closed", "open", "half_open", or "unknown" for any other value
//   - failures: the current failure count
//   - retryAt: openedAt+retryInterval when open, lastAttempt+halfOpenWindow
//     when half-open, and the zero time otherwise
//   - since, lastTransition: the stored stateSince and lastTransition stamps
func (b *circuitBreaker) stateDetails() (state string, failures int, retryAt time.Time, since time.Time, lastTransition time.Time) {
	b.mu.Lock()
	defer b.mu.Unlock()

	label := "unknown"
	var nextRetry time.Time

	switch b.state {
	case breakerClosed:
		label = "closed"
	case breakerOpen:
		label, nextRetry = "open", b.openedAt.Add(b.retryInterval)
	case breakerHalfOpen:
		label, nextRetry = "half_open", b.lastAttempt.Add(b.halfOpenWindow)
	}

	return label, b.failureCount, nextRetry, b.stateSince, b.lastTransition
}