mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-04-28 19:41:17 +00:00
Add comprehensive instance-level diagnostics to /api/monitoring/scheduler/health

**New Response Structure:**

Enhanced "instances" array with per-instance details:
- Instance metadata: displayName, type, connection URL
- Poll status: last success/error timestamps, error messages, error category
- Circuit breaker: state, timestamps, failure counts, retry windows
- Dead letter: present flag, reason, attempt history, retry schedule

**Implementation:**

Data structures:
- instanceInfo: cache of display names, URLs, types
- pollStatus: tracks successes/errors with timestamps and categories
- dlqInsight: DLQ entry metadata (reason, attempts, schedule)
- circuitBreaker: enhanced with stateSince, lastTransition

Tracking logic:
- buildInstanceInfoCache: populate metadata from config on startup
- recordTaskResult: track poll outcomes, error details, categories
- sendToDeadLetter: capture DLQ insights (reason, timestamps)
- circuitBreaker: record state transitions with timestamps

**Backward Compatible:**
- Existing fields (deadLetter, breakers, staleness) unchanged
- New "instances" array is additive
- Old clients can ignore new fields

**Testing:**
- Unit test: TestSchedulerHealth_EnhancedResponse validates all fields
- Integration tests: still passing (55s)
- All error tracking and breaker history verified

**Operator Benefits:**
- Diagnose issues without log digging
- See error messages directly in API
- Understand breaker states and retry schedules
- Track DLQ entries with full context
- Single API call for complete instance health view

Example: Quickly identify "401 unauthorized" on a specific PBS instance, see that it is in the DLQ after 5 retries, and know when the next retry is scheduled.

Part of Phase 2 follow-up work to improve observability.
155 lines
3.3 KiB
Go
155 lines
3.3 KiB
Go
package monitoring
|
|
|
|
import (
|
|
"sync"
|
|
"time"
|
|
)
|
|
|
|
// breakerState identifies which position a circuitBreaker is currently in.
type breakerState int

const (
	// breakerClosed is the zero value; allow lets every attempt through.
	breakerClosed breakerState = iota
	// breakerOpen rejects attempts until the retry interval has elapsed
	// since the breaker was opened.
	breakerOpen
	// breakerHalfOpen admits spaced-out probe attempts; a success closes
	// the breaker and a failure re-opens it.
	breakerHalfOpen
)
|
|
|
|
type circuitBreaker struct {
|
|
mu sync.Mutex
|
|
state breakerState
|
|
failureCount int
|
|
openedAt time.Time
|
|
lastAttempt time.Time
|
|
retryInterval time.Duration
|
|
maxDelay time.Duration
|
|
openThreshold int
|
|
halfOpenWindow time.Duration
|
|
stateSince time.Time
|
|
lastTransition time.Time
|
|
}
|
|
|
|
func newCircuitBreaker(openThreshold int, retryInterval, maxDelay, halfOpenWindow time.Duration) *circuitBreaker {
|
|
if openThreshold <= 0 {
|
|
openThreshold = 3
|
|
}
|
|
if retryInterval <= 0 {
|
|
retryInterval = 5 * time.Second
|
|
}
|
|
if maxDelay <= 0 {
|
|
maxDelay = 5 * time.Minute
|
|
}
|
|
if halfOpenWindow <= 0 {
|
|
halfOpenWindow = 30 * time.Second
|
|
}
|
|
now := time.Now()
|
|
return &circuitBreaker{
|
|
state: breakerClosed,
|
|
retryInterval: retryInterval,
|
|
maxDelay: maxDelay,
|
|
openThreshold: openThreshold,
|
|
halfOpenWindow: halfOpenWindow,
|
|
stateSince: now,
|
|
lastTransition: now,
|
|
}
|
|
}
|
|
|
|
func (b *circuitBreaker) allow(now time.Time) bool {
|
|
b.mu.Lock()
|
|
defer b.mu.Unlock()
|
|
|
|
switch b.state {
|
|
case breakerClosed:
|
|
return true
|
|
case breakerOpen:
|
|
if now.Sub(b.openedAt) >= b.retryInterval {
|
|
b.state = breakerHalfOpen
|
|
b.lastAttempt = now
|
|
b.stateSince = now
|
|
b.lastTransition = now
|
|
return true
|
|
}
|
|
return false
|
|
case breakerHalfOpen:
|
|
if now.Sub(b.lastAttempt) >= b.halfOpenWindow {
|
|
b.lastAttempt = now
|
|
return true
|
|
}
|
|
return false
|
|
default:
|
|
return true
|
|
}
|
|
}
|
|
|
|
func (b *circuitBreaker) recordSuccess() {
|
|
b.mu.Lock()
|
|
defer b.mu.Unlock()
|
|
if b.state != breakerClosed {
|
|
now := time.Now()
|
|
b.state = breakerClosed
|
|
b.failureCount = 0
|
|
b.stateSince = now
|
|
b.lastTransition = now
|
|
}
|
|
}
|
|
|
|
func (b *circuitBreaker) recordFailure(now time.Time) {
|
|
b.mu.Lock()
|
|
defer b.mu.Unlock()
|
|
b.failureCount++
|
|
b.lastAttempt = now
|
|
|
|
switch b.state {
|
|
case breakerHalfOpen:
|
|
b.trip(now)
|
|
case breakerClosed:
|
|
if b.failureCount >= b.openThreshold {
|
|
b.trip(now)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (b *circuitBreaker) trip(now time.Time) {
|
|
b.state = breakerOpen
|
|
delay := b.retryInterval << uint(b.failureCount)
|
|
if delay > b.maxDelay {
|
|
delay = b.maxDelay
|
|
}
|
|
b.retryInterval = delay
|
|
b.openedAt = now
|
|
b.stateSince = now
|
|
b.lastTransition = now
|
|
}
|
|
|
|
// BreakerSnapshot is the JSON-serialisable view of one circuit breaker.
type BreakerSnapshot struct {
	// Instance and Type label the breaker.
	// NOTE(review): presumably the monitored instance name and its kind —
	// confirm against the code that fills this struct.
	Instance string `json:"instance"`
	Type     string `json:"type"`

	// State is the textual breaker state.
	State string `json:"state"`

	// Failures is the breaker's recorded failure count.
	Failures int `json:"failures"`

	// RetryAt is when the next attempt is expected to be allowed.
	// encoding/json never treats a struct value such as time.Time as empty,
	// so omitempty has no effect here and a zero RetryAt is still emitted.
	RetryAt time.Time `json:"retryAt,omitempty"`
}
|
|
|
|
// State returns a snapshot of the circuit breaker state for API exposure.
|
|
func (b *circuitBreaker) State() (state string, failures int, retryAt time.Time) {
|
|
state, failures, retryAt, _, _ = b.stateDetails()
|
|
return
|
|
}
|
|
|
|
func (b *circuitBreaker) stateDetails() (state string, failures int, retryAt time.Time, since time.Time, lastTransition time.Time) {
|
|
b.mu.Lock()
|
|
defer b.mu.Unlock()
|
|
|
|
switch b.state {
|
|
case breakerClosed:
|
|
state = "closed"
|
|
case breakerOpen:
|
|
state = "open"
|
|
retryAt = b.openedAt.Add(b.retryInterval)
|
|
case breakerHalfOpen:
|
|
state = "half_open"
|
|
retryAt = b.lastAttempt.Add(b.halfOpenWindow)
|
|
default:
|
|
state = "unknown"
|
|
}
|
|
|
|
return state, b.failureCount, retryAt, b.stateSince, b.lastTransition
|
|
}
|