Pulse/internal/monitoring/metrics.go
rcourtman ad998a1e2f style: fix staticcheck style warnings
- Merge variable declaration with assignment (S1021)
- Use unconditional strings.TrimPrefix (S1017)
- Remove unnecessary nil checks around range (S1031)
- Remove unnecessary fmt.Sprintf (S1039)
- Use copy() instead of manual loop (S1001)
- Use time.Until instead of t.Sub(time.Now()) (S1024)
- Use buf.String() instead of string(buf.Bytes()) (S1030)
2025-11-27 09:19:33 +00:00

628 lines
17 KiB
Go

package monitoring
import (
stdErrors "errors"
"strings"
"sync"
"time"
"github.com/prometheus/client_golang/prometheus"
internalerrors "github.com/rcourtman/pulse-go-rewrite/internal/errors"
)
// PollMetrics bundles the Prometheus collectors that describe polling and
// scheduler activity together with the bookkeeping needed to derive staleness
// values and to zero label sets that vanish between updates.
type PollMetrics struct {
	// Per-instance poll collectors.
	pollDuration            *prometheus.HistogramVec
	pollResults, pollErrors *prometheus.CounterVec
	lastSuccess, staleness  *prometheus.GaugeVec
	queueDepth              prometheus.Gauge
	inflight                *prometheus.GaugeVec

	// Per-node poll collectors.
	nodePollDuration                *prometheus.HistogramVec
	nodePollResults, nodePollErrors *prometheus.CounterVec
	nodeLastSuccess, nodeStaleness  *prometheus.GaugeVec

	// Scheduler queue, dead-letter and circuit-breaker collectors.
	schedulerQueueReady       prometheus.Gauge
	schedulerQueueDepthByType *prometheus.GaugeVec
	schedulerQueueWait        *prometheus.HistogramVec

	schedulerDeadLetterDepth, schedulerBreakerState,
	schedulerBreakerFailureCount, schedulerBreakerRetrySeconds *prometheus.GaugeVec

	// mu guards every field declared below it.
	mu sync.RWMutex
	// Time of the most recent successful poll, per instance and per node.
	lastSuccessByKey     map[metricKey]time.Time
	nodeLastSuccessByKey map[nodeMetricKey]time.Time
	// Label keys reported by the previous UpdateQueueSnapshot and
	// UpdateDeadLetterCounts calls, kept so absent keys can be reset to 0.
	lastQueueTypeKeys, lastDLQKeys map[string]struct{}
	// pending is the count behind the queueDepth gauge: set by
	// ResetQueueDepth and decremented once per RecordResult.
	pending int
}
// metricKey identifies one polled instance in the last-success bookkeeping.
// Both fields hold label values; the type is comparable so it can key a map.
type metricKey struct {
	instanceType, instance string
}
// nodeMetricKey identifies one node of a polled instance in the per-node
// last-success bookkeeping. It is comparable so it can key a map.
type nodeMetricKey struct {
	instanceType, instance, node string
}
// pollMetricsInstance is the shared PollMetrics handed out by getPollMetrics.
var pollMetricsInstance *PollMetrics

// pollMetricsOnce makes sure pollMetricsInstance is built a single time.
var pollMetricsOnce sync.Once
// getPollMetrics returns the shared PollMetrics, constructing it through
// newPollMetrics on the first call only.
func getPollMetrics() *PollMetrics {
	build := func() {
		pollMetricsInstance = newPollMetrics()
	}
	pollMetricsOnce.Do(build)
	return pollMetricsInstance
}
// newPollMetrics builds every collector held by PollMetrics, registers them
// through prometheus.MustRegister and returns the populated struct with its
// bookkeeping maps initialised.
func newPollMetrics() *PollMetrics {
	const (
		namespace = "pulse"
		monitor   = "monitor"
		scheduler = "scheduler"
	)

	// pollBuckets hands out a fresh slice on each call so the two poll
	// duration histograms never share backing storage.
	pollBuckets := func() []float64 {
		return []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10, 15, 20, 30}
	}

	// Small constructors that fill in the shared namespace.
	gauge := func(subsystem, name, help string) prometheus.Gauge {
		return prometheus.NewGauge(prometheus.GaugeOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      name,
			Help:      help,
		})
	}
	gaugeVec := func(subsystem, name, help string, labels ...string) *prometheus.GaugeVec {
		return prometheus.NewGaugeVec(prometheus.GaugeOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      name,
			Help:      help,
		}, labels)
	}
	counterVec := func(subsystem, name, help string, labels ...string) *prometheus.CounterVec {
		return prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      name,
			Help:      help,
		}, labels)
	}
	histogramVec := func(subsystem, name, help string, buckets []float64, labels ...string) *prometheus.HistogramVec {
		return prometheus.NewHistogramVec(prometheus.HistogramOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      name,
			Help:      help,
			Buckets:   buckets,
		}, labels)
	}

	pm := &PollMetrics{
		// Per-instance poll metrics.
		pollDuration: histogramVec(monitor, "poll_duration_seconds",
			"Duration of polling operations per instance.",
			pollBuckets(), "instance_type", "instance"),
		pollResults: counterVec(monitor, "poll_total",
			"Total polling attempts partitioned by result.",
			"instance_type", "instance", "result"),
		pollErrors: counterVec(monitor, "poll_errors_total",
			"Polling failures grouped by error type.",
			"instance_type", "instance", "error_type"),
		lastSuccess: gaugeVec(monitor, "poll_last_success_timestamp",
			"Unix timestamp of the last successful poll.",
			"instance_type", "instance"),
		staleness: gaugeVec(monitor, "poll_staleness_seconds",
			"Seconds since the last successful poll. -1 indicates no successes yet.",
			"instance_type", "instance"),
		queueDepth: gauge(monitor, "poll_queue_depth",
			"Approximate number of poll tasks waiting to complete in the current cycle."),
		inflight: gaugeVec(monitor, "poll_inflight",
			"Current number of poll operations executing per instance type.",
			"instance_type"),

		// Per-node poll metrics.
		nodePollDuration: histogramVec(monitor, "node_poll_duration_seconds",
			"Duration of polling operations per node.",
			pollBuckets(), "instance_type", "instance", "node"),
		nodePollResults: counterVec(monitor, "node_poll_total",
			"Total polling attempts per node partitioned by result.",
			"instance_type", "instance", "node", "result"),
		nodePollErrors: counterVec(monitor, "node_poll_errors_total",
			"Polling failures per node grouped by error type.",
			"instance_type", "instance", "node", "error_type"),
		nodeLastSuccess: gaugeVec(monitor, "node_poll_last_success_timestamp",
			"Unix timestamp of the last successful poll for a node.",
			"instance_type", "instance", "node"),
		nodeStaleness: gaugeVec(monitor, "node_poll_staleness_seconds",
			"Seconds since the last successful poll for a node. -1 indicates no successes yet.",
			"instance_type", "instance", "node"),

		// Scheduler metrics.
		schedulerQueueReady: gauge(scheduler, "queue_due_soon",
			"Number of tasks due to run within the immediate window (12s)."),
		schedulerQueueDepthByType: gaugeVec(scheduler, "queue_depth",
			"Current scheduler queue depth partitioned by instance type.",
			"instance_type"),
		schedulerQueueWait: histogramVec(scheduler, "queue_wait_seconds",
			"Observed wait time between task readiness and execution.",
			[]float64{0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60},
			"instance_type"),
		schedulerDeadLetterDepth: gaugeVec(scheduler, "dead_letter_depth",
			"Number of tasks currently parked in the dead-letter queue per instance.",
			"instance_type", "instance"),
		schedulerBreakerState: gaugeVec(scheduler, "breaker_state",
			"Circuit breaker state encoded as 0=closed, 1=half-open, 2=open, -1=unknown.",
			"instance_type", "instance"),
		schedulerBreakerFailureCount: gaugeVec(scheduler, "breaker_failure_count",
			"Current consecutive failure count tracked by the circuit breaker.",
			"instance_type", "instance"),
		schedulerBreakerRetrySeconds: gaugeVec(scheduler, "breaker_retry_seconds",
			"Seconds until the circuit breaker will allow another attempt.",
			"instance_type", "instance"),

		// Bookkeeping state.
		lastSuccessByKey:     make(map[metricKey]time.Time),
		nodeLastSuccessByKey: make(map[nodeMetricKey]time.Time),
		lastQueueTypeKeys:    make(map[string]struct{}),
		lastDLQKeys:          make(map[string]struct{}),
	}

	prometheus.MustRegister(
		pm.pollDuration,
		pm.pollResults,
		pm.pollErrors,
		pm.lastSuccess,
		pm.staleness,
		pm.queueDepth,
		pm.inflight,
		pm.nodePollDuration,
		pm.nodePollResults,
		pm.nodePollErrors,
		pm.nodeLastSuccess,
		pm.nodeStaleness,
		pm.schedulerQueueReady,
		pm.schedulerQueueDepthByType,
		pm.schedulerQueueWait,
		pm.schedulerDeadLetterDepth,
		pm.schedulerBreakerState,
		pm.schedulerBreakerFailureCount,
		pm.schedulerBreakerRetrySeconds,
	)

	return pm
}
// NodePollResult describes the outcome of polling a single node during a poll
// cycle: which node it was, whether the poll succeeded, the error if it did
// not, and when the poll started and finished.
type NodePollResult struct {
	InstanceName, InstanceType, NodeName string

	Success bool
	Error   error

	StartTime, EndTime time.Time
}
// RecordNodeResult records the duration, outcome and staleness metrics for one
// node poll. A nil receiver is a no-op.
//
// On success the node's last-success timestamp is refreshed and its staleness
// is set to 0. On failure the error counter is bumped and staleness becomes the
// time elapsed since the last recorded success, or -1 when none is recorded.
func (pm *PollMetrics) RecordNodeResult(result NodePollResult) {
	if pm == nil {
		return
	}

	instType, inst := sanitizeInstanceLabels(result.InstanceType, result.InstanceName)
	nodeLabel := normalizeNodeLabel(result.NodeName)

	// Clamp negative durations (end before start) to zero.
	elapsed := result.EndTime.Sub(result.StartTime).Seconds()
	if elapsed < 0 {
		elapsed = 0
	}
	pm.nodePollDuration.WithLabelValues(instType, inst, nodeLabel).Observe(elapsed)

	outcome := "error"
	if result.Success {
		outcome = "success"
	}
	pm.nodePollResults.WithLabelValues(instType, inst, nodeLabel, outcome).Inc()

	if !result.Success {
		pm.nodePollErrors.WithLabelValues(instType, inst, nodeLabel, pm.classifyError(result.Error)).Inc()

		// -1 marks "no successful poll recorded yet".
		stale := -1.0
		if last, found := pm.lastNodeSuccessFor(instType, inst, nodeLabel); found && !last.IsZero() {
			stale = result.EndTime.Sub(last).Seconds()
			if stale < 0 {
				stale = 0
			}
		}
		pm.updateNodeStaleness(instType, inst, nodeLabel, stale)
		return
	}

	pm.nodeLastSuccess.WithLabelValues(instType, inst, nodeLabel).Set(float64(result.EndTime.Unix()))
	pm.storeNodeLastSuccess(instType, inst, nodeLabel, result.EndTime)
	pm.updateNodeStaleness(instType, inst, nodeLabel, 0)
}
// RecordQueueWait adds one observation to the scheduler queue-wait histogram
// for the given instance type. Negative waits are recorded as zero and a nil
// receiver is a no-op.
func (pm *PollMetrics) RecordQueueWait(instanceType string, wait time.Duration) {
	if pm == nil {
		return
	}

	seconds := 0.0
	if wait > 0 {
		seconds = wait.Seconds()
	}
	pm.schedulerQueueWait.WithLabelValues(normalizeLabel(instanceType)).Observe(seconds)
}
// UpdateQueueSnapshot publishes the scheduler queue state from snapshot: the
// due-soon gauge and one depth gauge per instance type. Instance types that
// were present in the previous snapshot but are missing from this one are
// reset to 0. A nil receiver is a no-op.
func (pm *PollMetrics) UpdateQueueSnapshot(snapshot QueueSnapshot) {
	if pm == nil {
		return
	}

	pm.schedulerQueueReady.Set(float64(snapshot.DueWithinSeconds))

	seen := make(map[string]struct{}, len(snapshot.PerType))
	for rawType, depth := range snapshot.PerType {
		label := normalizeLabel(rawType)
		pm.schedulerQueueDepthByType.WithLabelValues(label).Set(float64(depth))
		seen[label] = struct{}{}
	}

	pm.mu.Lock()
	defer pm.mu.Unlock()

	// Zero out label values that dropped out since the last snapshot.
	for label := range pm.lastQueueTypeKeys {
		if _, stillPresent := seen[label]; stillPresent {
			continue
		}
		pm.schedulerQueueDepthByType.WithLabelValues(label).Set(0)
	}
	pm.lastQueueTypeKeys = seen
}
// UpdateDeadLetterCounts refreshes dead-letter queue gauges based on the provided tasks.
//
// Tasks are counted per (instance type, instance) pair after label
// normalization. Each pair present in tasks has its gauge set to that count;
// each pair reported by the previous call but absent now is reset to 0 so a
// drained queue stops exporting its last depth. A nil receiver is a no-op.
//
// The comparison against the previous key set and its replacement happen in a
// single critical section. Releasing the lock in between would let a concurrent
// caller observe an empty previous set and skip zeroing stale gauges.
func (pm *PollMetrics) UpdateDeadLetterCounts(tasks []DeadLetterTask) {
	if pm == nil {
		return
	}

	// Tally tasks per composite "type::instance" key before taking the lock.
	counts := make(map[string]float64)
	for _, task := range tasks {
		key := normalizeLabel(task.Type) + "::" + normalizeLabel(task.Instance)
		counts[key]++
	}
	seen := make(map[string]struct{}, len(counts))

	pm.mu.Lock()
	defer pm.mu.Unlock()

	for key, count := range counts {
		instType, inst := splitInstanceKey(key)
		pm.schedulerDeadLetterDepth.WithLabelValues(instType, inst).Set(count)
		seen[key] = struct{}{}
	}

	// Reset gauges for keys that were reported last time but not now.
	for key := range pm.lastDLQKeys {
		if _, ok := counts[key]; ok {
			continue
		}
		instType, inst := splitInstanceKey(key)
		pm.schedulerDeadLetterDepth.WithLabelValues(instType, inst).Set(0)
	}

	pm.lastDLQKeys = seen
}
// SetBreakerState publishes the circuit breaker gauges for one instance: the
// encoded state, the failure count and the seconds remaining until retryAt.
// The retry gauge is 0 when retryAt is the zero time or already in the past.
// A nil receiver is a no-op.
func (pm *PollMetrics) SetBreakerState(instanceType, instance, state string, failures int, retryAt time.Time) {
	if pm == nil {
		return
	}

	instType, inst := sanitizeInstanceLabels(instanceType, instance)

	pm.schedulerBreakerState.WithLabelValues(instType, inst).Set(breakerStateToValue(state))
	pm.schedulerBreakerFailureCount.WithLabelValues(instType, inst).Set(float64(failures))

	var retrySeconds float64
	if !retryAt.IsZero() {
		if remaining := time.Until(retryAt).Seconds(); remaining > 0 {
			retrySeconds = remaining
		}
	}
	pm.schedulerBreakerRetrySeconds.WithLabelValues(instType, inst).Set(retrySeconds)
}
// RecordResult records the duration, outcome and staleness metrics for one
// instance poll and then decrements the pending-task count. A nil receiver is
// a no-op.
//
// On success the last-success timestamp is refreshed and staleness is set to
// 0. On failure the error counter is bumped and staleness becomes the time
// elapsed since the last recorded success, or -1 when none is recorded.
func (pm *PollMetrics) RecordResult(result PollResult) {
	if pm == nil {
		return
	}

	instType, inst := sanitizeInstanceLabels(result.InstanceType, result.InstanceName)

	// Clamp negative durations (end before start) to zero.
	elapsed := result.EndTime.Sub(result.StartTime).Seconds()
	if elapsed < 0 {
		elapsed = 0
	}
	pm.pollDuration.WithLabelValues(instType, inst).Observe(elapsed)

	outcome := "error"
	if result.Success {
		outcome = "success"
	}
	pm.pollResults.WithLabelValues(instType, inst, outcome).Inc()

	stale := 0.0
	if result.Success {
		pm.lastSuccess.WithLabelValues(instType, inst).Set(float64(result.EndTime.Unix()))
		pm.storeLastSuccess(instType, inst, result.EndTime)
	} else {
		pm.pollErrors.WithLabelValues(instType, inst, pm.classifyError(result.Error)).Inc()

		// -1 marks "no successful poll recorded yet".
		stale = -1
		if last, found := pm.lastSuccessFor(instType, inst); found && !last.IsZero() {
			stale = result.EndTime.Sub(last).Seconds()
			if stale < 0 {
				stale = 0
			}
		}
	}
	pm.updateStaleness(instType, inst, stale)

	pm.decrementPending()
}
// ResetQueueDepth stores total as the pending task count for the upcoming poll
// cycle and mirrors it to the queue depth gauge. Negative totals are treated
// as 0 and a nil receiver is a no-op.
func (pm *PollMetrics) ResetQueueDepth(total int) {
	if pm == nil {
		return
	}

	pending := total
	if pending < 0 {
		pending = 0
	}

	pm.mu.Lock()
	pm.pending = pending
	pm.mu.Unlock()

	pm.queueDepth.Set(float64(pending))
}
// SetQueueDepth writes depth straight to the queue depth gauge without
// touching the internal pending counter. Negative depths are written as 0 and
// a nil receiver is a no-op.
func (pm *PollMetrics) SetQueueDepth(depth int) {
	if pm == nil {
		return
	}

	value := 0.0
	if depth > 0 {
		value = float64(depth)
	}
	pm.queueDepth.Set(value)
}
// IncInFlight increments the in-flight gauge for the given instance type.
//
// The instance type is passed through normalizeLabel, as RecordQueueWait and
// UpdateQueueSnapshot do for the same instance_type label, so blank or padded
// values land on the same series name used by the other metrics. A nil
// receiver is a no-op.
func (pm *PollMetrics) IncInFlight(instanceType string) {
	if pm == nil {
		return
	}
	pm.inflight.WithLabelValues(normalizeLabel(instanceType)).Inc()
}

// DecInFlight decrements the in-flight gauge for the given instance type.
//
// It applies the same normalizeLabel step as IncInFlight so that a matching
// increment and decrement always hit the same series. A nil receiver is a
// no-op.
func (pm *PollMetrics) DecInFlight(instanceType string) {
	if pm == nil {
		return
	}
	pm.inflight.WithLabelValues(normalizeLabel(instanceType)).Dec()
}
// decrementPending lowers the pending task count by one, never below zero,
// and publishes the resulting value to the queue depth gauge. A nil receiver
// is a no-op.
func (pm *PollMetrics) decrementPending() {
	if pm == nil {
		return
	}

	// The gauge is written after the lock is released.
	remaining := func() int {
		pm.mu.Lock()
		defer pm.mu.Unlock()
		if pm.pending > 0 {
			pm.pending--
		}
		return pm.pending
	}()

	pm.queueDepth.Set(float64(remaining))
}
// storeLastSuccess remembers ts as the latest successful poll time for the
// given instance, under the write lock.
func (pm *PollMetrics) storeLastSuccess(instanceType, instance string, ts time.Time) {
	key := makeMetricKey(instanceType, instance)

	pm.mu.Lock()
	defer pm.mu.Unlock()
	pm.lastSuccessByKey[key] = ts
}
// lastSuccessFor looks up the stored last-success time for the given instance
// under the read lock. The boolean reports whether an entry exists.
func (pm *PollMetrics) lastSuccessFor(instanceType, instance string) (time.Time, bool) {
	key := makeMetricKey(instanceType, instance)

	pm.mu.RLock()
	defer pm.mu.RUnlock()
	recorded, found := pm.lastSuccessByKey[key]
	return recorded, found
}
// updateStaleness sets the per-instance staleness gauge to value, normalizing
// both label values first.
func (pm *PollMetrics) updateStaleness(instanceType, instance string, value float64) {
	typeLabel := normalizeLabel(instanceType)
	instanceLabel := normalizeLabel(instance)
	pm.staleness.WithLabelValues(typeLabel, instanceLabel).Set(value)
}
// storeNodeLastSuccess remembers ts as the latest successful poll time for the
// given node, under the write lock.
func (pm *PollMetrics) storeNodeLastSuccess(instanceType, instance, node string, ts time.Time) {
	key := makeNodeMetricKey(instanceType, instance, node)

	pm.mu.Lock()
	defer pm.mu.Unlock()
	pm.nodeLastSuccessByKey[key] = ts
}
// lastNodeSuccessFor looks up the stored last-success time for the given node
// under the read lock. The boolean reports whether an entry exists.
func (pm *PollMetrics) lastNodeSuccessFor(instanceType, instance, node string) (time.Time, bool) {
	key := makeNodeMetricKey(instanceType, instance, node)

	pm.mu.RLock()
	defer pm.mu.RUnlock()
	recorded, found := pm.nodeLastSuccessByKey[key]
	return recorded, found
}
// updateNodeStaleness sets the per-node staleness gauge to value, normalizing
// all three label values first.
func (pm *PollMetrics) updateNodeStaleness(instanceType, instance, node string, value float64) {
	typeLabel := normalizeLabel(instanceType)
	instanceLabel := normalizeLabel(instance)
	pm.nodeStaleness.WithLabelValues(typeLabel, instanceLabel, normalizeLabel(node)).Set(value)
}
// splitInstanceKey breaks a composite "type::instance" key at its first "::"
// and returns both halves as normalized labels. A key without the separator is
// treated as an instance with type "unknown".
func splitInstanceKey(key string) (string, string) {
	const separator = "::"

	if at := strings.Index(key, separator); at >= 0 {
		return normalizeLabel(key[:at]), normalizeLabel(key[at+len(separator):])
	}

	// normalizeLabel turns an empty key into "unknown", so the empty-key case
	// needs no separate branch.
	return "unknown", normalizeLabel(key)
}
// breakerStateToValue encodes a circuit breaker state name as a gauge value:
// closed=0, half_open/half-open=1, open=2, anything else=-1. Matching ignores
// letter case but not surrounding whitespace.
func breakerStateToValue(state string) float64 {
	lowered := strings.ToLower(state)

	if lowered == "closed" {
		return 0
	}
	if lowered == "half_open" || lowered == "half-open" {
		return 1
	}
	if lowered == "open" {
		return 2
	}
	return -1
}
// sanitizeInstanceLabels normalizes an instance type and instance name for
// use as label values and returns them in that order.
func sanitizeInstanceLabels(instanceType, instance string) (string, string) {
	typeLabel := normalizeLabel(instanceType)
	instanceLabel := normalizeLabel(instance)
	return typeLabel, instanceLabel
}
// makeMetricKey builds the bookkeeping key for an instance from its
// normalized type and name.
func makeMetricKey(instanceType, instance string) metricKey {
	var key metricKey
	key.instanceType, key.instance = sanitizeInstanceLabels(instanceType, instance)
	return key
}
// makeNodeMetricKey builds the bookkeeping key for a node from its normalized
// instance type, instance name and node name.
func makeNodeMetricKey(instanceType, instance, node string) nodeMetricKey {
	var key nodeMetricKey
	key.instanceType, key.instance = sanitizeInstanceLabels(instanceType, instance)
	key.node = normalizeLabel(node)
	return key
}
// normalizeLabel trims surrounding whitespace from value and substitutes
// "unknown" when nothing is left.
func normalizeLabel(value string) string {
	if trimmed := strings.TrimSpace(value); trimmed != "" {
		return trimmed
	}
	return "unknown"
}
// normalizeNodeLabel normalizes a node name like normalizeLabel does, except
// that a result of "unknown" is reported as "unknown-node".
func normalizeNodeLabel(value string) string {
	if label := normalizeLabel(value); label != "unknown" {
		return label
	}
	return "unknown-node"
}
// classifyError maps err to the value used for the error_type label: "none"
// for a nil error, the MonitorError's Type when one is found in err's chain,
// and "unknown" otherwise.
func (pm *PollMetrics) classifyError(err error) string {
	var monitorErr *internalerrors.MonitorError

	switch {
	case err == nil:
		return "none"
	case stdErrors.As(err, &monitorErr):
		return string(monitorErr.Type)
	default:
		return "unknown"
	}
}