mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-04-29 12:00:13 +00:00
1657 lines
42 KiB
Go
1657 lines
42 KiB
Go
package monitoring
|
|
|
|
import (
|
|
"errors"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
dto "github.com/prometheus/client_model/go"
|
|
internalerrors "github.com/rcourtman/pulse-go-rewrite/internal/errors"
|
|
)
|
|
|
|
// newTestPollMetrics creates a PollMetrics instance with an isolated registry for testing.
|
|
func newTestPollMetrics(t *testing.T) *PollMetrics {
|
|
t.Helper()
|
|
|
|
reg := prometheus.NewRegistry()
|
|
|
|
pm := &PollMetrics{
|
|
schedulerQueueReady: prometheus.NewGauge(
|
|
prometheus.GaugeOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "scheduler",
|
|
Name: "queue_due_soon",
|
|
Help: "Number of tasks due to run within the immediate window.",
|
|
},
|
|
),
|
|
schedulerQueueDepthByType: prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "scheduler",
|
|
Name: "queue_depth",
|
|
Help: "Current scheduler queue depth partitioned by instance type.",
|
|
},
|
|
[]string{"instance_type"},
|
|
),
|
|
schedulerDeadLetterDepth: prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "scheduler",
|
|
Name: "dead_letter_depth",
|
|
Help: "Number of tasks currently parked in the dead-letter queue per instance.",
|
|
},
|
|
[]string{"instance_type", "instance"},
|
|
),
|
|
schedulerBreakerState: prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "scheduler",
|
|
Name: "breaker_state",
|
|
Help: "Circuit breaker state encoded as 0=closed, 1=half-open, 2=open, -1=unknown.",
|
|
},
|
|
[]string{"instance_type", "instance"},
|
|
),
|
|
schedulerBreakerFailureCount: prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "scheduler",
|
|
Name: "breaker_failure_count",
|
|
Help: "Current consecutive failure count tracked by the circuit breaker.",
|
|
},
|
|
[]string{"instance_type", "instance"},
|
|
),
|
|
schedulerBreakerRetrySeconds: prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "scheduler",
|
|
Name: "breaker_retry_seconds",
|
|
Help: "Seconds until the circuit breaker will allow another attempt.",
|
|
},
|
|
[]string{"instance_type", "instance"},
|
|
),
|
|
lastQueueTypeKeys: make(map[string]struct{}),
|
|
lastDLQKeys: make(map[string]struct{}),
|
|
}
|
|
|
|
reg.MustRegister(
|
|
pm.schedulerQueueReady,
|
|
pm.schedulerQueueDepthByType,
|
|
pm.schedulerDeadLetterDepth,
|
|
pm.schedulerBreakerState,
|
|
pm.schedulerBreakerFailureCount,
|
|
pm.schedulerBreakerRetrySeconds,
|
|
)
|
|
|
|
return pm
|
|
}
|
|
|
|
// getGaugeValue returns the value of a prometheus gauge.
|
|
func getGaugeValue(g prometheus.Gauge) float64 {
|
|
m := &dto.Metric{}
|
|
if err := g.Write(m); err != nil {
|
|
return 0
|
|
}
|
|
return m.GetGauge().GetValue()
|
|
}
|
|
|
|
// getGaugeVecValue returns the value for specific labels from a GaugeVec.
|
|
func getGaugeVecValue(gv *prometheus.GaugeVec, labels ...string) float64 {
|
|
m := &dto.Metric{}
|
|
gauge, err := gv.GetMetricWithLabelValues(labels...)
|
|
if err != nil {
|
|
return 0
|
|
}
|
|
if err := gauge.Write(m); err != nil {
|
|
return 0
|
|
}
|
|
return m.GetGauge().GetValue()
|
|
}
|
|
|
|
func TestUpdateQueueSnapshot_NilPollMetrics(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
var pm *PollMetrics
|
|
// Should not panic
|
|
pm.UpdateQueueSnapshot(QueueSnapshot{
|
|
DueWithinSeconds: 5,
|
|
PerType: map[string]int{"pve": 10},
|
|
})
|
|
}
|
|
|
|
func TestUpdateQueueSnapshot_SetsDueWithinSeconds(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newTestPollMetrics(t)
|
|
|
|
pm.UpdateQueueSnapshot(QueueSnapshot{
|
|
DueWithinSeconds: 42,
|
|
PerType: map[string]int{},
|
|
})
|
|
|
|
got := getGaugeValue(pm.schedulerQueueReady)
|
|
if got != 42 {
|
|
t.Fatalf("schedulerQueueReady = %v, want 42", got)
|
|
}
|
|
}
|
|
|
|
func TestUpdateQueueSnapshot_UpdatesPerTypeQueueDepth(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newTestPollMetrics(t)
|
|
|
|
pm.UpdateQueueSnapshot(QueueSnapshot{
|
|
DueWithinSeconds: 0,
|
|
PerType: map[string]int{
|
|
"pve": 10,
|
|
"pbs": 5,
|
|
"pmg": 3,
|
|
},
|
|
})
|
|
|
|
cases := []struct {
|
|
instanceType string
|
|
want float64
|
|
}{
|
|
{"pve", 10},
|
|
{"pbs", 5},
|
|
{"pmg", 3},
|
|
}
|
|
|
|
for _, tc := range cases {
|
|
got := getGaugeVecValue(pm.schedulerQueueDepthByType, tc.instanceType)
|
|
if got != tc.want {
|
|
t.Errorf("queue_depth{instance_type=%q} = %v, want %v", tc.instanceType, got, tc.want)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestUpdateQueueSnapshot_ClearsStaleTypeKeys(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newTestPollMetrics(t)
|
|
|
|
// First snapshot with pve and pbs
|
|
pm.UpdateQueueSnapshot(QueueSnapshot{
|
|
DueWithinSeconds: 0,
|
|
PerType: map[string]int{
|
|
"pve": 10,
|
|
"pbs": 5,
|
|
},
|
|
})
|
|
|
|
// Verify initial values
|
|
if got := getGaugeVecValue(pm.schedulerQueueDepthByType, "pve"); got != 10 {
|
|
t.Fatalf("initial pve = %v, want 10", got)
|
|
}
|
|
if got := getGaugeVecValue(pm.schedulerQueueDepthByType, "pbs"); got != 5 {
|
|
t.Fatalf("initial pbs = %v, want 5", got)
|
|
}
|
|
|
|
// Second snapshot with only pve (pbs should be cleared)
|
|
pm.UpdateQueueSnapshot(QueueSnapshot{
|
|
DueWithinSeconds: 0,
|
|
PerType: map[string]int{
|
|
"pve": 8,
|
|
},
|
|
})
|
|
|
|
if got := getGaugeVecValue(pm.schedulerQueueDepthByType, "pve"); got != 8 {
|
|
t.Errorf("updated pve = %v, want 8", got)
|
|
}
|
|
if got := getGaugeVecValue(pm.schedulerQueueDepthByType, "pbs"); got != 0 {
|
|
t.Errorf("stale pbs should be 0, got %v", got)
|
|
}
|
|
}
|
|
|
|
func TestUpdateQueueSnapshot_EmptySnapshot(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newTestPollMetrics(t)
|
|
|
|
// First add some data
|
|
pm.UpdateQueueSnapshot(QueueSnapshot{
|
|
DueWithinSeconds: 10,
|
|
PerType: map[string]int{
|
|
"pve": 5,
|
|
},
|
|
})
|
|
|
|
// Then clear with empty snapshot
|
|
pm.UpdateQueueSnapshot(QueueSnapshot{
|
|
DueWithinSeconds: 0,
|
|
PerType: map[string]int{},
|
|
})
|
|
|
|
if got := getGaugeValue(pm.schedulerQueueReady); got != 0 {
|
|
t.Errorf("schedulerQueueReady = %v, want 0", got)
|
|
}
|
|
if got := getGaugeVecValue(pm.schedulerQueueDepthByType, "pve"); got != 0 {
|
|
t.Errorf("pve should be cleared to 0, got %v", got)
|
|
}
|
|
}
|
|
|
|
func TestUpdateDeadLetterCounts_NilPollMetrics(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
var pm *PollMetrics
|
|
// Should not panic
|
|
pm.UpdateDeadLetterCounts([]DeadLetterTask{
|
|
{Type: "pve", Instance: "pve1"},
|
|
})
|
|
}
|
|
|
|
func TestUpdateDeadLetterCounts_EmptyClearsPrevious(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newTestPollMetrics(t)
|
|
|
|
// First add some tasks
|
|
pm.UpdateDeadLetterCounts([]DeadLetterTask{
|
|
{Type: "pve", Instance: "pve1"},
|
|
{Type: "pbs", Instance: "pbs1"},
|
|
})
|
|
|
|
// Verify they were set
|
|
if got := getGaugeVecValue(pm.schedulerDeadLetterDepth, "pve", "pve1"); got != 1 {
|
|
t.Fatalf("initial pve/pve1 = %v, want 1", got)
|
|
}
|
|
if got := getGaugeVecValue(pm.schedulerDeadLetterDepth, "pbs", "pbs1"); got != 1 {
|
|
t.Fatalf("initial pbs/pbs1 = %v, want 1", got)
|
|
}
|
|
|
|
// Clear with empty slice
|
|
pm.UpdateDeadLetterCounts([]DeadLetterTask{})
|
|
|
|
if got := getGaugeVecValue(pm.schedulerDeadLetterDepth, "pve", "pve1"); got != 0 {
|
|
t.Errorf("pve/pve1 should be cleared to 0, got %v", got)
|
|
}
|
|
if got := getGaugeVecValue(pm.schedulerDeadLetterDepth, "pbs", "pbs1"); got != 0 {
|
|
t.Errorf("pbs/pbs1 should be cleared to 0, got %v", got)
|
|
}
|
|
}
|
|
|
|
func TestUpdateDeadLetterCounts_SingleTask(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newTestPollMetrics(t)
|
|
|
|
pm.UpdateDeadLetterCounts([]DeadLetterTask{
|
|
{Type: "pve", Instance: "my-pve-instance"},
|
|
})
|
|
|
|
got := getGaugeVecValue(pm.schedulerDeadLetterDepth, "pve", "my-pve-instance")
|
|
if got != 1 {
|
|
t.Fatalf("dead_letter_depth{pve,my-pve-instance} = %v, want 1", got)
|
|
}
|
|
}
|
|
|
|
func TestUpdateDeadLetterCounts_AggregatesSameTypeInstance(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newTestPollMetrics(t)
|
|
|
|
pm.UpdateDeadLetterCounts([]DeadLetterTask{
|
|
{Type: "pve", Instance: "pve1"},
|
|
{Type: "pve", Instance: "pve1"},
|
|
{Type: "pve", Instance: "pve1"},
|
|
{Type: "pbs", Instance: "pbs1"},
|
|
{Type: "pbs", Instance: "pbs1"},
|
|
})
|
|
|
|
if got := getGaugeVecValue(pm.schedulerDeadLetterDepth, "pve", "pve1"); got != 3 {
|
|
t.Errorf("pve/pve1 = %v, want 3", got)
|
|
}
|
|
if got := getGaugeVecValue(pm.schedulerDeadLetterDepth, "pbs", "pbs1"); got != 2 {
|
|
t.Errorf("pbs/pbs1 = %v, want 2", got)
|
|
}
|
|
}
|
|
|
|
func TestUpdateDeadLetterCounts_ClearsStaleKeys(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newTestPollMetrics(t)
|
|
|
|
// First update with pve1 and pbs1
|
|
pm.UpdateDeadLetterCounts([]DeadLetterTask{
|
|
{Type: "pve", Instance: "pve1"},
|
|
{Type: "pbs", Instance: "pbs1"},
|
|
})
|
|
|
|
// Verify initial values
|
|
if got := getGaugeVecValue(pm.schedulerDeadLetterDepth, "pve", "pve1"); got != 1 {
|
|
t.Fatalf("initial pve/pve1 = %v, want 1", got)
|
|
}
|
|
if got := getGaugeVecValue(pm.schedulerDeadLetterDepth, "pbs", "pbs1"); got != 1 {
|
|
t.Fatalf("initial pbs/pbs1 = %v, want 1", got)
|
|
}
|
|
|
|
// Second update with only pve1 (pbs1 should be cleared)
|
|
pm.UpdateDeadLetterCounts([]DeadLetterTask{
|
|
{Type: "pve", Instance: "pve1"},
|
|
{Type: "pve", Instance: "pve1"},
|
|
})
|
|
|
|
if got := getGaugeVecValue(pm.schedulerDeadLetterDepth, "pve", "pve1"); got != 2 {
|
|
t.Errorf("updated pve/pve1 = %v, want 2", got)
|
|
}
|
|
if got := getGaugeVecValue(pm.schedulerDeadLetterDepth, "pbs", "pbs1"); got != 0 {
|
|
t.Errorf("stale pbs/pbs1 should be 0, got %v", got)
|
|
}
|
|
}
|
|
|
|
func TestUpdateDeadLetterCounts_NormalizesEmptyLabels(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newTestPollMetrics(t)
|
|
|
|
// Empty Type and Instance should normalize to "unknown"
|
|
pm.UpdateDeadLetterCounts([]DeadLetterTask{
|
|
{Type: "", Instance: ""},
|
|
{Type: " ", Instance: " "},
|
|
})
|
|
|
|
got := getGaugeVecValue(pm.schedulerDeadLetterDepth, "unknown", "unknown")
|
|
if got != 2 {
|
|
t.Fatalf("empty labels should normalize to unknown, got count %v, want 2", got)
|
|
}
|
|
}
|
|
|
|
func TestUpdateDeadLetterCounts_MultipleInstancesSameType(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newTestPollMetrics(t)
|
|
|
|
pm.UpdateDeadLetterCounts([]DeadLetterTask{
|
|
{Type: "pve", Instance: "pve1"},
|
|
{Type: "pve", Instance: "pve2"},
|
|
{Type: "pve", Instance: "pve3"},
|
|
})
|
|
|
|
if got := getGaugeVecValue(pm.schedulerDeadLetterDepth, "pve", "pve1"); got != 1 {
|
|
t.Errorf("pve/pve1 = %v, want 1", got)
|
|
}
|
|
if got := getGaugeVecValue(pm.schedulerDeadLetterDepth, "pve", "pve2"); got != 1 {
|
|
t.Errorf("pve/pve2 = %v, want 1", got)
|
|
}
|
|
if got := getGaugeVecValue(pm.schedulerDeadLetterDepth, "pve", "pve3"); got != 1 {
|
|
t.Errorf("pve/pve3 = %v, want 1", got)
|
|
}
|
|
}
|
|
|
|
func TestSetBreakerState_NilPollMetrics(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
var pm *PollMetrics
|
|
// Should not panic
|
|
pm.SetBreakerState("pve", "pve1", "open", 5, time.Now().Add(time.Minute))
|
|
}
|
|
|
|
func TestSetBreakerState_ZeroRetryAt(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newTestPollMetrics(t)
|
|
|
|
pm.SetBreakerState("pve", "pve1", "closed", 0, time.Time{})
|
|
|
|
got := getGaugeVecValue(pm.schedulerBreakerRetrySeconds, "pve", "pve1")
|
|
if got != 0 {
|
|
t.Fatalf("retrySeconds = %v, want 0 for zero retryAt", got)
|
|
}
|
|
}
|
|
|
|
func TestSetBreakerState_FutureRetryAt(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newTestPollMetrics(t)
|
|
|
|
// Set retryAt 60 seconds in the future
|
|
retryAt := time.Now().Add(60 * time.Second)
|
|
pm.SetBreakerState("pve", "pve1", "open", 3, retryAt)
|
|
|
|
got := getGaugeVecValue(pm.schedulerBreakerRetrySeconds, "pve", "pve1")
|
|
// Allow some tolerance for test execution time
|
|
if got < 55 || got > 65 {
|
|
t.Fatalf("retrySeconds = %v, want ~60 for future retryAt", got)
|
|
}
|
|
}
|
|
|
|
func TestSetBreakerState_PastRetryAtClampsToZero(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newTestPollMetrics(t)
|
|
|
|
// Set retryAt 60 seconds in the past (already expired)
|
|
retryAt := time.Now().Add(-60 * time.Second)
|
|
pm.SetBreakerState("pve", "pve1", "half_open", 1, retryAt)
|
|
|
|
got := getGaugeVecValue(pm.schedulerBreakerRetrySeconds, "pve", "pve1")
|
|
if got != 0 {
|
|
t.Fatalf("retrySeconds = %v, want 0 for past retryAt", got)
|
|
}
|
|
}
|
|
|
|
func TestSetBreakerState_StateConversion(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
cases := []struct {
|
|
state string
|
|
want float64
|
|
}{
|
|
{"closed", 0},
|
|
{"CLOSED", 0},
|
|
{"Closed", 0},
|
|
{"half_open", 1},
|
|
{"half-open", 1},
|
|
{"HALF_OPEN", 1},
|
|
{"open", 2},
|
|
{"OPEN", 2},
|
|
{"Open", 2},
|
|
{"unknown_state", -1},
|
|
{"", -1},
|
|
{"invalid", -1},
|
|
}
|
|
|
|
for _, tc := range cases {
|
|
t.Run(tc.state, func(t *testing.T) {
|
|
pm := newTestPollMetrics(t)
|
|
pm.SetBreakerState("pve", "pve1", tc.state, 0, time.Time{})
|
|
|
|
got := getGaugeVecValue(pm.schedulerBreakerState, "pve", "pve1")
|
|
if got != tc.want {
|
|
t.Errorf("breakerState for %q = %v, want %v", tc.state, got, tc.want)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestSetBreakerState_FailureCount(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newTestPollMetrics(t)
|
|
|
|
pm.SetBreakerState("pve", "pve1", "open", 7, time.Time{})
|
|
|
|
got := getGaugeVecValue(pm.schedulerBreakerFailureCount, "pve", "pve1")
|
|
if got != 7 {
|
|
t.Fatalf("failureCount = %v, want 7", got)
|
|
}
|
|
}
|
|
|
|
func TestSetBreakerState_EmptyLabelsSanitized(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newTestPollMetrics(t)
|
|
|
|
pm.SetBreakerState("", "", "open", 5, time.Time{})
|
|
|
|
// Empty labels should be normalized to "unknown"
|
|
gotState := getGaugeVecValue(pm.schedulerBreakerState, "unknown", "unknown")
|
|
if gotState != 2 {
|
|
t.Errorf("breaker_state{unknown,unknown} = %v, want 2 (open)", gotState)
|
|
}
|
|
|
|
gotFailures := getGaugeVecValue(pm.schedulerBreakerFailureCount, "unknown", "unknown")
|
|
if gotFailures != 5 {
|
|
t.Errorf("breaker_failure_count{unknown,unknown} = %v, want 5", gotFailures)
|
|
}
|
|
}
|
|
|
|
func TestSetBreakerState_WhitespaceOnlyLabelsSanitized(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newTestPollMetrics(t)
|
|
|
|
pm.SetBreakerState(" ", " ", "closed", 0, time.Time{})
|
|
|
|
gotState := getGaugeVecValue(pm.schedulerBreakerState, "unknown", "unknown")
|
|
if gotState != 0 {
|
|
t.Errorf("breaker_state{unknown,unknown} = %v, want 0 (closed)", gotState)
|
|
}
|
|
}
|
|
|
|
// newFullTestPollMetrics creates a PollMetrics instance with all fields for RecordResult testing.
|
|
func newFullTestPollMetrics(t *testing.T) *PollMetrics {
|
|
t.Helper()
|
|
|
|
reg := prometheus.NewRegistry()
|
|
|
|
pm := &PollMetrics{
|
|
pollDuration: prometheus.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "monitor",
|
|
Name: "poll_duration_seconds",
|
|
Help: "Duration of polling operations per instance.",
|
|
Buckets: []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10, 15, 20, 30},
|
|
},
|
|
[]string{"instance_type", "instance"},
|
|
),
|
|
pollResults: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "monitor",
|
|
Name: "poll_total",
|
|
Help: "Total polling attempts partitioned by result.",
|
|
},
|
|
[]string{"instance_type", "instance", "result"},
|
|
),
|
|
pollErrors: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "monitor",
|
|
Name: "poll_errors_total",
|
|
Help: "Polling failures grouped by error type.",
|
|
},
|
|
[]string{"instance_type", "instance", "error_type"},
|
|
),
|
|
lastSuccess: prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "monitor",
|
|
Name: "poll_last_success_timestamp",
|
|
Help: "Unix timestamp of the last successful poll.",
|
|
},
|
|
[]string{"instance_type", "instance"},
|
|
),
|
|
staleness: prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "monitor",
|
|
Name: "poll_staleness_seconds",
|
|
Help: "Seconds since the last successful poll.",
|
|
},
|
|
[]string{"instance_type", "instance"},
|
|
),
|
|
queueDepth: prometheus.NewGauge(
|
|
prometheus.GaugeOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "monitor",
|
|
Name: "poll_queue_depth",
|
|
Help: "Approximate number of poll tasks waiting.",
|
|
},
|
|
),
|
|
lastSuccessByKey: make(map[metricKey]time.Time),
|
|
}
|
|
|
|
reg.MustRegister(
|
|
pm.pollDuration,
|
|
pm.pollResults,
|
|
pm.pollErrors,
|
|
pm.lastSuccess,
|
|
pm.staleness,
|
|
pm.queueDepth,
|
|
)
|
|
|
|
return pm
|
|
}
|
|
|
|
// getCounterVecValue returns the value for specific labels from a CounterVec.
|
|
func getCounterVecValue(cv *prometheus.CounterVec, labels ...string) float64 {
|
|
m := &dto.Metric{}
|
|
counter, err := cv.GetMetricWithLabelValues(labels...)
|
|
if err != nil {
|
|
return 0
|
|
}
|
|
if err := counter.Write(m); err != nil {
|
|
return 0
|
|
}
|
|
return m.GetCounter().GetValue()
|
|
}
|
|
|
|
// getHistogramSampleCount returns the sample count for specific labels from a HistogramVec.
|
|
func getHistogramSampleCount(hv *prometheus.HistogramVec, labels ...string) uint64 {
|
|
m := &dto.Metric{}
|
|
obs, err := hv.GetMetricWithLabelValues(labels...)
|
|
if err != nil {
|
|
return 0
|
|
}
|
|
if err := obs.(prometheus.Metric).Write(m); err != nil {
|
|
return 0
|
|
}
|
|
return m.GetHistogram().GetSampleCount()
|
|
}
|
|
|
|
// getHistogramSampleSum returns the sample sum for specific labels from a HistogramVec.
|
|
func getHistogramSampleSum(hv *prometheus.HistogramVec, labels ...string) float64 {
|
|
m := &dto.Metric{}
|
|
obs, err := hv.GetMetricWithLabelValues(labels...)
|
|
if err != nil {
|
|
return 0
|
|
}
|
|
if err := obs.(prometheus.Metric).Write(m); err != nil {
|
|
return 0
|
|
}
|
|
return m.GetHistogram().GetSampleSum()
|
|
}
|
|
|
|
func TestRecordResult_NilPollMetrics(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
var pm *PollMetrics
|
|
// Should not panic
|
|
pm.RecordResult(PollResult{
|
|
InstanceType: "pve",
|
|
InstanceName: "pve1",
|
|
StartTime: time.Now(),
|
|
EndTime: time.Now().Add(time.Second),
|
|
Success: true,
|
|
})
|
|
}
|
|
|
|
func TestRecordResult_SuccessUpdatesLastSuccessAndStaleness(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newFullTestPollMetrics(t)
|
|
|
|
endTime := time.Now()
|
|
pm.RecordResult(PollResult{
|
|
InstanceType: "pve",
|
|
InstanceName: "my-instance",
|
|
StartTime: endTime.Add(-500 * time.Millisecond),
|
|
EndTime: endTime,
|
|
Success: true,
|
|
})
|
|
|
|
// Check lastSuccess is set to EndTime
|
|
gotLastSuccess := getGaugeVecValue(pm.lastSuccess, "pve", "my-instance")
|
|
wantLastSuccess := float64(endTime.Unix())
|
|
if gotLastSuccess != wantLastSuccess {
|
|
t.Errorf("lastSuccess = %v, want %v", gotLastSuccess, wantLastSuccess)
|
|
}
|
|
|
|
// Check staleness is set to 0 on success
|
|
gotStaleness := getGaugeVecValue(pm.staleness, "pve", "my-instance")
|
|
if gotStaleness != 0 {
|
|
t.Errorf("staleness = %v, want 0 on success", gotStaleness)
|
|
}
|
|
|
|
// Check success counter incremented
|
|
gotSuccessCount := getCounterVecValue(pm.pollResults, "pve", "my-instance", "success")
|
|
if gotSuccessCount != 1 {
|
|
t.Errorf("poll_total{result=success} = %v, want 1", gotSuccessCount)
|
|
}
|
|
|
|
// Check internal lastSuccessByKey is updated
|
|
ts, ok := pm.lastSuccessFor("pve", "my-instance")
|
|
if !ok {
|
|
t.Fatal("lastSuccessFor returned false, expected true")
|
|
}
|
|
if !ts.Equal(endTime) {
|
|
t.Errorf("stored lastSuccess = %v, want %v", ts, endTime)
|
|
}
|
|
}
|
|
|
|
func TestRecordResult_FailureIncrementsErrorCounter(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newFullTestPollMetrics(t)
|
|
|
|
// Use a MonitorError to test error classification
|
|
monErr := internalerrors.NewMonitorError(
|
|
internalerrors.ErrorTypeConnection,
|
|
"poll_nodes",
|
|
"pve1",
|
|
errors.New("connection refused"),
|
|
)
|
|
|
|
pm.RecordResult(PollResult{
|
|
InstanceType: "pve",
|
|
InstanceName: "pve1",
|
|
StartTime: time.Now().Add(-time.Second),
|
|
EndTime: time.Now(),
|
|
Success: false,
|
|
Error: monErr,
|
|
})
|
|
|
|
// Check error counter with classified type
|
|
gotErrorCount := getCounterVecValue(pm.pollErrors, "pve", "pve1", "connection")
|
|
if gotErrorCount != 1 {
|
|
t.Errorf("poll_errors_total{error_type=connection} = %v, want 1", gotErrorCount)
|
|
}
|
|
|
|
// Check error result counter
|
|
gotErrorResultCount := getCounterVecValue(pm.pollResults, "pve", "pve1", "error")
|
|
if gotErrorResultCount != 1 {
|
|
t.Errorf("poll_total{result=error} = %v, want 1", gotErrorResultCount)
|
|
}
|
|
}
|
|
|
|
func TestRecordResult_FailureWithUnknownErrorType(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newFullTestPollMetrics(t)
|
|
|
|
// Non-MonitorError should classify as "unknown"
|
|
pm.RecordResult(PollResult{
|
|
InstanceType: "pbs",
|
|
InstanceName: "pbs1",
|
|
StartTime: time.Now().Add(-time.Second),
|
|
EndTime: time.Now(),
|
|
Success: false,
|
|
Error: errors.New("some random error"),
|
|
})
|
|
|
|
gotErrorCount := getCounterVecValue(pm.pollErrors, "pbs", "pbs1", "unknown")
|
|
if gotErrorCount != 1 {
|
|
t.Errorf("poll_errors_total{error_type=unknown} = %v, want 1", gotErrorCount)
|
|
}
|
|
}
|
|
|
|
func TestRecordResult_FailureWithNilErrorClassifiesAsNone(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newFullTestPollMetrics(t)
|
|
|
|
pm.RecordResult(PollResult{
|
|
InstanceType: "pmg",
|
|
InstanceName: "pmg1",
|
|
StartTime: time.Now().Add(-time.Second),
|
|
EndTime: time.Now(),
|
|
Success: false,
|
|
Error: nil,
|
|
})
|
|
|
|
gotErrorCount := getCounterVecValue(pm.pollErrors, "pmg", "pmg1", "none")
|
|
if gotErrorCount != 1 {
|
|
t.Errorf("poll_errors_total{error_type=none} = %v, want 1", gotErrorCount)
|
|
}
|
|
}
|
|
|
|
func TestRecordResult_NegativeDurationClampedToZero(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newFullTestPollMetrics(t)
|
|
|
|
endTime := time.Now()
|
|
startTime := endTime.Add(time.Second) // Start AFTER end = negative duration
|
|
|
|
pm.RecordResult(PollResult{
|
|
InstanceType: "pve",
|
|
InstanceName: "neg-test",
|
|
StartTime: startTime,
|
|
EndTime: endTime,
|
|
Success: true,
|
|
})
|
|
|
|
// Histogram should have recorded 0, not a negative value
|
|
sampleSum := getHistogramSampleSum(pm.pollDuration, "pve", "neg-test")
|
|
if sampleSum != 0 {
|
|
t.Errorf("poll_duration sum = %v, want 0 for negative duration", sampleSum)
|
|
}
|
|
|
|
sampleCount := getHistogramSampleCount(pm.pollDuration, "pve", "neg-test")
|
|
if sampleCount != 1 {
|
|
t.Errorf("poll_duration count = %v, want 1", sampleCount)
|
|
}
|
|
}
|
|
|
|
func TestRecordResult_LabelsSanitized(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newFullTestPollMetrics(t)
|
|
|
|
pm.RecordResult(PollResult{
|
|
InstanceType: "", // Should become "unknown"
|
|
InstanceName: " ", // Should become "unknown"
|
|
StartTime: time.Now().Add(-time.Second),
|
|
EndTime: time.Now(),
|
|
Success: true,
|
|
})
|
|
|
|
// Check that metrics were recorded with sanitized labels
|
|
gotSuccessCount := getCounterVecValue(pm.pollResults, "unknown", "unknown", "success")
|
|
if gotSuccessCount != 1 {
|
|
t.Errorf("poll_total{unknown,unknown,success} = %v, want 1", gotSuccessCount)
|
|
}
|
|
}
|
|
|
|
func TestRecordResult_DecrementsPending(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newFullTestPollMetrics(t)
|
|
|
|
// Set initial pending count
|
|
pm.ResetQueueDepth(5)
|
|
|
|
// Record a result - should decrement pending
|
|
pm.RecordResult(PollResult{
|
|
InstanceType: "pve",
|
|
InstanceName: "pve1",
|
|
StartTime: time.Now().Add(-time.Second),
|
|
EndTime: time.Now(),
|
|
Success: true,
|
|
})
|
|
|
|
// Check pending was decremented
|
|
pm.mu.RLock()
|
|
gotPending := pm.pending
|
|
pm.mu.RUnlock()
|
|
|
|
if gotPending != 4 {
|
|
t.Errorf("pending = %v, want 4 after decrement from 5", gotPending)
|
|
}
|
|
|
|
// Check queueDepth gauge reflects the new value
|
|
gotQueueDepth := getGaugeValue(pm.queueDepth)
|
|
if gotQueueDepth != 4 {
|
|
t.Errorf("queueDepth gauge = %v, want 4", gotQueueDepth)
|
|
}
|
|
}
|
|
|
|
func TestRecordResult_FailureStalenessWithPreviousSuccess(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newFullTestPollMetrics(t)
|
|
|
|
// First, record a successful poll
|
|
firstEndTime := time.Now().Add(-10 * time.Second)
|
|
pm.RecordResult(PollResult{
|
|
InstanceType: "pve",
|
|
InstanceName: "stale-test",
|
|
StartTime: firstEndTime.Add(-time.Second),
|
|
EndTime: firstEndTime,
|
|
Success: true,
|
|
})
|
|
|
|
// Now record a failed poll
|
|
secondEndTime := time.Now()
|
|
pm.RecordResult(PollResult{
|
|
InstanceType: "pve",
|
|
InstanceName: "stale-test",
|
|
StartTime: secondEndTime.Add(-time.Second),
|
|
EndTime: secondEndTime,
|
|
Success: false,
|
|
Error: errors.New("failed"),
|
|
})
|
|
|
|
// Staleness should be ~10 seconds
|
|
gotStaleness := getGaugeVecValue(pm.staleness, "pve", "stale-test")
|
|
if gotStaleness < 9 || gotStaleness > 11 {
|
|
t.Errorf("staleness = %v, want ~10 seconds", gotStaleness)
|
|
}
|
|
}
|
|
|
|
func TestRecordResult_FailureStalenessWithoutPreviousSuccess(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newFullTestPollMetrics(t)
|
|
|
|
// Record a failure without any prior success
|
|
pm.RecordResult(PollResult{
|
|
InstanceType: "pve",
|
|
InstanceName: "no-prior-success",
|
|
StartTime: time.Now().Add(-time.Second),
|
|
EndTime: time.Now(),
|
|
Success: false,
|
|
Error: errors.New("failed"),
|
|
})
|
|
|
|
// Staleness should be -1 (no prior success)
|
|
gotStaleness := getGaugeVecValue(pm.staleness, "pve", "no-prior-success")
|
|
if gotStaleness != -1 {
|
|
t.Errorf("staleness = %v, want -1 for no prior success", gotStaleness)
|
|
}
|
|
}
|
|
|
|
func TestRecordResult_DurationObserved(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newFullTestPollMetrics(t)
|
|
|
|
endTime := time.Now()
|
|
startTime := endTime.Add(-2 * time.Second)
|
|
|
|
pm.RecordResult(PollResult{
|
|
InstanceType: "pbs",
|
|
InstanceName: "duration-test",
|
|
StartTime: startTime,
|
|
EndTime: endTime,
|
|
Success: true,
|
|
})
|
|
|
|
sampleSum := getHistogramSampleSum(pm.pollDuration, "pbs", "duration-test")
|
|
if sampleSum < 1.9 || sampleSum > 2.1 {
|
|
t.Errorf("poll_duration sum = %v, want ~2.0 seconds", sampleSum)
|
|
}
|
|
|
|
sampleCount := getHistogramSampleCount(pm.pollDuration, "pbs", "duration-test")
|
|
if sampleCount != 1 {
|
|
t.Errorf("poll_duration count = %v, want 1", sampleCount)
|
|
}
|
|
}
|
|
|
|
// newInFlightTestPollMetrics creates a PollMetrics with inflight gauge for testing.
|
|
func newInFlightTestPollMetrics(t *testing.T) *PollMetrics {
|
|
t.Helper()
|
|
|
|
reg := prometheus.NewRegistry()
|
|
|
|
pm := &PollMetrics{
|
|
inflight: prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "monitor",
|
|
Name: "poll_inflight",
|
|
Help: "Current number of poll operations executing per instance type.",
|
|
},
|
|
[]string{"instance_type"},
|
|
),
|
|
queueDepth: prometheus.NewGauge(
|
|
prometheus.GaugeOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "monitor",
|
|
Name: "poll_queue_depth",
|
|
Help: "Approximate number of poll tasks waiting.",
|
|
},
|
|
),
|
|
}
|
|
|
|
reg.MustRegister(pm.inflight, pm.queueDepth)
|
|
|
|
return pm
|
|
}
|
|
|
|
func TestResetQueueDepth_NilPollMetrics(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
var pm *PollMetrics
|
|
// Should not panic
|
|
pm.ResetQueueDepth(10)
|
|
}
|
|
|
|
func TestResetQueueDepth_SetsPendingTotal(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newFullTestPollMetrics(t)
|
|
|
|
pm.ResetQueueDepth(42)
|
|
|
|
pm.mu.RLock()
|
|
gotPending := pm.pending
|
|
pm.mu.RUnlock()
|
|
|
|
if gotPending != 42 {
|
|
t.Errorf("pending = %v, want 42", gotPending)
|
|
}
|
|
|
|
gotQueueDepth := getGaugeValue(pm.queueDepth)
|
|
if gotQueueDepth != 42 {
|
|
t.Errorf("queueDepth gauge = %v, want 42", gotQueueDepth)
|
|
}
|
|
}
|
|
|
|
func TestResetQueueDepth_NegativeClampsToZero(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newFullTestPollMetrics(t)
|
|
|
|
// First set to positive value
|
|
pm.ResetQueueDepth(10)
|
|
|
|
// Then set to negative - should clamp to 0
|
|
pm.ResetQueueDepth(-5)
|
|
|
|
pm.mu.RLock()
|
|
gotPending := pm.pending
|
|
pm.mu.RUnlock()
|
|
|
|
if gotPending != 0 {
|
|
t.Errorf("pending = %v, want 0 for negative input", gotPending)
|
|
}
|
|
|
|
gotQueueDepth := getGaugeValue(pm.queueDepth)
|
|
if gotQueueDepth != 0 {
|
|
t.Errorf("queueDepth gauge = %v, want 0 for negative input", gotQueueDepth)
|
|
}
|
|
}
|
|
|
|
func TestResetQueueDepth_ZeroWorksCorrectly(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newFullTestPollMetrics(t)
|
|
|
|
// First set to positive value
|
|
pm.ResetQueueDepth(10)
|
|
|
|
// Then reset to zero
|
|
pm.ResetQueueDepth(0)
|
|
|
|
pm.mu.RLock()
|
|
gotPending := pm.pending
|
|
pm.mu.RUnlock()
|
|
|
|
if gotPending != 0 {
|
|
t.Errorf("pending = %v, want 0", gotPending)
|
|
}
|
|
|
|
gotQueueDepth := getGaugeValue(pm.queueDepth)
|
|
if gotQueueDepth != 0 {
|
|
t.Errorf("queueDepth gauge = %v, want 0", gotQueueDepth)
|
|
}
|
|
}
|
|
|
|
func TestIncInFlight_NilPollMetrics(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
var pm *PollMetrics
|
|
// Should not panic
|
|
pm.IncInFlight("pve")
|
|
}
|
|
|
|
func TestIncInFlight_IncrementsGauge(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newInFlightTestPollMetrics(t)
|
|
|
|
pm.IncInFlight("pve")
|
|
pm.IncInFlight("pve")
|
|
pm.IncInFlight("pbs")
|
|
|
|
gotPve := getGaugeVecValue(pm.inflight, "pve")
|
|
if gotPve != 2 {
|
|
t.Errorf("inflight{pve} = %v, want 2", gotPve)
|
|
}
|
|
|
|
gotPbs := getGaugeVecValue(pm.inflight, "pbs")
|
|
if gotPbs != 1 {
|
|
t.Errorf("inflight{pbs} = %v, want 1", gotPbs)
|
|
}
|
|
}
|
|
|
|
func TestDecInFlight_NilPollMetrics(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
var pm *PollMetrics
|
|
// Should not panic
|
|
pm.DecInFlight("pve")
|
|
}
|
|
|
|
func TestDecInFlight_DecrementsGauge(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newInFlightTestPollMetrics(t)
|
|
|
|
// First increment a few times
|
|
pm.IncInFlight("pve")
|
|
pm.IncInFlight("pve")
|
|
pm.IncInFlight("pve")
|
|
|
|
// Then decrement
|
|
pm.DecInFlight("pve")
|
|
|
|
got := getGaugeVecValue(pm.inflight, "pve")
|
|
if got != 2 {
|
|
t.Errorf("inflight{pve} = %v, want 2 after inc(3) dec(1)", got)
|
|
}
|
|
|
|
// Decrement again
|
|
pm.DecInFlight("pve")
|
|
pm.DecInFlight("pve")
|
|
|
|
got = getGaugeVecValue(pm.inflight, "pve")
|
|
if got != 0 {
|
|
t.Errorf("inflight{pve} = %v, want 0 after full decrement", got)
|
|
}
|
|
}
|
|
|
|
func TestDecrementPending_NilPollMetrics(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
var pm *PollMetrics
|
|
// Should not panic
|
|
pm.decrementPending()
|
|
}
|
|
|
|
func TestDecrementPending_DecrementsWhenPositive(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newFullTestPollMetrics(t)
|
|
|
|
// Set initial pending count
|
|
pm.ResetQueueDepth(5)
|
|
|
|
pm.decrementPending()
|
|
|
|
pm.mu.RLock()
|
|
gotPending := pm.pending
|
|
pm.mu.RUnlock()
|
|
|
|
if gotPending != 4 {
|
|
t.Errorf("pending = %v, want 4 after decrement from 5", gotPending)
|
|
}
|
|
}
|
|
|
|
func TestDecrementPending_DoesNotGoBelowZero(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newFullTestPollMetrics(t)
|
|
|
|
// Start at 0 (default)
|
|
pm.decrementPending()
|
|
|
|
pm.mu.RLock()
|
|
gotPending := pm.pending
|
|
pm.mu.RUnlock()
|
|
|
|
if gotPending != 0 {
|
|
t.Errorf("pending = %v, want 0 (should not go negative)", gotPending)
|
|
}
|
|
|
|
// Also verify the gauge is 0
|
|
gotQueueDepth := getGaugeValue(pm.queueDepth)
|
|
if gotQueueDepth != 0 {
|
|
t.Errorf("queueDepth gauge = %v, want 0", gotQueueDepth)
|
|
}
|
|
}
|
|
|
|
func TestDecrementPending_UpdatesQueueDepthGauge(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newFullTestPollMetrics(t)
|
|
|
|
pm.ResetQueueDepth(10)
|
|
pm.decrementPending()
|
|
|
|
gotQueueDepth := getGaugeValue(pm.queueDepth)
|
|
if gotQueueDepth != 9 {
|
|
t.Errorf("queueDepth gauge = %v, want 9", gotQueueDepth)
|
|
}
|
|
}
|
|
|
|
func TestDecrementPending_MultipleDecrements(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newFullTestPollMetrics(t)
|
|
|
|
pm.ResetQueueDepth(5)
|
|
|
|
// Decrement 5 times
|
|
for i := 0; i < 5; i++ {
|
|
pm.decrementPending()
|
|
}
|
|
|
|
pm.mu.RLock()
|
|
gotPending := pm.pending
|
|
pm.mu.RUnlock()
|
|
|
|
if gotPending != 0 {
|
|
t.Errorf("pending = %v, want 0 after 5 decrements from 5", gotPending)
|
|
}
|
|
|
|
gotQueueDepth := getGaugeValue(pm.queueDepth)
|
|
if gotQueueDepth != 0 {
|
|
t.Errorf("queueDepth gauge = %v, want 0", gotQueueDepth)
|
|
}
|
|
|
|
// Decrement one more time - should stay at 0
|
|
pm.decrementPending()
|
|
|
|
pm.mu.RLock()
|
|
gotPending = pm.pending
|
|
pm.mu.RUnlock()
|
|
|
|
if gotPending != 0 {
|
|
t.Errorf("pending = %v, want 0 after extra decrement", gotPending)
|
|
}
|
|
}
|
|
|
|
func TestRecordResult_FailureNegativeStalenessClampedToZero(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newFullTestPollMetrics(t)
|
|
|
|
// Record a success with a future timestamp
|
|
futureTime := time.Now().Add(10 * time.Second)
|
|
pm.RecordResult(PollResult{
|
|
InstanceType: "pve",
|
|
InstanceName: "negative-staleness",
|
|
StartTime: futureTime.Add(-time.Second),
|
|
EndTime: futureTime,
|
|
Success: true,
|
|
})
|
|
|
|
// Now record a failure with an EndTime BEFORE the last success
|
|
// This creates a negative staleness calculation (EndTime - lastSuccess < 0)
|
|
pastTime := futureTime.Add(-5 * time.Second)
|
|
pm.RecordResult(PollResult{
|
|
InstanceType: "pve",
|
|
InstanceName: "negative-staleness",
|
|
StartTime: pastTime.Add(-time.Second),
|
|
EndTime: pastTime,
|
|
Success: false,
|
|
Error: errors.New("failed"),
|
|
})
|
|
|
|
// Staleness should be clamped to 0, not negative
|
|
gotStaleness := getGaugeVecValue(pm.staleness, "pve", "negative-staleness")
|
|
if gotStaleness != 0 {
|
|
t.Errorf("staleness = %v, want 0 for negative staleness calculation", gotStaleness)
|
|
}
|
|
}
|
|
|
|
// newNodeTestPollMetrics creates a PollMetrics instance with node-level metrics for testing.
|
|
func newNodeTestPollMetrics(t *testing.T) *PollMetrics {
|
|
t.Helper()
|
|
|
|
reg := prometheus.NewRegistry()
|
|
|
|
pm := &PollMetrics{
|
|
nodePollDuration: prometheus.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "monitor",
|
|
Name: "node_poll_duration_seconds",
|
|
Help: "Duration of polling operations per node.",
|
|
Buckets: []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10, 15, 20, 30},
|
|
},
|
|
[]string{"instance_type", "instance", "node"},
|
|
),
|
|
nodePollResults: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "monitor",
|
|
Name: "node_poll_total",
|
|
Help: "Total polling attempts per node partitioned by result.",
|
|
},
|
|
[]string{"instance_type", "instance", "node", "result"},
|
|
),
|
|
nodePollErrors: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "monitor",
|
|
Name: "node_poll_errors_total",
|
|
Help: "Polling failures per node grouped by error type.",
|
|
},
|
|
[]string{"instance_type", "instance", "node", "error_type"},
|
|
),
|
|
nodeLastSuccess: prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "monitor",
|
|
Name: "node_poll_last_success_timestamp",
|
|
Help: "Unix timestamp of the last successful poll for a node.",
|
|
},
|
|
[]string{"instance_type", "instance", "node"},
|
|
),
|
|
nodeStaleness: prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "monitor",
|
|
Name: "node_poll_staleness_seconds",
|
|
Help: "Seconds since the last successful poll for a node.",
|
|
},
|
|
[]string{"instance_type", "instance", "node"},
|
|
),
|
|
nodeLastSuccessByKey: make(map[nodeMetricKey]time.Time),
|
|
}
|
|
|
|
reg.MustRegister(
|
|
pm.nodePollDuration,
|
|
pm.nodePollResults,
|
|
pm.nodePollErrors,
|
|
pm.nodeLastSuccess,
|
|
pm.nodeStaleness,
|
|
)
|
|
|
|
return pm
|
|
}
|
|
|
|
func TestRecordNodeResult_NilPollMetrics(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
var pm *PollMetrics
|
|
// Should not panic
|
|
pm.RecordNodeResult(NodePollResult{
|
|
InstanceType: "pve",
|
|
InstanceName: "pve1",
|
|
NodeName: "node1",
|
|
StartTime: time.Now(),
|
|
EndTime: time.Now().Add(time.Second),
|
|
Success: true,
|
|
})
|
|
}
|
|
|
|
func TestRecordNodeResult_SuccessUpdatesMetrics(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newNodeTestPollMetrics(t)
|
|
|
|
endTime := time.Now()
|
|
pm.RecordNodeResult(NodePollResult{
|
|
InstanceType: "pve",
|
|
InstanceName: "my-pve",
|
|
NodeName: "node1",
|
|
StartTime: endTime.Add(-500 * time.Millisecond),
|
|
EndTime: endTime,
|
|
Success: true,
|
|
})
|
|
|
|
// Check lastSuccess is set
|
|
gotLastSuccess := getGaugeVecValue(pm.nodeLastSuccess, "pve", "my-pve", "node1")
|
|
wantLastSuccess := float64(endTime.Unix())
|
|
if gotLastSuccess != wantLastSuccess {
|
|
t.Errorf("nodeLastSuccess = %v, want %v", gotLastSuccess, wantLastSuccess)
|
|
}
|
|
|
|
// Check staleness is set to 0 on success
|
|
gotStaleness := getGaugeVecValue(pm.nodeStaleness, "pve", "my-pve", "node1")
|
|
if gotStaleness != 0 {
|
|
t.Errorf("nodeStaleness = %v, want 0 on success", gotStaleness)
|
|
}
|
|
|
|
// Check success counter incremented
|
|
gotSuccessCount := getCounterVecValue(pm.nodePollResults, "pve", "my-pve", "node1", "success")
|
|
if gotSuccessCount != 1 {
|
|
t.Errorf("node_poll_total{result=success} = %v, want 1", gotSuccessCount)
|
|
}
|
|
|
|
// Check internal nodeLastSuccessByKey is updated
|
|
ts, ok := pm.lastNodeSuccessFor("pve", "my-pve", "node1")
|
|
if !ok {
|
|
t.Fatal("lastNodeSuccessFor returned false, expected true")
|
|
}
|
|
if !ts.Equal(endTime) {
|
|
t.Errorf("stored nodeLastSuccess = %v, want %v", ts, endTime)
|
|
}
|
|
}
|
|
|
|
func TestRecordNodeResult_FailureIncrementsErrorCounter(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newNodeTestPollMetrics(t)
|
|
|
|
monErr := internalerrors.NewMonitorError(
|
|
internalerrors.ErrorTypeTimeout,
|
|
"poll_node",
|
|
"pve1",
|
|
errors.New("timeout"),
|
|
)
|
|
|
|
pm.RecordNodeResult(NodePollResult{
|
|
InstanceType: "pve",
|
|
InstanceName: "pve1",
|
|
NodeName: "node2",
|
|
StartTime: time.Now().Add(-time.Second),
|
|
EndTime: time.Now(),
|
|
Success: false,
|
|
Error: monErr,
|
|
})
|
|
|
|
// Check error counter with classified type
|
|
gotErrorCount := getCounterVecValue(pm.nodePollErrors, "pve", "pve1", "node2", "timeout")
|
|
if gotErrorCount != 1 {
|
|
t.Errorf("node_poll_errors_total{error_type=timeout} = %v, want 1", gotErrorCount)
|
|
}
|
|
|
|
// Check error result counter
|
|
gotErrorResultCount := getCounterVecValue(pm.nodePollResults, "pve", "pve1", "node2", "error")
|
|
if gotErrorResultCount != 1 {
|
|
t.Errorf("node_poll_total{result=error} = %v, want 1", gotErrorResultCount)
|
|
}
|
|
}
|
|
|
|
func TestRecordNodeResult_NegativeDurationClampedToZero(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newNodeTestPollMetrics(t)
|
|
|
|
endTime := time.Now()
|
|
startTime := endTime.Add(time.Second) // Start AFTER end = negative duration
|
|
|
|
pm.RecordNodeResult(NodePollResult{
|
|
InstanceType: "pve",
|
|
InstanceName: "neg-test",
|
|
NodeName: "node1",
|
|
StartTime: startTime,
|
|
EndTime: endTime,
|
|
Success: true,
|
|
})
|
|
|
|
sampleSum := getHistogramSampleSum(pm.nodePollDuration, "pve", "neg-test", "node1")
|
|
if sampleSum != 0 {
|
|
t.Errorf("node_poll_duration sum = %v, want 0 for negative duration", sampleSum)
|
|
}
|
|
}
|
|
|
|
func TestRecordNodeResult_EmptyNodeNameNormalized(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newNodeTestPollMetrics(t)
|
|
|
|
pm.RecordNodeResult(NodePollResult{
|
|
InstanceType: "pve",
|
|
InstanceName: "pve1",
|
|
NodeName: "",
|
|
StartTime: time.Now().Add(-time.Second),
|
|
EndTime: time.Now(),
|
|
Success: true,
|
|
})
|
|
|
|
// Empty node name should normalize to "unknown-node"
|
|
gotSuccessCount := getCounterVecValue(pm.nodePollResults, "pve", "pve1", "unknown-node", "success")
|
|
if gotSuccessCount != 1 {
|
|
t.Errorf("node_poll_total{node=unknown-node,result=success} = %v, want 1", gotSuccessCount)
|
|
}
|
|
}
|
|
|
|
func TestRecordNodeResult_FailureStalenessWithPreviousSuccess(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newNodeTestPollMetrics(t)
|
|
|
|
// First, record a successful poll
|
|
firstEndTime := time.Now().Add(-10 * time.Second)
|
|
pm.RecordNodeResult(NodePollResult{
|
|
InstanceType: "pve",
|
|
InstanceName: "stale-test",
|
|
NodeName: "node1",
|
|
StartTime: firstEndTime.Add(-time.Second),
|
|
EndTime: firstEndTime,
|
|
Success: true,
|
|
})
|
|
|
|
// Now record a failed poll
|
|
secondEndTime := time.Now()
|
|
pm.RecordNodeResult(NodePollResult{
|
|
InstanceType: "pve",
|
|
InstanceName: "stale-test",
|
|
NodeName: "node1",
|
|
StartTime: secondEndTime.Add(-time.Second),
|
|
EndTime: secondEndTime,
|
|
Success: false,
|
|
Error: errors.New("failed"),
|
|
})
|
|
|
|
// Staleness should be ~10 seconds
|
|
gotStaleness := getGaugeVecValue(pm.nodeStaleness, "pve", "stale-test", "node1")
|
|
if gotStaleness < 9 || gotStaleness > 11 {
|
|
t.Errorf("nodeStaleness = %v, want ~10 seconds", gotStaleness)
|
|
}
|
|
}
|
|
|
|
func TestRecordNodeResult_FailureStalenessWithoutPreviousSuccess(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newNodeTestPollMetrics(t)
|
|
|
|
// Record a failure without any prior success
|
|
pm.RecordNodeResult(NodePollResult{
|
|
InstanceType: "pve",
|
|
InstanceName: "no-prior-success",
|
|
NodeName: "node1",
|
|
StartTime: time.Now().Add(-time.Second),
|
|
EndTime: time.Now(),
|
|
Success: false,
|
|
Error: errors.New("failed"),
|
|
})
|
|
|
|
// Staleness should be -1 (no prior success)
|
|
gotStaleness := getGaugeVecValue(pm.nodeStaleness, "pve", "no-prior-success", "node1")
|
|
if gotStaleness != -1 {
|
|
t.Errorf("nodeStaleness = %v, want -1 for no prior success", gotStaleness)
|
|
}
|
|
}
|
|
|
|
func TestRecordNodeResult_FailureNegativeStalenessClampedToZero(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newNodeTestPollMetrics(t)
|
|
|
|
// Record a success with a future timestamp
|
|
futureTime := time.Now().Add(10 * time.Second)
|
|
pm.RecordNodeResult(NodePollResult{
|
|
InstanceType: "pve",
|
|
InstanceName: "neg-stale",
|
|
NodeName: "node1",
|
|
StartTime: futureTime.Add(-time.Second),
|
|
EndTime: futureTime,
|
|
Success: true,
|
|
})
|
|
|
|
// Now record a failure with an EndTime BEFORE the last success
|
|
pastTime := futureTime.Add(-5 * time.Second)
|
|
pm.RecordNodeResult(NodePollResult{
|
|
InstanceType: "pve",
|
|
InstanceName: "neg-stale",
|
|
NodeName: "node1",
|
|
StartTime: pastTime.Add(-time.Second),
|
|
EndTime: pastTime,
|
|
Success: false,
|
|
Error: errors.New("failed"),
|
|
})
|
|
|
|
// Staleness should be clamped to 0, not negative
|
|
gotStaleness := getGaugeVecValue(pm.nodeStaleness, "pve", "neg-stale", "node1")
|
|
if gotStaleness != 0 {
|
|
t.Errorf("nodeStaleness = %v, want 0 for negative staleness calculation", gotStaleness)
|
|
}
|
|
}
|
|
|
|
// newQueueWaitTestPollMetrics creates a PollMetrics instance for RecordQueueWait testing.
|
|
func newQueueWaitTestPollMetrics(t *testing.T) *PollMetrics {
|
|
t.Helper()
|
|
|
|
reg := prometheus.NewRegistry()
|
|
|
|
pm := &PollMetrics{
|
|
schedulerQueueWait: prometheus.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "scheduler",
|
|
Name: "queue_wait_seconds",
|
|
Help: "Observed wait time between task readiness and execution.",
|
|
Buckets: []float64{0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60},
|
|
},
|
|
[]string{"instance_type"},
|
|
),
|
|
}
|
|
|
|
reg.MustRegister(pm.schedulerQueueWait)
|
|
|
|
return pm
|
|
}
|
|
|
|
func TestRecordQueueWait_NilPollMetrics(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
var pm *PollMetrics
|
|
// Should not panic
|
|
pm.RecordQueueWait("pve", 5*time.Second)
|
|
}
|
|
|
|
func TestRecordQueueWait_RecordsWaitTime(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newQueueWaitTestPollMetrics(t)
|
|
|
|
pm.RecordQueueWait("pve", 2*time.Second)
|
|
|
|
sampleCount := getHistogramSampleCount(pm.schedulerQueueWait, "pve")
|
|
if sampleCount != 1 {
|
|
t.Errorf("queue_wait count = %v, want 1", sampleCount)
|
|
}
|
|
|
|
sampleSum := getHistogramSampleSum(pm.schedulerQueueWait, "pve")
|
|
if sampleSum < 1.9 || sampleSum > 2.1 {
|
|
t.Errorf("queue_wait sum = %v, want ~2.0 seconds", sampleSum)
|
|
}
|
|
}
|
|
|
|
func TestRecordQueueWait_NegativeWaitClampedToZero(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newQueueWaitTestPollMetrics(t)
|
|
|
|
pm.RecordQueueWait("pve", -5*time.Second)
|
|
|
|
sampleSum := getHistogramSampleSum(pm.schedulerQueueWait, "pve")
|
|
if sampleSum != 0 {
|
|
t.Errorf("queue_wait sum = %v, want 0 for negative wait", sampleSum)
|
|
}
|
|
}
|
|
|
|
func TestRecordQueueWait_EmptyTypeNormalized(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newQueueWaitTestPollMetrics(t)
|
|
|
|
pm.RecordQueueWait("", time.Second)
|
|
|
|
// Empty label should normalize to "unknown"
|
|
sampleCount := getHistogramSampleCount(pm.schedulerQueueWait, "unknown")
|
|
if sampleCount != 1 {
|
|
t.Errorf("queue_wait{unknown} count = %v, want 1", sampleCount)
|
|
}
|
|
}
|
|
|
|
func TestSetQueueDepth_NilPollMetrics(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
var pm *PollMetrics
|
|
// Should not panic
|
|
pm.SetQueueDepth(10)
|
|
}
|
|
|
|
func TestSetQueueDepth_SetsGauge(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newInFlightTestPollMetrics(t)
|
|
|
|
pm.SetQueueDepth(42)
|
|
|
|
got := getGaugeValue(pm.queueDepth)
|
|
if got != 42 {
|
|
t.Errorf("queueDepth = %v, want 42", got)
|
|
}
|
|
}
|
|
|
|
func TestSetQueueDepth_NegativeClampedToZero(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newInFlightTestPollMetrics(t)
|
|
|
|
pm.SetQueueDepth(-10)
|
|
|
|
got := getGaugeValue(pm.queueDepth)
|
|
if got != 0 {
|
|
t.Errorf("queueDepth = %v, want 0 for negative input", got)
|
|
}
|
|
}
|
|
|
|
func TestSetQueueDepth_ZeroWorks(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
pm := newInFlightTestPollMetrics(t)
|
|
|
|
// First set to positive
|
|
pm.SetQueueDepth(10)
|
|
|
|
// Then set to zero
|
|
pm.SetQueueDepth(0)
|
|
|
|
got := getGaugeValue(pm.queueDepth)
|
|
if got != 0 {
|
|
t.Errorf("queueDepth = %v, want 0", got)
|
|
}
|
|
}
|