// Pulse/internal/monitoring/metrics_test.go

package monitoring

import (
	"errors"
	"testing"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	dto "github.com/prometheus/client_model/go"
	internalerrors "github.com/rcourtman/pulse-go-rewrite/internal/errors"
)

// newTestPollMetrics builds a PollMetrics that carries only the scheduler
// gauges and the two key-tracking maps. The gauges are registered on a
// registry created inside this helper, so each call gets fresh collectors.
func newTestPollMetrics(t *testing.T) *PollMetrics {
	t.Helper()

	// schedulerOpts fills in the namespace and subsystem shared by every gauge.
	schedulerOpts := func(name, help string) prometheus.GaugeOpts {
		return prometheus.GaugeOpts{
			Namespace: "pulse",
			Subsystem: "scheduler",
			Name:      name,
			Help:      help,
		}
	}

	queueReady := prometheus.NewGauge(
		schedulerOpts("queue_due_soon", "Number of tasks due to run within the immediate window."),
	)
	queueDepthByType := prometheus.NewGaugeVec(
		schedulerOpts("queue_depth", "Current scheduler queue depth partitioned by instance type."),
		[]string{"instance_type"},
	)
	deadLetterDepth := prometheus.NewGaugeVec(
		schedulerOpts("dead_letter_depth", "Number of tasks currently parked in the dead-letter queue per instance."),
		[]string{"instance_type", "instance"},
	)
	breakerState := prometheus.NewGaugeVec(
		schedulerOpts("breaker_state", "Circuit breaker state encoded as 0=closed, 1=half-open, 2=open, -1=unknown."),
		[]string{"instance_type", "instance"},
	)
	breakerFailureCount := prometheus.NewGaugeVec(
		schedulerOpts("breaker_failure_count", "Current consecutive failure count tracked by the circuit breaker."),
		[]string{"instance_type", "instance"},
	)
	breakerRetrySeconds := prometheus.NewGaugeVec(
		schedulerOpts("breaker_retry_seconds", "Seconds until the circuit breaker will allow another attempt."),
		[]string{"instance_type", "instance"},
	)

	pm := &PollMetrics{
		schedulerQueueReady:          queueReady,
		schedulerQueueDepthByType:    queueDepthByType,
		schedulerDeadLetterDepth:     deadLetterDepth,
		schedulerBreakerState:        breakerState,
		schedulerBreakerFailureCount: breakerFailureCount,
		schedulerBreakerRetrySeconds: breakerRetrySeconds,
		lastQueueTypeKeys:            make(map[string]struct{}),
		lastDLQKeys:                  make(map[string]struct{}),
	}

	// Register in the same order the fields are declared above.
	registry := prometheus.NewRegistry()
	for _, collector := range []prometheus.Collector{
		queueReady,
		queueDepthByType,
		deadLetterDepth,
		breakerState,
		breakerFailureCount,
		breakerRetrySeconds,
	} {
		registry.MustRegister(collector)
	}

	return pm
}

// getGaugeValue returns the current value of a prometheus gauge.
//
// It panics when the gauge cannot be written into a dto.Metric. Returning 0
// in that case would be indistinguishable from a gauge that is legitimately
// 0, letting "want 0" assertions pass without the metric ever being read.
func getGaugeValue(g prometheus.Gauge) float64 {
	m := &dto.Metric{}
	if err := g.Write(m); err != nil {
		panic("getGaugeValue: writing gauge: " + err.Error())
	}
	return m.GetGauge().GetValue()
}

// getGaugeVecValue returns the value stored in a GaugeVec for the given label
// values, which must be supplied in the order the vector declares its labels.
//
// It panics when the label values cannot be resolved (for example, the wrong
// number of labels) or when the gauge cannot be written. Returning 0 in those
// cases would let assertions that expect a cleared metric pass vacuously.
func getGaugeVecValue(gv *prometheus.GaugeVec, labels ...string) float64 {
	gauge, err := gv.GetMetricWithLabelValues(labels...)
	if err != nil {
		panic("getGaugeVecValue: resolving label values: " + err.Error())
	}

	m := &dto.Metric{}
	if err := gauge.Write(m); err != nil {
		panic("getGaugeVecValue: writing gauge: " + err.Error())
	}
	return m.GetGauge().GetValue()
}

func TestUpdateQueueSnapshot_NilPollMetrics(t *testing.T) {
	t.Parallel()

	var pm *PollMetrics
	// Should not panic
	pm.UpdateQueueSnapshot(QueueSnapshot{
		DueWithinSeconds: 5,
		PerType:          map[string]int{"pve": 10},
	})
}

// TestUpdateQueueSnapshot_SetsDueWithinSeconds asserts that the snapshot's
// DueWithinSeconds value is published on the schedulerQueueReady gauge.
func TestUpdateQueueSnapshot_SetsDueWithinSeconds(t *testing.T) {
	t.Parallel()

	metrics := newTestPollMetrics(t)
	snapshot := QueueSnapshot{
		DueWithinSeconds: 42,
		PerType:          map[string]int{},
	}
	metrics.UpdateQueueSnapshot(snapshot)

	if ready := getGaugeValue(metrics.schedulerQueueReady); ready != 42 {
		t.Fatalf("schedulerQueueReady = %v, want 42", ready)
	}
}

// TestUpdateQueueSnapshot_UpdatesPerTypeQueueDepth asserts that every entry of
// PerType is reflected in the queue_depth gauge for that instance type.
func TestUpdateQueueSnapshot_UpdatesPerTypeQueueDepth(t *testing.T) {
	t.Parallel()

	type expectation struct {
		instanceType string
		want         float64
	}
	expected := []expectation{
		{instanceType: "pve", want: 10},
		{instanceType: "pbs", want: 5},
		{instanceType: "pmg", want: 3},
	}

	metrics := newTestPollMetrics(t)
	metrics.UpdateQueueSnapshot(QueueSnapshot{
		DueWithinSeconds: 0,
		PerType: map[string]int{
			"pve": 10,
			"pbs": 5,
			"pmg": 3,
		},
	})

	for i := range expected {
		exp := expected[i]
		got := getGaugeVecValue(metrics.schedulerQueueDepthByType, exp.instanceType)
		if got == exp.want {
			continue
		}
		t.Errorf("queue_depth{instance_type=%q} = %v, want %v", exp.instanceType, got, exp.want)
	}
}

// TestUpdateQueueSnapshot_ClearsStaleTypeKeys asserts that an instance type
// present in one snapshot but missing from the next reads back as 0.
func TestUpdateQueueSnapshot_ClearsStaleTypeKeys(t *testing.T) {
	t.Parallel()

	metrics := newTestPollMetrics(t)
	depthOf := func(instanceType string) float64 {
		return getGaugeVecValue(metrics.schedulerQueueDepthByType, instanceType)
	}

	// Seed the gauge with two instance types.
	metrics.UpdateQueueSnapshot(QueueSnapshot{
		DueWithinSeconds: 0,
		PerType:          map[string]int{"pve": 10, "pbs": 5},
	})

	// Both seeded values must be visible before going further.
	pve := depthOf("pve")
	if pve != 10 {
		t.Fatalf("initial pve = %v, want 10", pve)
	}
	pbs := depthOf("pbs")
	if pbs != 5 {
		t.Fatalf("initial pbs = %v, want 5", pbs)
	}

	// The follow-up snapshot drops pbs entirely.
	metrics.UpdateQueueSnapshot(QueueSnapshot{
		DueWithinSeconds: 0,
		PerType:          map[string]int{"pve": 8},
	})

	pve = depthOf("pve")
	if pve != 8 {
		t.Errorf("updated pve = %v, want 8", pve)
	}
	pbs = depthOf("pbs")
	if pbs != 0 {
		t.Errorf("stale pbs should be 0, got %v", pbs)
	}
}

// TestUpdateQueueSnapshot_EmptySnapshot asserts that an empty snapshot applied
// after a populated one leaves both the ready gauge and the per-type gauge at 0.
func TestUpdateQueueSnapshot_EmptySnapshot(t *testing.T) {
	t.Parallel()

	metrics := newTestPollMetrics(t)

	populated := QueueSnapshot{
		DueWithinSeconds: 10,
		PerType:          map[string]int{"pve": 5},
	}
	empty := QueueSnapshot{
		DueWithinSeconds: 0,
		PerType:          map[string]int{},
	}

	// Apply data first, then wipe it with the empty snapshot.
	metrics.UpdateQueueSnapshot(populated)
	metrics.UpdateQueueSnapshot(empty)

	ready := getGaugeValue(metrics.schedulerQueueReady)
	if ready != 0 {
		t.Errorf("schedulerQueueReady = %v, want 0", ready)
	}
	pve := getGaugeVecValue(metrics.schedulerQueueDepthByType, "pve")
	if pve != 0 {
		t.Errorf("pve should be cleared to 0, got %v", pve)
	}
}

func TestUpdateDeadLetterCounts_NilPollMetrics(t *testing.T) {
	t.Parallel()

	var pm *PollMetrics
	// Should not panic
	pm.UpdateDeadLetterCounts([]DeadLetterTask{
		{Type: "pve", Instance: "pve1"},
	})
}

// TestUpdateDeadLetterCounts_EmptyClearsPrevious asserts that passing an empty
// task slice resets dead-letter depths that an earlier call had set.
func TestUpdateDeadLetterCounts_EmptyClearsPrevious(t *testing.T) {
	t.Parallel()

	metrics := newTestPollMetrics(t)
	depthOf := func(instanceType, instance string) float64 {
		return getGaugeVecValue(metrics.schedulerDeadLetterDepth, instanceType, instance)
	}

	// Park one task for each of two instances.
	metrics.UpdateDeadLetterCounts([]DeadLetterTask{
		{Type: "pve", Instance: "pve1"},
		{Type: "pbs", Instance: "pbs1"},
	})

	// Abort early if the seed values did not land.
	pve := depthOf("pve", "pve1")
	if pve != 1 {
		t.Fatalf("initial pve/pve1 = %v, want 1", pve)
	}
	pbs := depthOf("pbs", "pbs1")
	if pbs != 1 {
		t.Fatalf("initial pbs/pbs1 = %v, want 1", pbs)
	}

	// An empty update must clear both.
	metrics.UpdateDeadLetterCounts([]DeadLetterTask{})

	pve = depthOf("pve", "pve1")
	if pve != 0 {
		t.Errorf("pve/pve1 should be cleared to 0, got %v", pve)
	}
	pbs = depthOf("pbs", "pbs1")
	if pbs != 0 {
		t.Errorf("pbs/pbs1 should be cleared to 0, got %v", pbs)
	}
}

// TestUpdateDeadLetterCounts_SingleTask asserts that one dead-letter task
// yields a depth of 1 for its type/instance pair.
func TestUpdateDeadLetterCounts_SingleTask(t *testing.T) {
	t.Parallel()

	metrics := newTestPollMetrics(t)
	tasks := []DeadLetterTask{
		{Type: "pve", Instance: "my-pve-instance"},
	}
	metrics.UpdateDeadLetterCounts(tasks)

	if depth := getGaugeVecValue(metrics.schedulerDeadLetterDepth, "pve", "my-pve-instance"); depth != 1 {
		t.Fatalf("dead_letter_depth{pve,my-pve-instance} = %v, want 1", depth)
	}
}

// TestUpdateDeadLetterCounts_AggregatesSameTypeInstance asserts that tasks
// sharing a type/instance pair are counted together in a single gauge.
func TestUpdateDeadLetterCounts_AggregatesSameTypeInstance(t *testing.T) {
	t.Parallel()

	metrics := newTestPollMetrics(t)

	// Three tasks for pve/pve1 followed by two for pbs/pbs1.
	tasks := []DeadLetterTask{
		{Type: "pve", Instance: "pve1"},
		{Type: "pve", Instance: "pve1"},
		{Type: "pve", Instance: "pve1"},
		{Type: "pbs", Instance: "pbs1"},
		{Type: "pbs", Instance: "pbs1"},
	}
	metrics.UpdateDeadLetterCounts(tasks)

	pve := getGaugeVecValue(metrics.schedulerDeadLetterDepth, "pve", "pve1")
	if pve != 3 {
		t.Errorf("pve/pve1 = %v, want 3", pve)
	}
	pbs := getGaugeVecValue(metrics.schedulerDeadLetterDepth, "pbs", "pbs1")
	if pbs != 2 {
		t.Errorf("pbs/pbs1 = %v, want 2", pbs)
	}
}

// TestUpdateDeadLetterCounts_ClearsStaleKeys asserts that a type/instance pair
// absent from a later update reads back as 0 while remaining pairs are updated.
func TestUpdateDeadLetterCounts_ClearsStaleKeys(t *testing.T) {
	t.Parallel()

	metrics := newTestPollMetrics(t)
	depthOf := func(instanceType, instance string) float64 {
		return getGaugeVecValue(metrics.schedulerDeadLetterDepth, instanceType, instance)
	}

	// Initial state: one task each for pve1 and pbs1.
	metrics.UpdateDeadLetterCounts([]DeadLetterTask{
		{Type: "pve", Instance: "pve1"},
		{Type: "pbs", Instance: "pbs1"},
	})

	pve := depthOf("pve", "pve1")
	if pve != 1 {
		t.Fatalf("initial pve/pve1 = %v, want 1", pve)
	}
	pbs := depthOf("pbs", "pbs1")
	if pbs != 1 {
		t.Fatalf("initial pbs/pbs1 = %v, want 1", pbs)
	}

	// Follow-up: pbs1 disappears and pve1 now has two tasks.
	metrics.UpdateDeadLetterCounts([]DeadLetterTask{
		{Type: "pve", Instance: "pve1"},
		{Type: "pve", Instance: "pve1"},
	})

	pve = depthOf("pve", "pve1")
	if pve != 2 {
		t.Errorf("updated pve/pve1 = %v, want 2", pve)
	}
	pbs = depthOf("pbs", "pbs1")
	if pbs != 0 {
		t.Errorf("stale pbs/pbs1 should be 0, got %v", pbs)
	}
}

// TestUpdateDeadLetterCounts_NormalizesEmptyLabels asserts that tasks with
// empty or whitespace-only Type and Instance are counted under
// "unknown"/"unknown".
func TestUpdateDeadLetterCounts_NormalizesEmptyLabels(t *testing.T) {
	t.Parallel()

	metrics := newTestPollMetrics(t)

	tasks := []DeadLetterTask{
		{Type: "", Instance: ""},     // completely empty
		{Type: "  ", Instance: "  "}, // whitespace only
	}
	metrics.UpdateDeadLetterCounts(tasks)

	if count := getGaugeVecValue(metrics.schedulerDeadLetterDepth, "unknown", "unknown"); count != 2 {
		t.Fatalf("empty labels should normalize to unknown, got count %v, want 2", count)
	}
}

// TestUpdateDeadLetterCounts_MultipleInstancesSameType asserts that distinct
// instances of one type are tracked in separate gauges, each with depth 1.
func TestUpdateDeadLetterCounts_MultipleInstancesSameType(t *testing.T) {
	t.Parallel()

	metrics := newTestPollMetrics(t)
	depthOf := func(instance string) float64 {
		return getGaugeVecValue(metrics.schedulerDeadLetterDepth, "pve", instance)
	}

	metrics.UpdateDeadLetterCounts([]DeadLetterTask{
		{Type: "pve", Instance: "pve1"},
		{Type: "pve", Instance: "pve2"},
		{Type: "pve", Instance: "pve3"},
	})

	first := depthOf("pve1")
	if first != 1 {
		t.Errorf("pve/pve1 = %v, want 1", first)
	}
	second := depthOf("pve2")
	if second != 1 {
		t.Errorf("pve/pve2 = %v, want 1", second)
	}
	third := depthOf("pve3")
	if third != 1 {
		t.Errorf("pve/pve3 = %v, want 1", third)
	}
}

// TestSetBreakerState_NilPollMetrics calls SetBreakerState on a nil
// *PollMetrics; the test passes as long as the call does not panic.
func TestSetBreakerState_NilPollMetrics(t *testing.T) {
	t.Parallel()

	retryAt := time.Now().Add(time.Minute)

	var metrics *PollMetrics
	metrics.SetBreakerState("pve", "pve1", "open", 5, retryAt) // must not panic on a nil receiver
}

// TestSetBreakerState_ZeroRetryAt asserts that a zero-value retry time is
// reported as 0 retry seconds.
func TestSetBreakerState_ZeroRetryAt(t *testing.T) {
	t.Parallel()

	metrics := newTestPollMetrics(t)

	var noRetry time.Time // zero value
	metrics.SetBreakerState("pve", "pve1", "closed", 0, noRetry)

	if seconds := getGaugeVecValue(metrics.schedulerBreakerRetrySeconds, "pve", "pve1"); seconds != 0 {
		t.Fatalf("retrySeconds = %v, want 0 for zero retryAt", seconds)
	}
}

// TestSetBreakerState_FutureRetryAt asserts that a retry time 60 seconds ahead
// is reported as roughly 60 retry seconds.
func TestSetBreakerState_FutureRetryAt(t *testing.T) {
	t.Parallel()

	metrics := newTestPollMetrics(t)

	// One minute from now.
	metrics.SetBreakerState("pve", "pve1", "open", 3, time.Now().Add(60*time.Second))

	seconds := getGaugeVecValue(metrics.schedulerBreakerRetrySeconds, "pve", "pve1")
	// A 5 second window on either side absorbs test execution time.
	tooSmall := seconds < 55
	tooLarge := seconds > 65
	if tooSmall || tooLarge {
		t.Fatalf("retrySeconds = %v, want ~60 for future retryAt", seconds)
	}
}

// TestSetBreakerState_PastRetryAtClampsToZero asserts that a retry time that
// has already elapsed is reported as 0 retry seconds rather than a negative
// value.
func TestSetBreakerState_PastRetryAtClampsToZero(t *testing.T) {
	t.Parallel()

	metrics := newTestPollMetrics(t)

	// One minute ago, i.e. already expired.
	expired := time.Now().Add(-60 * time.Second)
	metrics.SetBreakerState("pve", "pve1", "half_open", 1, expired)

	if seconds := getGaugeVecValue(metrics.schedulerBreakerRetrySeconds, "pve", "pve1"); seconds != 0 {
		t.Fatalf("retrySeconds = %v, want 0 for past retryAt", seconds)
	}
}

// TestSetBreakerState_StateConversion checks the numeric encoding written to
// the breaker_state gauge for a range of state strings, including mixed case,
// both half-open spellings, and unrecognized values.
func TestSetBreakerState_StateConversion(t *testing.T) {
	t.Parallel()

	type conversion struct {
		state string
		want  float64
	}
	table := []conversion{
		// closed -> 0
		{state: "closed", want: 0},
		{state: "CLOSED", want: 0},
		{state: "Closed", want: 0},
		// half-open -> 1
		{state: "half_open", want: 1},
		{state: "half-open", want: 1},
		{state: "HALF_OPEN", want: 1},
		// open -> 2
		{state: "open", want: 2},
		{state: "OPEN", want: 2},
		{state: "Open", want: 2},
		// anything else -> -1
		{state: "unknown_state", want: -1},
		{state: "", want: -1},
		{state: "invalid", want: -1},
	}

	for i := range table {
		entry := table[i]
		t.Run(entry.state, func(t *testing.T) {
			// Each subtest uses its own metrics so gauge values never leak between cases.
			metrics := newTestPollMetrics(t)
			metrics.SetBreakerState("pve", "pve1", entry.state, 0, time.Time{})

			if encoded := getGaugeVecValue(metrics.schedulerBreakerState, "pve", "pve1"); encoded != entry.want {
				t.Errorf("breakerState for %q = %v, want %v", entry.state, encoded, entry.want)
			}
		})
	}
}

// TestSetBreakerState_FailureCount asserts that the failure count passed to
// SetBreakerState is published on the breaker_failure_count gauge.
func TestSetBreakerState_FailureCount(t *testing.T) {
	t.Parallel()

	metrics := newTestPollMetrics(t)

	var noRetry time.Time
	metrics.SetBreakerState("pve", "pve1", "open", 7, noRetry)

	if failures := getGaugeVecValue(metrics.schedulerBreakerFailureCount, "pve", "pve1"); failures != 7 {
		t.Fatalf("failureCount = %v, want 7", failures)
	}
}

// TestSetBreakerState_EmptyLabelsSanitized asserts that empty instance type
// and instance labels are recorded under "unknown"/"unknown" for both the
// state gauge and the failure count gauge.
func TestSetBreakerState_EmptyLabelsSanitized(t *testing.T) {
	t.Parallel()

	const sanitized = "unknown"

	metrics := newTestPollMetrics(t)
	metrics.SetBreakerState("", "", "open", 5, time.Time{})

	// State and failure count should both be found under the sanitized labels.
	state := getGaugeVecValue(metrics.schedulerBreakerState, sanitized, sanitized)
	if state != 2 {
		t.Errorf("breaker_state{unknown,unknown} = %v, want 2 (open)", state)
	}

	failures := getGaugeVecValue(metrics.schedulerBreakerFailureCount, sanitized, sanitized)
	if failures != 5 {
		t.Errorf("breaker_failure_count{unknown,unknown} = %v, want 5", failures)
	}
}

// TestSetBreakerState_WhitespaceOnlyLabelsSanitized asserts that labels made
// up solely of spaces are recorded under "unknown"/"unknown".
func TestSetBreakerState_WhitespaceOnlyLabelsSanitized(t *testing.T) {
	t.Parallel()

	metrics := newTestPollMetrics(t)

	var noRetry time.Time
	metrics.SetBreakerState("  ", "   ", "closed", 0, noRetry)

	if state := getGaugeVecValue(metrics.schedulerBreakerState, "unknown", "unknown"); state != 0 {
		t.Errorf("breaker_state{unknown,unknown} = %v, want 0 (closed)", state)
	}
}

// newFullTestPollMetrics builds a PollMetrics carrying the instance-level poll
// collectors (duration, results, errors, last success, staleness, queue depth)
// plus the lastSuccessByKey map. All collectors are registered on a registry
// created inside this helper.
func newFullTestPollMetrics(t *testing.T) *PollMetrics {
	t.Helper()

	const (
		namespace = "pulse"
		subsystem = "monitor"
	)

	duration := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "poll_duration_seconds",
			Help:      "Duration of polling operations per instance.",
			Buckets:   []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10, 15, 20, 30},
		},
		[]string{"instance_type", "instance"},
	)
	results := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "poll_total",
			Help:      "Total polling attempts partitioned by result.",
		},
		[]string{"instance_type", "instance", "result"},
	)
	failures := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "poll_errors_total",
			Help:      "Polling failures grouped by error type.",
		},
		[]string{"instance_type", "instance", "error_type"},
	)
	lastSuccess := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "poll_last_success_timestamp",
			Help:      "Unix timestamp of the last successful poll.",
		},
		[]string{"instance_type", "instance"},
	)
	staleness := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "poll_staleness_seconds",
			Help:      "Seconds since the last successful poll.",
		},
		[]string{"instance_type", "instance"},
	)
	queueDepth := prometheus.NewGauge(
		prometheus.GaugeOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "poll_queue_depth",
			Help:      "Approximate number of poll tasks waiting.",
		},
	)

	pm := &PollMetrics{
		pollDuration:     duration,
		pollResults:      results,
		pollErrors:       failures,
		lastSuccess:      lastSuccess,
		staleness:        staleness,
		queueDepth:       queueDepth,
		lastSuccessByKey: make(map[metricKey]time.Time),
	}

	registry := prometheus.NewRegistry()
	for _, collector := range []prometheus.Collector{
		duration,
		results,
		failures,
		lastSuccess,
		staleness,
		queueDepth,
	} {
		registry.MustRegister(collector)
	}

	return pm
}

// getCounterVecValue returns the value stored in a CounterVec for the given
// label values, which must be supplied in the order the vector declares its
// labels.
//
// It panics when the label values cannot be resolved or when the counter
// cannot be written, so a malformed lookup fails the test loudly instead of
// being reported as a counter value of 0.
func getCounterVecValue(cv *prometheus.CounterVec, labels ...string) float64 {
	counter, err := cv.GetMetricWithLabelValues(labels...)
	if err != nil {
		panic("getCounterVecValue: resolving label values: " + err.Error())
	}

	m := &dto.Metric{}
	if err := counter.Write(m); err != nil {
		panic("getCounterVecValue: writing counter: " + err.Error())
	}
	return m.GetCounter().GetValue()
}

// getHistogramSampleCount returns the number of observations recorded in a
// HistogramVec for the given label values, which must be supplied in the
// order the vector declares its labels.
//
// It panics with a descriptive message when the label values cannot be
// resolved, when the returned observer does not implement prometheus.Metric,
// or when the histogram cannot be written. Reporting those failures as a
// sample count of 0 would hide a broken lookup from the calling test.
func getHistogramSampleCount(hv *prometheus.HistogramVec, labels ...string) uint64 {
	obs, err := hv.GetMetricWithLabelValues(labels...)
	if err != nil {
		panic("getHistogramSampleCount: resolving label values: " + err.Error())
	}

	// The vec hands back an Observer; Write is only available via Metric.
	metric, ok := obs.(prometheus.Metric)
	if !ok {
		panic("getHistogramSampleCount: observer does not implement prometheus.Metric")
	}

	m := &dto.Metric{}
	if err := metric.Write(m); err != nil {
		panic("getHistogramSampleCount: writing histogram: " + err.Error())
	}
	return m.GetHistogram().GetSampleCount()
}

// getHistogramSampleSum returns the sum of all observations recorded in a
// HistogramVec for the given label values, which must be supplied in the
// order the vector declares its labels.
//
// It panics with a descriptive message when the label values cannot be
// resolved, when the returned observer does not implement prometheus.Metric,
// or when the histogram cannot be written. Reporting those failures as a sum
// of 0 would let assertions that expect 0 pass without reading the histogram.
func getHistogramSampleSum(hv *prometheus.HistogramVec, labels ...string) float64 {
	obs, err := hv.GetMetricWithLabelValues(labels...)
	if err != nil {
		panic("getHistogramSampleSum: resolving label values: " + err.Error())
	}

	// The vec hands back an Observer; Write is only available via Metric.
	metric, ok := obs.(prometheus.Metric)
	if !ok {
		panic("getHistogramSampleSum: observer does not implement prometheus.Metric")
	}

	m := &dto.Metric{}
	if err := metric.Write(m); err != nil {
		panic("getHistogramSampleSum: writing histogram: " + err.Error())
	}
	return m.GetHistogram().GetSampleSum()
}

func TestRecordResult_NilPollMetrics(t *testing.T) {
	t.Parallel()

	var pm *PollMetrics
	// Should not panic
	pm.RecordResult(PollResult{
		InstanceType: "pve",
		InstanceName: "pve1",
		StartTime:    time.Now(),
		EndTime:      time.Now().Add(time.Second),
		Success:      true,
	})
}

// TestRecordResult_SuccessUpdatesLastSuccessAndStaleness records a successful
// poll and asserts the last-success gauge, the staleness gauge, the success
// counter and the stored last-success timestamp.
func TestRecordResult_SuccessUpdatesLastSuccessAndStaleness(t *testing.T) {
	t.Parallel()

	metrics := newFullTestPollMetrics(t)

	finished := time.Now()
	result := PollResult{
		InstanceType: "pve",
		InstanceName: "my-instance",
		StartTime:    finished.Add(-500 * time.Millisecond),
		EndTime:      finished,
		Success:      true,
	}
	metrics.RecordResult(result)

	// The last-success gauge should carry EndTime as a Unix timestamp.
	wantUnix := float64(finished.Unix())
	if gotUnix := getGaugeVecValue(metrics.lastSuccess, "pve", "my-instance"); gotUnix != wantUnix {
		t.Errorf("lastSuccess = %v, want %v", gotUnix, wantUnix)
	}

	// A successful poll resets staleness.
	if stale := getGaugeVecValue(metrics.staleness, "pve", "my-instance"); stale != 0 {
		t.Errorf("staleness = %v, want 0 on success", stale)
	}

	// Exactly one success has been counted.
	if successes := getCounterVecValue(metrics.pollResults, "pve", "my-instance", "success"); successes != 1 {
		t.Errorf("poll_total{result=success} = %v, want 1", successes)
	}

	// The timestamp must also be retrievable through lastSuccessFor.
	stored, found := metrics.lastSuccessFor("pve", "my-instance")
	if !found {
		t.Fatal("lastSuccessFor returned false, expected true")
	}
	if !stored.Equal(finished) {
		t.Errorf("stored lastSuccess = %v, want %v", stored, finished)
	}
}

// TestRecordResult_FailureIncrementsErrorCounter records a failed poll that
// carries a connection-type MonitorError and asserts that both the
// "connection" error counter and the "error" result counter are 1.
func TestRecordResult_FailureIncrementsErrorCounter(t *testing.T) {
	t.Parallel()

	metrics := newFullTestPollMetrics(t)

	// A MonitorError carries an explicit error type for classification.
	cause := errors.New("connection refused")
	pollErr := internalerrors.NewMonitorError(
		internalerrors.ErrorTypeConnection,
		"poll_nodes",
		"pve1",
		cause,
	)

	failed := PollResult{
		InstanceType: "pve",
		InstanceName: "pve1",
		StartTime:    time.Now().Add(-time.Second),
		EndTime:      time.Now(),
		Success:      false,
		Error:        pollErr,
	}
	metrics.RecordResult(failed)

	// Error counter keyed by the classified type.
	if classified := getCounterVecValue(metrics.pollErrors, "pve", "pve1", "connection"); classified != 1 {
		t.Errorf("poll_errors_total{error_type=connection} = %v, want 1", classified)
	}

	// Result counter keyed by "error".
	if errored := getCounterVecValue(metrics.pollResults, "pve", "pve1", "error"); errored != 1 {
		t.Errorf("poll_total{result=error} = %v, want 1", errored)
	}
}

// TestRecordResult_FailureWithUnknownErrorType records a failed poll whose
// error is a plain errors.New value and asserts it is counted under the
// "unknown" error type.
func TestRecordResult_FailureWithUnknownErrorType(t *testing.T) {
	t.Parallel()

	metrics := newFullTestPollMetrics(t)

	// Not a MonitorError, so no explicit error type is attached.
	failed := PollResult{
		InstanceType: "pbs",
		InstanceName: "pbs1",
		StartTime:    time.Now().Add(-time.Second),
		EndTime:      time.Now(),
		Success:      false,
		Error:        errors.New("some random error"),
	}
	metrics.RecordResult(failed)

	if count := getCounterVecValue(metrics.pollErrors, "pbs", "pbs1", "unknown"); count != 1 {
		t.Errorf("poll_errors_total{error_type=unknown} = %v, want 1", count)
	}
}

// TestRecordResult_FailureWithNilErrorClassifiesAsNone records a failed poll
// with a nil Error and asserts it is counted under the "none" error type.
func TestRecordResult_FailureWithNilErrorClassifiesAsNone(t *testing.T) {
	t.Parallel()

	metrics := newFullTestPollMetrics(t)

	failed := PollResult{
		InstanceType: "pmg",
		InstanceName: "pmg1",
		StartTime:    time.Now().Add(-time.Second),
		EndTime:      time.Now(),
		Success:      false,
		Error:        nil,
	}
	metrics.RecordResult(failed)

	if count := getCounterVecValue(metrics.pollErrors, "pmg", "pmg1", "none"); count != 1 {
		t.Errorf("poll_errors_total{error_type=none} = %v, want 1", count)
	}
}

// TestRecordResult_NegativeDurationClampedToZero records a poll whose
// StartTime is after its EndTime and asserts the duration histogram holds one
// sample with a sum of 0.
func TestRecordResult_NegativeDurationClampedToZero(t *testing.T) {
	t.Parallel()

	metrics := newFullTestPollMetrics(t)

	finished := time.Now()
	result := PollResult{
		InstanceType: "pve",
		InstanceName: "neg-test",
		StartTime:    finished.Add(time.Second), // one second after EndTime
		EndTime:      finished,
		Success:      true,
	}
	metrics.RecordResult(result)

	// The observed value must be 0 rather than -1.
	if sum := getHistogramSampleSum(metrics.pollDuration, "pve", "neg-test"); sum != 0 {
		t.Errorf("poll_duration sum = %v, want 0 for negative duration", sum)
	}

	if samples := getHistogramSampleCount(metrics.pollDuration, "pve", "neg-test"); samples != 1 {
		t.Errorf("poll_duration count = %v, want 1", samples)
	}
}

// TestRecordResult_LabelsSanitized records a successful poll with an empty
// instance type and a whitespace-only instance name, and asserts the success
// is counted under "unknown"/"unknown".
func TestRecordResult_LabelsSanitized(t *testing.T) {
	t.Parallel()

	metrics := newFullTestPollMetrics(t)

	result := PollResult{
		InstanceType: "",   // expected to be reported as "unknown"
		InstanceName: "  ", // expected to be reported as "unknown"
		StartTime:    time.Now().Add(-time.Second),
		EndTime:      time.Now(),
		Success:      true,
	}
	metrics.RecordResult(result)

	// The counter must be reachable through the sanitized label values.
	if successes := getCounterVecValue(metrics.pollResults, "unknown", "unknown", "success"); successes != 1 {
		t.Errorf("poll_total{unknown,unknown,success} = %v, want 1", successes)
	}
}

// TestRecordResult_DecrementsPending sets the pending count to 5, records one
// result, and asserts that both the pending field and the queueDepth gauge
// read 4 afterwards.
func TestRecordResult_DecrementsPending(t *testing.T) {
	t.Parallel()

	metrics := newFullTestPollMetrics(t)

	// Start from a known pending count.
	metrics.ResetQueueDepth(5)

	// Recording any result should consume one pending slot.
	result := PollResult{
		InstanceType: "pve",
		InstanceName: "pve1",
		StartTime:    time.Now().Add(-time.Second),
		EndTime:      time.Now(),
		Success:      true,
	}
	metrics.RecordResult(result)

	// Read the internal counter under its lock.
	metrics.mu.RLock()
	pendingNow := metrics.pending
	metrics.mu.RUnlock()

	if pendingNow != 4 {
		t.Errorf("pending = %v, want 4 after decrement from 5", pendingNow)
	}

	// The exported gauge must agree with the internal counter.
	if depth := getGaugeValue(metrics.queueDepth); depth != 4 {
		t.Errorf("queueDepth gauge = %v, want 4", depth)
	}
}

// TestRecordResult_FailureStalenessWithPreviousSuccess records a success that
// ended about 10 seconds ago followed by a failure ending now, and asserts
// the staleness gauge reads roughly 10 seconds.
func TestRecordResult_FailureStalenessWithPreviousSuccess(t *testing.T) {
	t.Parallel()

	metrics := newFullTestPollMetrics(t)

	// Step 1: a success whose EndTime lies ten seconds in the past.
	successEnd := time.Now().Add(-10 * time.Second)
	success := PollResult{
		InstanceType: "pve",
		InstanceName: "stale-test",
		StartTime:    successEnd.Add(-time.Second),
		EndTime:      successEnd,
		Success:      true,
	}
	metrics.RecordResult(success)

	// Step 2: a failure that ends now.
	failureEnd := time.Now()
	failure := PollResult{
		InstanceType: "pve",
		InstanceName: "stale-test",
		StartTime:    failureEnd.Add(-time.Second),
		EndTime:      failureEnd,
		Success:      false,
		Error:        errors.New("failed"),
	}
	metrics.RecordResult(failure)

	// Allow one second of slack on either side of the expected 10 seconds.
	stale := getGaugeVecValue(metrics.staleness, "pve", "stale-test")
	tooSmall := stale < 9
	tooLarge := stale > 11
	if tooSmall || tooLarge {
		t.Errorf("staleness = %v, want ~10 seconds", stale)
	}
}

// TestRecordResult_FailureStalenessWithoutPreviousSuccess records a failure
// for an instance that has never succeeded and asserts the staleness gauge
// reads -1.
func TestRecordResult_FailureStalenessWithoutPreviousSuccess(t *testing.T) {
	t.Parallel()

	metrics := newFullTestPollMetrics(t)

	// No success has been recorded for this instance beforehand.
	failure := PollResult{
		InstanceType: "pve",
		InstanceName: "no-prior-success",
		StartTime:    time.Now().Add(-time.Second),
		EndTime:      time.Now(),
		Success:      false,
		Error:        errors.New("failed"),
	}
	metrics.RecordResult(failure)

	// -1 is the value expected when there is no prior success.
	if stale := getGaugeVecValue(metrics.staleness, "pve", "no-prior-success"); stale != -1 {
		t.Errorf("staleness = %v, want -1 for no prior success", stale)
	}
}

// TestRecordResult_DurationObserved records a poll spanning two seconds and
// asserts the duration histogram holds one sample with a sum of about 2.0.
func TestRecordResult_DurationObserved(t *testing.T) {
	t.Parallel()

	metrics := newFullTestPollMetrics(t)

	finished := time.Now()
	result := PollResult{
		InstanceType: "pbs",
		InstanceName: "duration-test",
		StartTime:    finished.Add(-2 * time.Second),
		EndTime:      finished,
		Success:      true,
	}
	metrics.RecordResult(result)

	sum := getHistogramSampleSum(metrics.pollDuration, "pbs", "duration-test")
	tooSmall := sum < 1.9
	tooLarge := sum > 2.1
	if tooSmall || tooLarge {
		t.Errorf("poll_duration sum = %v, want ~2.0 seconds", sum)
	}

	if samples := getHistogramSampleCount(metrics.pollDuration, "pbs", "duration-test"); samples != 1 {
		t.Errorf("poll_duration count = %v, want 1", samples)
	}
}

// newInFlightTestPollMetrics builds a PollMetrics carrying only the inflight
// gauge vector and the queue depth gauge, both registered on a registry
// created inside this helper.
func newInFlightTestPollMetrics(t *testing.T) *PollMetrics {
	t.Helper()

	// monitorOpts fills in the namespace and subsystem shared by both gauges.
	monitorOpts := func(name, help string) prometheus.GaugeOpts {
		return prometheus.GaugeOpts{
			Namespace: "pulse",
			Subsystem: "monitor",
			Name:      name,
			Help:      help,
		}
	}

	inflight := prometheus.NewGaugeVec(
		monitorOpts("poll_inflight", "Current number of poll operations executing per instance type."),
		[]string{"instance_type"},
	)
	queueDepth := prometheus.NewGauge(
		monitorOpts("poll_queue_depth", "Approximate number of poll tasks waiting."),
	)

	pm := &PollMetrics{
		inflight:   inflight,
		queueDepth: queueDepth,
	}

	registry := prometheus.NewRegistry()
	registry.MustRegister(inflight, queueDepth)

	return pm
}

// TestResetQueueDepth_NilPollMetrics calls ResetQueueDepth on a nil
// *PollMetrics; the test passes as long as the call does not panic.
func TestResetQueueDepth_NilPollMetrics(t *testing.T) {
	t.Parallel()

	var metrics *PollMetrics
	metrics.ResetQueueDepth(10) // must not panic on a nil receiver
}

// TestResetQueueDepth_SetsPendingTotal asserts that ResetQueueDepth(42) leaves
// both the pending field and the queueDepth gauge at 42.
func TestResetQueueDepth_SetsPendingTotal(t *testing.T) {
	t.Parallel()

	metrics := newFullTestPollMetrics(t)
	metrics.ResetQueueDepth(42)

	// Read the internal counter under its lock.
	metrics.mu.RLock()
	pendingNow := metrics.pending
	metrics.mu.RUnlock()

	if pendingNow != 42 {
		t.Errorf("pending = %v, want 42", pendingNow)
	}

	if depth := getGaugeValue(metrics.queueDepth); depth != 42 {
		t.Errorf("queueDepth gauge = %v, want 42", depth)
	}
}

// TestResetQueueDepth_NegativeClampsToZero asserts that resetting to a
// negative depth after a positive one leaves both the pending field and the
// queueDepth gauge at 0.
func TestResetQueueDepth_NegativeClampsToZero(t *testing.T) {
	t.Parallel()

	metrics := newFullTestPollMetrics(t)

	// Positive first, then negative; the negative input should be clamped.
	metrics.ResetQueueDepth(10)
	metrics.ResetQueueDepth(-5)

	metrics.mu.RLock()
	pendingNow := metrics.pending
	metrics.mu.RUnlock()

	if pendingNow != 0 {
		t.Errorf("pending = %v, want 0 for negative input", pendingNow)
	}

	if depth := getGaugeValue(metrics.queueDepth); depth != 0 {
		t.Errorf("queueDepth gauge = %v, want 0 for negative input", depth)
	}
}

// TestResetQueueDepth_ZeroWorksCorrectly asserts that resetting to 0 after a
// positive depth leaves both the pending field and the queueDepth gauge at 0.
func TestResetQueueDepth_ZeroWorksCorrectly(t *testing.T) {
	t.Parallel()

	metrics := newFullTestPollMetrics(t)

	// Positive first, then an explicit reset to zero.
	metrics.ResetQueueDepth(10)
	metrics.ResetQueueDepth(0)

	metrics.mu.RLock()
	pendingNow := metrics.pending
	metrics.mu.RUnlock()

	if pendingNow != 0 {
		t.Errorf("pending = %v, want 0", pendingNow)
	}

	if depth := getGaugeValue(metrics.queueDepth); depth != 0 {
		t.Errorf("queueDepth gauge = %v, want 0", depth)
	}
}

// TestIncInFlight_NilPollMetrics calls IncInFlight on a nil *PollMetrics; the
// test passes as long as the call does not panic.
func TestIncInFlight_NilPollMetrics(t *testing.T) {
	t.Parallel()

	var metrics *PollMetrics
	metrics.IncInFlight("pve") // must not panic on a nil receiver
}

// TestIncInFlight_IncrementsGauge asserts that IncInFlight counts per instance
// type: two calls for "pve" and one for "pbs" yield 2 and 1 respectively.
func TestIncInFlight_IncrementsGauge(t *testing.T) {
	t.Parallel()

	metrics := newInFlightTestPollMetrics(t)

	for _, instanceType := range []string{"pve", "pve", "pbs"} {
		metrics.IncInFlight(instanceType)
	}

	if pve := getGaugeVecValue(metrics.inflight, "pve"); pve != 2 {
		t.Errorf("inflight{pve} = %v, want 2", pve)
	}

	if pbs := getGaugeVecValue(metrics.inflight, "pbs"); pbs != 1 {
		t.Errorf("inflight{pbs} = %v, want 1", pbs)
	}
}

// TestDecInFlight_NilPollMetrics calls DecInFlight on a nil *PollMetrics; the
// test passes as long as the call does not panic.
func TestDecInFlight_NilPollMetrics(t *testing.T) {
	t.Parallel()

	var metrics *PollMetrics
	metrics.DecInFlight("pve") // must not panic on a nil receiver
}

// TestDecInFlight_DecrementsGauge raises the "pve" inflight gauge to 3 and
// asserts it reads 2 after one DecInFlight and 0 after two more.
func TestDecInFlight_DecrementsGauge(t *testing.T) {
	t.Parallel()

	metrics := newInFlightTestPollMetrics(t)

	// Bring the gauge up to three.
	for i := 0; i < 3; i++ {
		metrics.IncInFlight("pve")
	}

	// One decrement leaves two in flight.
	metrics.DecInFlight("pve")

	inflight := getGaugeVecValue(metrics.inflight, "pve")
	if inflight != 2 {
		t.Errorf("inflight{pve} = %v, want 2 after inc(3) dec(1)", inflight)
	}

	// Two further decrements drain the gauge.
	for i := 0; i < 2; i++ {
		metrics.DecInFlight("pve")
	}

	inflight = getGaugeVecValue(metrics.inflight, "pve")
	if inflight != 0 {
		t.Errorf("inflight{pve} = %v, want 0 after full decrement", inflight)
	}
}

// TestDecrementPending_NilPollMetrics calls decrementPending on a nil
// *PollMetrics; the test passes as long as the call does not panic.
func TestDecrementPending_NilPollMetrics(t *testing.T) {
	t.Parallel()

	var metrics *PollMetrics
	metrics.decrementPending() // must not panic on a nil receiver
}

// TestDecrementPending_DecrementsWhenPositive asserts that a single
// decrementPending call lowers a pending count of 5 to 4.
func TestDecrementPending_DecrementsWhenPositive(t *testing.T) {
	t.Parallel()

	metrics := newFullTestPollMetrics(t)

	// Start from a known pending count, then consume one.
	metrics.ResetQueueDepth(5)
	metrics.decrementPending()

	metrics.mu.RLock()
	pendingNow := metrics.pending
	metrics.mu.RUnlock()

	if pendingNow != 4 {
		t.Errorf("pending = %v, want 4 after decrement from 5", pendingNow)
	}
}

// TestDecrementPending_DoesNotGoBelowZero calls decrementPending on freshly
// built metrics and asserts that the pending field and the queueDepth gauge
// both remain at 0.
func TestDecrementPending_DoesNotGoBelowZero(t *testing.T) {
	t.Parallel()

	metrics := newFullTestPollMetrics(t)

	// No ResetQueueDepth call: pending is still its default of 0.
	metrics.decrementPending()

	metrics.mu.RLock()
	pendingNow := metrics.pending
	metrics.mu.RUnlock()

	if pendingNow != 0 {
		t.Errorf("pending = %v, want 0 (should not go negative)", pendingNow)
	}

	// The exported gauge must not have gone negative either.
	if depth := getGaugeValue(metrics.queueDepth); depth != 0 {
		t.Errorf("queueDepth gauge = %v, want 0", depth)
	}
}

// TestDecrementPending_UpdatesQueueDepthGauge asserts that decrementPending is
// reflected in the queueDepth gauge: 10 pending minus one reads 9.
func TestDecrementPending_UpdatesQueueDepthGauge(t *testing.T) {
	t.Parallel()

	metrics := newFullTestPollMetrics(t)

	metrics.ResetQueueDepth(10)
	metrics.decrementPending()

	if depth := getGaugeValue(metrics.queueDepth); depth != 9 {
		t.Errorf("queueDepth gauge = %v, want 9", depth)
	}
}

// TestDecrementPending_MultipleDecrements drains a pending count of 5 with
// five decrements, asserts pending and the gauge read 0, then decrements once
// more and asserts pending is still 0.
func TestDecrementPending_MultipleDecrements(t *testing.T) {
	t.Parallel()

	metrics := newFullTestPollMetrics(t)
	metrics.ResetQueueDepth(5)

	// Consume every pending slot.
	for remaining := 5; remaining > 0; remaining-- {
		metrics.decrementPending()
	}

	metrics.mu.RLock()
	pendingNow := metrics.pending
	metrics.mu.RUnlock()

	if pendingNow != 0 {
		t.Errorf("pending = %v, want 0 after 5 decrements from 5", pendingNow)
	}

	if depth := getGaugeValue(metrics.queueDepth); depth != 0 {
		t.Errorf("queueDepth gauge = %v, want 0", depth)
	}

	// One decrement too many must leave the counter at zero.
	metrics.decrementPending()

	metrics.mu.RLock()
	pendingNow = metrics.pending
	metrics.mu.RUnlock()

	if pendingNow != 0 {
		t.Errorf("pending = %v, want 0 after extra decrement", pendingNow)
	}
}

// TestRecordResult_FailureNegativeStalenessClampedToZero records a success
// with a future EndTime followed by a failure whose EndTime is earlier than
// that success, and asserts the staleness gauge reads 0 rather than a
// negative value.
func TestRecordResult_FailureNegativeStalenessClampedToZero(t *testing.T) {
	t.Parallel()

	metrics := newFullTestPollMetrics(t)

	// Step 1: a success stamped ten seconds into the future.
	successEnd := time.Now().Add(10 * time.Second)
	success := PollResult{
		InstanceType: "pve",
		InstanceName: "negative-staleness",
		StartTime:    successEnd.Add(-time.Second),
		EndTime:      successEnd,
		Success:      true,
	}
	metrics.RecordResult(success)

	// Step 2: a failure ending five seconds BEFORE that success, so that
	// EndTime minus the last success is negative.
	failureEnd := successEnd.Add(-5 * time.Second)
	failure := PollResult{
		InstanceType: "pve",
		InstanceName: "negative-staleness",
		StartTime:    failureEnd.Add(-time.Second),
		EndTime:      failureEnd,
		Success:      false,
		Error:        errors.New("failed"),
	}
	metrics.RecordResult(failure)

	// The gauge is expected to be clamped at zero.
	if stale := getGaugeVecValue(metrics.staleness, "pve", "negative-staleness"); stale != 0 {
		t.Errorf("staleness = %v, want 0 for negative staleness calculation", stale)
	}
}

// newNodeTestPollMetrics builds a PollMetrics carrying the node-level poll
// collectors (duration, results, errors, last success, staleness) plus the
// nodeLastSuccessByKey map. All collectors are registered on a registry
// created inside this helper.
func newNodeTestPollMetrics(t *testing.T) *PollMetrics {
	t.Helper()

	const (
		namespace = "pulse"
		subsystem = "monitor"
	)

	duration := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "node_poll_duration_seconds",
			Help:      "Duration of polling operations per node.",
			Buckets:   []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10, 15, 20, 30},
		},
		[]string{"instance_type", "instance", "node"},
	)
	results := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "node_poll_total",
			Help:      "Total polling attempts per node partitioned by result.",
		},
		[]string{"instance_type", "instance", "node", "result"},
	)
	failures := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "node_poll_errors_total",
			Help:      "Polling failures per node grouped by error type.",
		},
		[]string{"instance_type", "instance", "node", "error_type"},
	)
	lastSuccess := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "node_poll_last_success_timestamp",
			Help:      "Unix timestamp of the last successful poll for a node.",
		},
		[]string{"instance_type", "instance", "node"},
	)
	staleness := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "node_poll_staleness_seconds",
			Help:      "Seconds since the last successful poll for a node.",
		},
		[]string{"instance_type", "instance", "node"},
	)

	pm := &PollMetrics{
		nodePollDuration:     duration,
		nodePollResults:      results,
		nodePollErrors:       failures,
		nodeLastSuccess:      lastSuccess,
		nodeStaleness:        staleness,
		nodeLastSuccessByKey: make(map[nodeMetricKey]time.Time),
	}

	registry := prometheus.NewRegistry()
	for _, collector := range []prometheus.Collector{
		duration,
		results,
		failures,
		lastSuccess,
		staleness,
	} {
		registry.MustRegister(collector)
	}

	return pm
}

// TestRecordNodeResult_NilPollMetrics calls RecordNodeResult on a nil
// *PollMetrics; the test passes as long as that call does not panic.
func TestRecordNodeResult_NilPollMetrics(t *testing.T) {
	t.Parallel()

	var nilMetrics *PollMetrics

	result := NodePollResult{
		InstanceType: "pve",
		InstanceName: "pve1",
		NodeName:     "node1",
		StartTime:    time.Now(),
		EndTime:      time.Now().Add(time.Second),
		Success:      true,
	}

	// No assertions: reaching the end of the test is the success condition.
	nilMetrics.RecordNodeResult(result)
}

// TestRecordNodeResult_SuccessUpdatesMetrics records one successful node poll
// and checks the last-success gauge, the staleness gauge, the success counter
// and the timestamp returned by lastNodeSuccessFor.
func TestRecordNodeResult_SuccessUpdatesMetrics(t *testing.T) {
	t.Parallel()

	const (
		instanceType = "pve"
		instanceName = "my-pve"
		nodeName     = "node1"
	)

	metrics := newNodeTestPollMetrics(t)

	finishedAt := time.Now()
	metrics.RecordNodeResult(NodePollResult{
		InstanceType: instanceType,
		InstanceName: instanceName,
		NodeName:     nodeName,
		StartTime:    finishedAt.Add(-500 * time.Millisecond),
		EndTime:      finishedAt,
		Success:      true,
	})

	// The last-success gauge carries the poll's end time as Unix seconds.
	if got, want := getGaugeVecValue(metrics.nodeLastSuccess, instanceType, instanceName, nodeName), float64(finishedAt.Unix()); got != want {
		t.Errorf("nodeLastSuccess = %v, want %v", got, want)
	}

	// A successful poll leaves staleness at zero.
	if got := getGaugeVecValue(metrics.nodeStaleness, instanceType, instanceName, nodeName); got != 0 {
		t.Errorf("nodeStaleness = %v, want 0 on success", got)
	}

	// Exactly one success has been counted for this node.
	if got := getCounterVecValue(metrics.nodePollResults, instanceType, instanceName, nodeName, "success"); got != 1 {
		t.Errorf("node_poll_total{result=success} = %v, want 1", got)
	}

	// The internally stored timestamp must match the poll's end time.
	stored, found := metrics.lastNodeSuccessFor(instanceType, instanceName, nodeName)
	if !found {
		t.Fatal("lastNodeSuccessFor returned false, expected true")
	}
	if !stored.Equal(finishedAt) {
		t.Errorf("stored nodeLastSuccess = %v, want %v", stored, finishedAt)
	}
}

// TestRecordNodeResult_FailureIncrementsErrorCounter records a failed node poll
// carrying a timeout MonitorError and expects one increment on the
// error-type counter labelled "timeout" and one on the "error" result counter.
func TestRecordNodeResult_FailureIncrementsErrorCounter(t *testing.T) {
	t.Parallel()

	const (
		instanceType = "pve"
		instanceName = "pve1"
		nodeName     = "node2"
	)

	metrics := newNodeTestPollMetrics(t)

	timeoutErr := internalerrors.NewMonitorError(
		internalerrors.ErrorTypeTimeout,
		"poll_node",
		"pve1",
		errors.New("timeout"),
	)

	failed := NodePollResult{
		InstanceType: instanceType,
		InstanceName: instanceName,
		NodeName:     nodeName,
		StartTime:    time.Now().Add(-time.Second),
		EndTime:      time.Now(),
		Success:      false,
		Error:        timeoutErr,
	}
	metrics.RecordNodeResult(failed)

	// The error counter is keyed by the classified error type.
	if got := getCounterVecValue(metrics.nodePollErrors, instanceType, instanceName, nodeName, "timeout"); got != 1 {
		t.Errorf("node_poll_errors_total{error_type=timeout} = %v, want 1", got)
	}

	// The per-result counter records the attempt under "error".
	if got := getCounterVecValue(metrics.nodePollResults, instanceType, instanceName, nodeName, "error"); got != 1 {
		t.Errorf("node_poll_total{result=error} = %v, want 1", got)
	}
}

// TestRecordNodeResult_NegativeDurationClampedToZero records a node poll whose
// StartTime is after its EndTime and expects the duration histogram to hold
// exactly one sample with a sum of 0.
func TestRecordNodeResult_NegativeDurationClampedToZero(t *testing.T) {
	t.Parallel()

	pm := newNodeTestPollMetrics(t)

	endTime := time.Now()
	startTime := endTime.Add(time.Second) // Start AFTER end = negative duration

	pm.RecordNodeResult(NodePollResult{
		InstanceType: "pve",
		InstanceName: "neg-test",
		NodeName:     "node1",
		StartTime:    startTime,
		EndTime:      endTime,
		Success:      true,
	})

	// A zero sum by itself cannot tell a clamped observation apart from no
	// observation at all, so confirm a sample was actually recorded.
	sampleCount := getHistogramSampleCount(pm.nodePollDuration, "pve", "neg-test", "node1")
	if sampleCount != 1 {
		t.Errorf("node_poll_duration count = %v, want 1 for negative duration", sampleCount)
	}

	sampleSum := getHistogramSampleSum(pm.nodePollDuration, "pve", "neg-test", "node1")
	if sampleSum != 0 {
		t.Errorf("node_poll_duration sum = %v, want 0 for negative duration", sampleSum)
	}
}

// TestRecordNodeResult_EmptyNodeNameNormalized records a successful poll with
// an empty NodeName and expects the success counter to appear under the
// "unknown-node" label.
func TestRecordNodeResult_EmptyNodeNameNormalized(t *testing.T) {
	t.Parallel()

	metrics := newNodeTestPollMetrics(t)

	unnamed := NodePollResult{
		InstanceType: "pve",
		InstanceName: "pve1",
		NodeName:     "",
		StartTime:    time.Now().Add(-time.Second),
		EndTime:      time.Now(),
		Success:      true,
	}
	metrics.RecordNodeResult(unnamed)

	// Look the counter up under the substituted node label.
	if got := getCounterVecValue(metrics.nodePollResults, "pve", "pve1", "unknown-node", "success"); got != 1 {
		t.Errorf("node_poll_total{node=unknown-node,result=success} = %v, want 1", got)
	}
}

// TestRecordNodeResult_FailureStalenessWithPreviousSuccess records a success
// that ended roughly ten seconds ago followed by a failure ending now, and
// expects the node staleness gauge to land between 9 and 11 seconds.
func TestRecordNodeResult_FailureStalenessWithPreviousSuccess(t *testing.T) {
	t.Parallel()

	const (
		instanceType = "pve"
		instanceName = "stale-test"
		nodeName     = "node1"
	)

	metrics := newNodeTestPollMetrics(t)

	// Earlier poll: succeeded, ending ten seconds in the past.
	successEnd := time.Now().Add(-10 * time.Second)
	metrics.RecordNodeResult(NodePollResult{
		InstanceType: instanceType,
		InstanceName: instanceName,
		NodeName:     nodeName,
		StartTime:    successEnd.Add(-time.Second),
		EndTime:      successEnd,
		Success:      true,
	})

	// Later poll: failed, ending at the current time.
	failureEnd := time.Now()
	metrics.RecordNodeResult(NodePollResult{
		InstanceType: instanceType,
		InstanceName: instanceName,
		NodeName:     nodeName,
		StartTime:    failureEnd.Add(-time.Second),
		EndTime:      failureEnd,
		Success:      false,
		Error:        errors.New("failed"),
	})

	// Allow a one-second tolerance either side of the ten-second gap.
	staleness := getGaugeVecValue(metrics.nodeStaleness, instanceType, instanceName, nodeName)
	switch {
	case staleness < 9, staleness > 11:
		t.Errorf("nodeStaleness = %v, want ~10 seconds", staleness)
	}
}

// TestRecordNodeResult_FailureStalenessWithoutPreviousSuccess records a failed
// node poll on a fresh PollMetrics and expects the staleness gauge to read -1.
func TestRecordNodeResult_FailureStalenessWithoutPreviousSuccess(t *testing.T) {
	t.Parallel()

	const (
		instanceType = "pve"
		instanceName = "no-prior-success"
		nodeName     = "node1"
	)

	metrics := newNodeTestPollMetrics(t)

	// No success has been recorded for this node before the failure.
	failed := NodePollResult{
		InstanceType: instanceType,
		InstanceName: instanceName,
		NodeName:     nodeName,
		StartTime:    time.Now().Add(-time.Second),
		EndTime:      time.Now(),
		Success:      false,
		Error:        errors.New("failed"),
	}
	metrics.RecordNodeResult(failed)

	// -1 is the value expected when there is no earlier success to measure from.
	if staleness := getGaugeVecValue(metrics.nodeStaleness, instanceType, instanceName, nodeName); staleness != -1 {
		t.Errorf("nodeStaleness = %v, want -1 for no prior success", staleness)
	}
}

// TestRecordNodeResult_FailureNegativeStalenessClampedToZero records a node
// success and then a failure whose EndTime precedes that success, and expects
// the node staleness gauge to report 0 rather than a negative value.
func TestRecordNodeResult_FailureNegativeStalenessClampedToZero(t *testing.T) {
	t.Parallel()

	const (
		instanceType = "pve"
		instanceName = "neg-stale"
		nodeName     = "node1"
	)

	metrics := newNodeTestPollMetrics(t)

	// Stamp the successful poll ten seconds ahead of the wall clock.
	successEnd := time.Now().Add(10 * time.Second)
	success := NodePollResult{
		InstanceType: instanceType,
		InstanceName: instanceName,
		NodeName:     nodeName,
		StartTime:    successEnd.Add(-time.Second),
		EndTime:      successEnd,
		Success:      true,
	}
	metrics.RecordNodeResult(success)

	// The failing poll ends five seconds before the recorded success.
	failureEnd := successEnd.Add(-5 * time.Second)
	failure := NodePollResult{
		InstanceType: instanceType,
		InstanceName: instanceName,
		NodeName:     nodeName,
		StartTime:    failureEnd.Add(-time.Second),
		EndTime:      failureEnd,
		Success:      false,
		Error:        errors.New("failed"),
	}
	metrics.RecordNodeResult(failure)

	// The gauge is expected to bottom out at zero.
	if staleness := getGaugeVecValue(metrics.nodeStaleness, instanceType, instanceName, nodeName); staleness != 0 {
		t.Errorf("nodeStaleness = %v, want 0 for negative staleness calculation", staleness)
	}
}

// newQueueWaitTestPollMetrics returns a PollMetrics whose only populated field
// is the schedulerQueueWait histogram, registered on a registry local to this
// helper.
func newQueueWaitTestPollMetrics(t *testing.T) *PollMetrics {
	t.Helper()

	queueWait := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "pulse",
			Subsystem: "scheduler",
			Name:      "queue_wait_seconds",
			Help:      "Observed wait time between task readiness and execution.",
			Buckets:   []float64{0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60},
		},
		[]string{"instance_type"},
	)

	// MustRegister panics if the collector definition is invalid.
	registry := prometheus.NewRegistry()
	registry.MustRegister(queueWait)

	return &PollMetrics{schedulerQueueWait: queueWait}
}

// TestRecordQueueWait_NilPollMetrics calls RecordQueueWait on a nil
// *PollMetrics; the test passes as long as that call does not panic.
func TestRecordQueueWait_NilPollMetrics(t *testing.T) {
	t.Parallel()

	var nilMetrics *PollMetrics

	// No assertions: reaching the end of the test is the success condition.
	wait := 5 * time.Second
	nilMetrics.RecordQueueWait("pve", wait)
}

// TestRecordQueueWait_RecordsWaitTime records a single two-second wait and
// expects the histogram to contain one sample whose sum is about 2.0.
func TestRecordQueueWait_RecordsWaitTime(t *testing.T) {
	t.Parallel()

	const instanceType = "pve"

	metrics := newQueueWaitTestPollMetrics(t)
	metrics.RecordQueueWait(instanceType, 2*time.Second)

	// Exactly one observation should have been made.
	if count := getHistogramSampleCount(metrics.schedulerQueueWait, instanceType); count != 1 {
		t.Errorf("queue_wait count = %v, want 1", count)
	}

	// The sum is a float, so compare against a small window around 2.0.
	total := getHistogramSampleSum(metrics.schedulerQueueWait, instanceType)
	switch {
	case total < 1.9, total > 2.1:
		t.Errorf("queue_wait sum = %v, want ~2.0 seconds", total)
	}
}

// TestRecordQueueWait_NegativeWaitClampedToZero records a negative wait and
// expects the histogram to hold exactly one sample with a sum of 0.
func TestRecordQueueWait_NegativeWaitClampedToZero(t *testing.T) {
	t.Parallel()

	pm := newQueueWaitTestPollMetrics(t)

	pm.RecordQueueWait("pve", -5*time.Second)

	// A zero sum by itself cannot tell a clamped observation apart from no
	// observation at all, so confirm a sample was actually recorded.
	sampleCount := getHistogramSampleCount(pm.schedulerQueueWait, "pve")
	if sampleCount != 1 {
		t.Errorf("queue_wait count = %v, want 1 for negative wait", sampleCount)
	}

	sampleSum := getHistogramSampleSum(pm.schedulerQueueWait, "pve")
	if sampleSum != 0 {
		t.Errorf("queue_wait sum = %v, want 0 for negative wait", sampleSum)
	}
}

// TestRecordQueueWait_EmptyTypeNormalized records a wait with an empty
// instance type and expects the sample to appear under the "unknown" label.
func TestRecordQueueWait_EmptyTypeNormalized(t *testing.T) {
	t.Parallel()

	metrics := newQueueWaitTestPollMetrics(t)
	metrics.RecordQueueWait("", time.Second)

	// Look the histogram up under the substituted label value.
	if count := getHistogramSampleCount(metrics.schedulerQueueWait, "unknown"); count != 1 {
		t.Errorf("queue_wait{unknown} count = %v, want 1", count)
	}
}

// TestSetQueueDepth_NilPollMetrics calls SetQueueDepth on a nil *PollMetrics;
// the test passes as long as that call does not panic.
func TestSetQueueDepth_NilPollMetrics(t *testing.T) {
	t.Parallel()

	var nilMetrics *PollMetrics

	// No assertions: reaching the end of the test is the success condition.
	nilMetrics.SetQueueDepth(10)
}

// TestSetQueueDepth_SetsGauge sets the queue depth to 42 and expects the
// queueDepth gauge to report that value.
func TestSetQueueDepth_SetsGauge(t *testing.T) {
	t.Parallel()

	metrics := newInFlightTestPollMetrics(t)
	metrics.SetQueueDepth(42)

	if depth := getGaugeValue(metrics.queueDepth); depth != 42 {
		t.Errorf("queueDepth = %v, want 42", depth)
	}
}

// TestSetQueueDepth_NegativeClampedToZero passes a negative depth and expects
// the queueDepth gauge to report 0.
func TestSetQueueDepth_NegativeClampedToZero(t *testing.T) {
	t.Parallel()

	metrics := newInFlightTestPollMetrics(t)
	metrics.SetQueueDepth(-10)

	if depth := getGaugeValue(metrics.queueDepth); depth != 0 {
		t.Errorf("queueDepth = %v, want 0 for negative input", depth)
	}
}

// TestSetQueueDepth_ZeroWorks sets a positive depth, then sets it to zero, and
// expects the queueDepth gauge to end at 0.
func TestSetQueueDepth_ZeroWorks(t *testing.T) {
	t.Parallel()

	metrics := newInFlightTestPollMetrics(t)

	// Apply a non-zero depth first, then the explicit zero under test.
	for _, depth := range []int{10, 0} {
		metrics.SetQueueDepth(depth)
	}

	if depth := getGaugeValue(metrics.queueDepth); depth != 0 {
		t.Errorf("queueDepth = %v, want 0", depth)
	}
}