diff --git a/internal/monitoring/guest_metadata_test.go b/internal/monitoring/guest_metadata_test.go
index 40643dd66..9e99e2cfd 100644
--- a/internal/monitoring/guest_metadata_test.go
+++ b/internal/monitoring/guest_metadata_test.go
@@ -1,7 +1,11 @@
 package monitoring
 
 import (
+	"context"
+	"errors"
+	"math/rand"
 	"testing"
+	"time"
 
 	"github.com/rcourtman/pulse-go-rewrite/internal/models"
 	"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
@@ -543,3 +547,733 @@ func TestProcessGuestNetworkInterfaces(t *testing.T) {
 		})
 	}
 }
+
+func TestRetryGuestAgentCall(t *testing.T) {
+	t.Parallel()
+
+	t.Run("successful call on first attempt", func(t *testing.T) {
+		t.Parallel()
+
+		m := &Monitor{}
+		callCount := 0
+		fn := func(ctx context.Context) (interface{}, error) {
+			callCount++
+			return "success", nil
+		}
+
+		ctx := context.Background()
+		result, err := m.retryGuestAgentCall(ctx, 50*time.Millisecond, 2, fn)
+
+		if err != nil {
+			t.Fatalf("expected no error, got %v", err)
+		}
+		if result != "success" {
+			t.Fatalf("expected result 'success', got %v", result)
+		}
+		if callCount != 1 {
+			t.Fatalf("expected 1 call, got %d", callCount)
+		}
+	})
+
+	t.Run("timeout error triggers retry and eventually succeeds", func(t *testing.T) {
+		t.Parallel()
+
+		m := &Monitor{}
+		callCount := 0
+		fn := func(ctx context.Context) (interface{}, error) {
+			callCount++
+			if callCount < 3 {
+				return nil, errors.New("context deadline exceeded (timeout)")
+			}
+			return "success after retries", nil
+		}
+
+		ctx := context.Background()
+		result, err := m.retryGuestAgentCall(ctx, 50*time.Millisecond, 3, fn)
+
+		if err != nil {
+			t.Fatalf("expected no error, got %v", err)
+		}
+		if result != "success after retries" {
+			t.Fatalf("expected result 'success after retries', got %v", result)
+		}
+		if callCount != 3 {
+			t.Fatalf("expected 3 calls, got %d", callCount)
+		}
+	})
+
+	t.Run("non-timeout error does not trigger retry", func(t *testing.T) {
+		t.Parallel()
+
+		m := &Monitor{}
+		callCount := 0
+		nonTimeoutErr := errors.New("connection refused")
+		fn := func(ctx context.Context) (interface{}, error) {
+			callCount++
+			return nil, nonTimeoutErr
+		}
+
+		ctx := context.Background()
+		result, err := m.retryGuestAgentCall(ctx, 50*time.Millisecond, 3, fn)
+
+		if err == nil {
+			t.Fatal("expected error, got nil")
+		}
+		if err.Error() != nonTimeoutErr.Error() {
+			t.Fatalf("expected error %q, got %q", nonTimeoutErr.Error(), err.Error())
+		}
+		if result != nil {
+			t.Fatalf("expected nil result, got %v", result)
+		}
+		if callCount != 1 {
+			t.Fatalf("expected 1 call (no retry for non-timeout), got %d", callCount)
+		}
+	})
+
+	t.Run("all retries exhausted returns last error", func(t *testing.T) {
+		t.Parallel()
+
+		m := &Monitor{}
+		callCount := 0
+		fn := func(ctx context.Context) (interface{}, error) {
+			callCount++
+			return nil, errors.New("persistent timeout error")
+		}
+
+		ctx := context.Background()
+		result, err := m.retryGuestAgentCall(ctx, 50*time.Millisecond, 2, fn)
+
+		if err == nil {
+			t.Fatal("expected error, got nil")
+		}
+		if err.Error() != "persistent timeout error" {
+			t.Fatalf("expected 'persistent timeout error', got %q", err.Error())
+		}
+		if result != nil {
+			t.Fatalf("expected nil result, got %v", result)
+		}
+		// maxRetries=2 means: attempt 0 (initial) + attempts 1,2 (retries) = 3 calls
+		if callCount != 3 {
+			t.Fatalf("expected 3 calls (1 initial + 2 retries), got %d", callCount)
+		}
+	})
+
+	t.Run("context cancellation during retry delay aborts early", func(t *testing.T) {
+		t.Parallel()
+
+		m := &Monitor{}
+		callCount := 0
+		fn := func(ctx context.Context) (interface{}, error) {
+			callCount++
+			return nil, errors.New("timeout error")
+		}
+
+		ctx, cancel := context.WithCancel(context.Background())
+
+		// Cancel context after first call completes but during retry delay
+		go func() {
+			time.Sleep(10 * time.Millisecond)
+			cancel()
+		}()
+
+		result, err := m.retryGuestAgentCall(ctx, 50*time.Millisecond, 5, fn)
+
+		if err == nil {
+			t.Fatal("expected error, got nil")
+		}
+		if !errors.Is(err, context.Canceled) {
+			t.Fatalf("expected context.Canceled error, got %v", err)
+		}
+		if result != nil {
+			t.Fatalf("expected nil result, got %v", result)
+		}
+		// Should only have made 1 call before context was canceled during delay
+		if callCount != 1 {
+			t.Fatalf("expected 1 call before cancellation, got %d", callCount)
+		}
+	})
+
+	t.Run("zero retries means single attempt", func(t *testing.T) {
+		t.Parallel()
+
+		m := &Monitor{}
+		callCount := 0
+		fn := func(ctx context.Context) (interface{}, error) {
+			callCount++
+			return nil, errors.New("timeout error")
+		}
+
+		ctx := context.Background()
+		result, err := m.retryGuestAgentCall(ctx, 50*time.Millisecond, 0, fn)
+
+		if err == nil {
+			t.Fatal("expected error, got nil")
+		}
+		if result != nil {
+			t.Fatalf("expected nil result, got %v", result)
+		}
+		if callCount != 1 {
+			t.Fatalf("expected 1 call with maxRetries=0, got %d", callCount)
+		}
+	})
+}
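+
+// The subtests above pin down retryGuestAgentCall's contract without reaching
+// into the implementation. A minimal sketch of a loop that would satisfy them,
+// for orientation only (the signature shape and the isTimeoutErr helper are
+// assumptions here, not the actual monitor code):
+//
+//	for attempt := 0; attempt <= maxRetries; attempt++ {
+//		result, err = fn(ctx)
+//		if err == nil || !isTimeoutErr(err) {
+//			return result, err // success, or fail fast on non-timeout errors
+//		}
+//		select {
+//		case <-time.After(delay): // pause before the next attempt
+//		case <-ctx.Done():
+//			return nil, ctx.Err() // cancellation during the delay aborts early
+//		}
+//	}
+//	return nil, err // retries exhausted: surface the last error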
+
+func TestAcquireGuestMetadataSlot(t *testing.T) {
+	t.Parallel()
+
+	t.Run("nil monitor returns true", func(t *testing.T) {
+		t.Parallel()
+
+		var m *Monitor
+		ctx := context.Background()
+		if !m.acquireGuestMetadataSlot(ctx) {
+			t.Fatal("nil monitor should return true (permissive default)")
+		}
+	})
+
+	t.Run("nil slots channel returns true", func(t *testing.T) {
+		t.Parallel()
+
+		m := &Monitor{guestMetadataSlots: nil}
+		ctx := context.Background()
+		if !m.acquireGuestMetadataSlot(ctx) {
+			t.Fatal("nil slots channel should return true (permissive default)")
+		}
+	})
+
+	t.Run("successfully acquires slot when available", func(t *testing.T) {
+		t.Parallel()
+
+		m := &Monitor{guestMetadataSlots: make(chan struct{}, 2)}
+		ctx := context.Background()
+		if !m.acquireGuestMetadataSlot(ctx) {
+			t.Fatal("should acquire slot when channel has capacity")
+		}
+		// Verify slot was actually acquired (channel should have 1 element)
+		if len(m.guestMetadataSlots) != 1 {
+			t.Fatalf("expected 1 slot acquired, got %d", len(m.guestMetadataSlots))
+		}
+	})
+
+	t.Run("context cancellation returns false when slot not available", func(t *testing.T) {
+		t.Parallel()
+
+		// Create a channel with capacity 1 and fill it
+		m := &Monitor{guestMetadataSlots: make(chan struct{}, 1)}
+		m.guestMetadataSlots <- struct{}{}
+
+		// Create already-cancelled context
+		ctx, cancel := context.WithCancel(context.Background())
+		cancel()
+
+		if m.acquireGuestMetadataSlot(ctx) {
+			t.Fatal("should return false when context is cancelled and slot not available")
+		}
+	})
+
+	t.Run("multiple acquires fill up the channel", func(t *testing.T) {
+		t.Parallel()
+
+		capacity := 3
+		m := &Monitor{guestMetadataSlots: make(chan struct{}, capacity)}
+		ctx := context.Background()
+
+		// Acquire all available slots
+		for i := 0; i < capacity; i++ {
+			if !m.acquireGuestMetadataSlot(ctx) {
+				t.Fatalf("should acquire slot %d of %d", i+1, capacity)
+			}
+		}
+
+		// Verify channel is full
+		if len(m.guestMetadataSlots) != capacity {
+			t.Fatalf("expected %d slots acquired, got %d", capacity, len(m.guestMetadataSlots))
+		}
+
+		// Next acquire should block; use cancelled context to test
+		ctx2, cancel := context.WithCancel(context.Background())
+		cancel()
+		if m.acquireGuestMetadataSlot(ctx2) {
+			t.Fatal("should return false when channel is full and context cancelled")
+		}
+	})
+}
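+
+// acquireGuestMetadataSlot acts as a context-aware semaphore acquire. The
+// shape these tests imply, as a sketch (assumed, not the real implementation):
+//
+//	if m == nil || m.guestMetadataSlots == nil {
+//		return true // permissive default: no concurrency limit configured
+//	}
+//	select {
+//	case m.guestMetadataSlots <- struct{}{}:
+//		return true // slot acquired
+//	case <-ctx.Done():
+//		return false // gave up waiting for a slot
+//	}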
+
+func TestTryReserveGuestMetadataFetch(t *testing.T) {
+	t.Parallel()
+
+	t.Run("nil monitor returns false", func(t *testing.T) {
+		t.Parallel()
+
+		var m *Monitor
+		now := time.Now()
+		if m.tryReserveGuestMetadataFetch("test-key", now) {
+			t.Fatal("nil monitor should return false")
+		}
+	})
+
+	t.Run("key not in map reserves and returns true", func(t *testing.T) {
+		t.Parallel()
+
+		m := &Monitor{
+			guestMetadataLimiter:      make(map[string]time.Time),
+			guestMetadataHoldDuration: 15 * time.Second,
+		}
+		now := time.Now()
+
+		if !m.tryReserveGuestMetadataFetch("new-key", now) {
+			t.Fatal("should return true for new key")
+		}
+
+		// Verify the key was added with correct expiry
+		next, ok := m.guestMetadataLimiter["new-key"]
+		if !ok {
+			t.Fatal("key should be in limiter map")
+		}
+		expectedNext := now.Add(15 * time.Second)
+		if next.Sub(expectedNext) > time.Millisecond || expectedNext.Sub(next) > time.Millisecond {
+			t.Fatalf("expected next time ~%v, got %v", expectedNext, next)
+		}
+	})
+
+	t.Run("key in map with future time returns false", func(t *testing.T) {
+		t.Parallel()
+
+		now := time.Now()
+		m := &Monitor{
+			guestMetadataLimiter: map[string]time.Time{
+				"existing-key": now.Add(10 * time.Second), // future
+			},
+			guestMetadataHoldDuration: 15 * time.Second,
+		}
+
+		if m.tryReserveGuestMetadataFetch("existing-key", now) {
+			t.Fatal("should return false when key has future expiry")
+		}
+
+		// Verify the expiry was not changed
+		next := m.guestMetadataLimiter["existing-key"]
+		if !next.Equal(now.Add(10 * time.Second)) {
+			t.Fatal("expiry time should not have been modified")
+		}
+	})
+
+	t.Run("key in map with past time reserves and returns true", func(t *testing.T) {
+		t.Parallel()
+
+		now := time.Now()
+		m := &Monitor{
+			guestMetadataLimiter: map[string]time.Time{
+				"expired-key": now.Add(-5 * time.Second), // past
+			},
+			guestMetadataHoldDuration: 20 * time.Second,
+		}
+
+		if !m.tryReserveGuestMetadataFetch("expired-key", now) {
+			t.Fatal("should return true when key has past expiry")
+		}
+
+		// Verify the key was updated with new expiry
+		next := m.guestMetadataLimiter["expired-key"]
+		expectedNext := now.Add(20 * time.Second)
+		if next.Sub(expectedNext) > time.Millisecond || expectedNext.Sub(next) > time.Millisecond {
+			t.Fatalf("expected next time ~%v, got %v", expectedNext, next)
+		}
+	})
+
+	t.Run("default hold duration used when guestMetadataHoldDuration is zero", func(t *testing.T) {
+		t.Parallel()
+
+		m := &Monitor{
+			guestMetadataLimiter:      make(map[string]time.Time),
+			guestMetadataHoldDuration: 0, // zero triggers default
+		}
+		now := time.Now()
+
+		if !m.tryReserveGuestMetadataFetch("key", now) {
+			t.Fatal("should return true for new key")
+		}
+
+		next := m.guestMetadataLimiter["key"]
+		expectedNext := now.Add(defaultGuestMetadataHold) // 15 seconds
+		if next.Sub(expectedNext) > time.Millisecond || expectedNext.Sub(next) > time.Millisecond {
+			t.Fatalf("expected next time ~%v (default hold), got %v", expectedNext, next)
+		}
+	})
+
+	t.Run("default hold duration used when guestMetadataHoldDuration is negative", func(t *testing.T) {
+		t.Parallel()
+
+		m := &Monitor{
+			guestMetadataLimiter:      make(map[string]time.Time),
+			guestMetadataHoldDuration: -5 * time.Second, // negative triggers default
+		}
+		now := time.Now()
+
+		if !m.tryReserveGuestMetadataFetch("key", now) {
+			t.Fatal("should return true for new key")
+		}
+
+		next := m.guestMetadataLimiter["key"]
+		expectedNext := now.Add(defaultGuestMetadataHold)
+		if next.Sub(expectedNext) > time.Millisecond || expectedNext.Sub(next) > time.Millisecond {
+			t.Fatalf("expected next time ~%v (default hold), got %v", expectedNext, next)
+		}
+	})
+
+	t.Run("key at exact current time reserves and returns true", func(t *testing.T) {
+		t.Parallel()
+
+		now := time.Now()
+		m := &Monitor{
+			guestMetadataLimiter: map[string]time.Time{
+				"exact-key": now, // exactly now
+			},
+			guestMetadataHoldDuration: 10 * time.Second,
+		}
+
+		// now.Before(now) is false, so this should reserve
+		if !m.tryReserveGuestMetadataFetch("exact-key", now) {
+			t.Fatal("should return true when key expires exactly at now")
+		}
+	})
+}
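+
+// tryReserveGuestMetadataFetch is a per-key rate-limit check: reserve when the
+// key is unknown or its hold has expired, refuse otherwise. The implied logic,
+// sketched under those assumptions (not the real implementation):
+//
+//	if next, ok := m.guestMetadataLimiter[key]; ok && now.Before(next) {
+//		return false // still inside the hold window
+//	}
+//	hold := m.guestMetadataHoldDuration
+//	if hold <= 0 {
+//		hold = defaultGuestMetadataHold // 15s fallback, per the tests above
+//	}
+//	m.guestMetadataLimiter[key] = now.Add(hold)
+//	return true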
return true for new key") + } + + next := m.guestMetadataLimiter["key"] + expectedNext := now.Add(defaultGuestMetadataHold) + if next.Sub(expectedNext) > time.Millisecond || expectedNext.Sub(next) > time.Millisecond { + t.Fatalf("expected next time ~%v (default hold), got %v", expectedNext, next) + } + }) + + t.Run("key at exact current time reserves and returns true", func(t *testing.T) { + t.Parallel() + + now := time.Now() + m := &Monitor{ + guestMetadataLimiter: map[string]time.Time{ + "exact-key": now, // exactly now + }, + guestMetadataHoldDuration: 10 * time.Second, + } + + // now.Before(now) is false, so this should reserve + if !m.tryReserveGuestMetadataFetch("exact-key", now) { + t.Fatal("should return true when key expires exactly at now") + } + }) +} + +func TestScheduleNextGuestMetadataFetch(t *testing.T) { + t.Parallel() + + t.Run("nil monitor is safe", func(t *testing.T) { + t.Parallel() + + var m *Monitor + now := time.Now() + // Should not panic + m.scheduleNextGuestMetadataFetch("test-key", now) + }) + + t.Run("schedules with default interval when guestMetadataMinRefresh is zero", func(t *testing.T) { + t.Parallel() + + m := &Monitor{ + guestMetadataLimiter: make(map[string]time.Time), + guestMetadataMinRefresh: 0, // zero triggers default + } + now := time.Now() + + m.scheduleNextGuestMetadataFetch("key", now) + + next, ok := m.guestMetadataLimiter["key"] + if !ok { + t.Fatal("key should be in limiter map") + } + // config.DefaultGuestMetadataMinRefresh is 2 minutes + expectedNext := now.Add(2 * time.Minute) + if next.Sub(expectedNext) > time.Millisecond || expectedNext.Sub(next) > time.Millisecond { + t.Fatalf("expected next time ~%v (default interval), got %v", expectedNext, next) + } + }) + + t.Run("schedules with default interval when guestMetadataMinRefresh is negative", func(t *testing.T) { + t.Parallel() + + m := &Monitor{ + guestMetadataLimiter: make(map[string]time.Time), + guestMetadataMinRefresh: -1 * time.Minute, // negative triggers default + } + now := time.Now() + + m.scheduleNextGuestMetadataFetch("key", now) + + next := m.guestMetadataLimiter["key"] + expectedNext := now.Add(2 * time.Minute) // DefaultGuestMetadataMinRefresh + if next.Sub(expectedNext) > time.Millisecond || expectedNext.Sub(next) > time.Millisecond { + t.Fatalf("expected next time ~%v (default interval), got %v", expectedNext, next) + } + }) + + t.Run("uses configured interval when positive", func(t *testing.T) { + t.Parallel() + + m := &Monitor{ + guestMetadataLimiter: make(map[string]time.Time), + guestMetadataMinRefresh: 5 * time.Minute, + } + now := time.Now() + + m.scheduleNextGuestMetadataFetch("key", now) + + next := m.guestMetadataLimiter["key"] + expectedNext := now.Add(5 * time.Minute) + if next.Sub(expectedNext) > time.Millisecond || expectedNext.Sub(next) > time.Millisecond { + t.Fatalf("expected next time ~%v, got %v", expectedNext, next) + } + }) + + t.Run("adds jitter when rng is non-nil and jitter is positive", func(t *testing.T) { + t.Parallel() + + // Use a seeded rng for deterministic testing + rng := newDeterministicRng(42) + + m := &Monitor{ + guestMetadataLimiter: make(map[string]time.Time), + guestMetadataMinRefresh: 1 * time.Minute, + guestMetadataRefreshJitter: 30 * time.Second, + rng: rng, + } + now := time.Now() + + m.scheduleNextGuestMetadataFetch("key", now) + + next := m.guestMetadataLimiter["key"] + minExpected := now.Add(1 * time.Minute) + maxExpected := now.Add(1*time.Minute + 30*time.Second) + + if next.Before(minExpected) || next.After(maxExpected) { + 
t.Fatalf("expected next time between %v and %v, got %v", minExpected, maxExpected, next) + } + }) + + t.Run("no jitter when rng is nil", func(t *testing.T) { + t.Parallel() + + m := &Monitor{ + guestMetadataLimiter: make(map[string]time.Time), + guestMetadataMinRefresh: 1 * time.Minute, + guestMetadataRefreshJitter: 30 * time.Second, // jitter configured but rng is nil + rng: nil, + } + now := time.Now() + + m.scheduleNextGuestMetadataFetch("key", now) + + next := m.guestMetadataLimiter["key"] + expectedNext := now.Add(1 * time.Minute) // no jitter added + if next.Sub(expectedNext) > time.Millisecond || expectedNext.Sub(next) > time.Millisecond { + t.Fatalf("expected next time ~%v (no jitter), got %v", expectedNext, next) + } + }) + + t.Run("no jitter when jitter duration is zero", func(t *testing.T) { + t.Parallel() + + rng := newDeterministicRng(42) + + m := &Monitor{ + guestMetadataLimiter: make(map[string]time.Time), + guestMetadataMinRefresh: 1 * time.Minute, + guestMetadataRefreshJitter: 0, // zero jitter + rng: rng, + } + now := time.Now() + + m.scheduleNextGuestMetadataFetch("key", now) + + next := m.guestMetadataLimiter["key"] + expectedNext := now.Add(1 * time.Minute) + if next.Sub(expectedNext) > time.Millisecond || expectedNext.Sub(next) > time.Millisecond { + t.Fatalf("expected next time ~%v (no jitter), got %v", expectedNext, next) + } + }) + + t.Run("no jitter when jitter duration is negative", func(t *testing.T) { + t.Parallel() + + rng := newDeterministicRng(42) + + m := &Monitor{ + guestMetadataLimiter: make(map[string]time.Time), + guestMetadataMinRefresh: 1 * time.Minute, + guestMetadataRefreshJitter: -10 * time.Second, // negative jitter + rng: rng, + } + now := time.Now() + + m.scheduleNextGuestMetadataFetch("key", now) + + next := m.guestMetadataLimiter["key"] + expectedNext := now.Add(1 * time.Minute) + if next.Sub(expectedNext) > time.Millisecond || expectedNext.Sub(next) > time.Millisecond { + t.Fatalf("expected next time ~%v (no jitter), got %v", expectedNext, next) + } + }) + + t.Run("overwrites existing key", func(t *testing.T) { + t.Parallel() + + now := time.Now() + m := &Monitor{ + guestMetadataLimiter: map[string]time.Time{ + "key": now.Add(-10 * time.Minute), // old value + }, + guestMetadataMinRefresh: 3 * time.Minute, + } + + m.scheduleNextGuestMetadataFetch("key", now) + + next := m.guestMetadataLimiter["key"] + expectedNext := now.Add(3 * time.Minute) + if next.Sub(expectedNext) > time.Millisecond || expectedNext.Sub(next) > time.Millisecond { + t.Fatalf("expected next time ~%v, got %v", expectedNext, next) + } + }) +} + +func TestDeferGuestMetadataRetry(t *testing.T) { + t.Parallel() + + t.Run("nil monitor is safe", func(t *testing.T) { + t.Parallel() + + var m *Monitor + now := time.Now() + // Should not panic + m.deferGuestMetadataRetry("test-key", now) + }) + + t.Run("uses default backoff when guestMetadataRetryBackoff is zero", func(t *testing.T) { + t.Parallel() + + m := &Monitor{ + guestMetadataLimiter: make(map[string]time.Time), + guestMetadataRetryBackoff: 0, // zero triggers default + } + now := time.Now() + + m.deferGuestMetadataRetry("key", now) + + next, ok := m.guestMetadataLimiter["key"] + if !ok { + t.Fatal("key should be in limiter map") + } + // config.DefaultGuestMetadataRetryBackoff is 30 seconds + expectedNext := now.Add(30 * time.Second) + if next.Sub(expectedNext) > time.Millisecond || expectedNext.Sub(next) > time.Millisecond { + t.Fatalf("expected next time ~%v (default backoff), got %v", expectedNext, next) + } + }) + + 
t.Run("uses default backoff when guestMetadataRetryBackoff is negative", func(t *testing.T) { + t.Parallel() + + m := &Monitor{ + guestMetadataLimiter: make(map[string]time.Time), + guestMetadataRetryBackoff: -10 * time.Second, // negative triggers default + } + now := time.Now() + + m.deferGuestMetadataRetry("key", now) + + next := m.guestMetadataLimiter["key"] + expectedNext := now.Add(30 * time.Second) // DefaultGuestMetadataRetryBackoff + if next.Sub(expectedNext) > time.Millisecond || expectedNext.Sub(next) > time.Millisecond { + t.Fatalf("expected next time ~%v (default backoff), got %v", expectedNext, next) + } + }) + + t.Run("uses configured backoff when positive", func(t *testing.T) { + t.Parallel() + + m := &Monitor{ + guestMetadataLimiter: make(map[string]time.Time), + guestMetadataRetryBackoff: 45 * time.Second, + } + now := time.Now() + + m.deferGuestMetadataRetry("key", now) + + next := m.guestMetadataLimiter["key"] + expectedNext := now.Add(45 * time.Second) + if next.Sub(expectedNext) > time.Millisecond || expectedNext.Sub(next) > time.Millisecond { + t.Fatalf("expected next time ~%v, got %v", expectedNext, next) + } + }) + + t.Run("overwrites existing key", func(t *testing.T) { + t.Parallel() + + now := time.Now() + m := &Monitor{ + guestMetadataLimiter: map[string]time.Time{ + "key": now.Add(-5 * time.Minute), // old value + }, + guestMetadataRetryBackoff: 1 * time.Minute, + } + + m.deferGuestMetadataRetry("key", now) + + next := m.guestMetadataLimiter["key"] + expectedNext := now.Add(1 * time.Minute) + if next.Sub(expectedNext) > time.Millisecond || expectedNext.Sub(next) > time.Millisecond { + t.Fatalf("expected next time ~%v, got %v", expectedNext, next) + } + }) +} + +// newDeterministicRng creates a rand.Rand with a fixed seed for reproducible tests. +func newDeterministicRng(seed int64) *rand.Rand { + return rand.New(rand.NewSource(seed)) +} + +func TestReleaseGuestMetadataSlot(t *testing.T) { + t.Parallel() + + t.Run("nil monitor is safe", func(t *testing.T) { + t.Parallel() + + var m *Monitor + // Should not panic + m.releaseGuestMetadataSlot() + }) + + t.Run("nil slots channel is safe", func(t *testing.T) { + t.Parallel() + + m := &Monitor{guestMetadataSlots: nil} + // Should not panic + m.releaseGuestMetadataSlot() + }) + + t.Run("successfully releases a slot", func(t *testing.T) { + t.Parallel() + + m := &Monitor{guestMetadataSlots: make(chan struct{}, 2)} + ctx := context.Background() + + // Acquire a slot first + if !m.acquireGuestMetadataSlot(ctx) { + t.Fatal("failed to acquire slot") + } + if len(m.guestMetadataSlots) != 1 { + t.Fatalf("expected 1 slot acquired, got %d", len(m.guestMetadataSlots)) + } + + // Release the slot + m.releaseGuestMetadataSlot() + + // Verify slot was released + if len(m.guestMetadataSlots) != 0 { + t.Fatalf("expected 0 slots after release, got %d", len(m.guestMetadataSlots)) + } + }) + + t.Run("release on empty channel does not block", func(t *testing.T) { + t.Parallel() + + m := &Monitor{guestMetadataSlots: make(chan struct{}, 2)} + + // Channel is empty - release should not block due to default case in select + done := make(chan struct{}) + go func() { + m.releaseGuestMetadataSlot() + close(done) + }() + + select { + case <-done: + // Success - release completed without blocking + case <-time.After(100 * time.Millisecond): + t.Fatal("releaseGuestMetadataSlot blocked on empty channel") + } + }) +}