Pulse/internal/monitoring/fake_executor_integration.go
rcourtman 2636ba9137 test: add comprehensive integration test harness for adaptive polling (Phase 2 Task 9c)
Add PollExecutor seam and integration test infrastructure:

**PollExecutor Interface:**
- Add pluggable executor interface for testability (sketched below)
- Implement realExecutor wrapping existing poll functions
- Add SetExecutor() for test injection
- Zero impact on production behavior
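
A rough sketch of that seam follows; the Execute signature is inferred from the fake executor later in this file, while the Monitor field name is an assumption rather than the production code:

```go
// PollExecutor is the pluggable seam; Execute runs one poll task.
type PollExecutor interface {
	Execute(ctx context.Context, task PollTask)
}

// SetExecutor swaps in an alternative executor for tests. Production keeps
// the default realExecutor, so behavior there is unchanged.
// (The executor field name is illustrative.)
func (m *Monitor) SetExecutor(e PollExecutor) {
	m.executor = e
}
```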

**Integration Test Harness:**
- Build-tagged integration tests (go:build integration)
- Synthetic workload generator with configurable scenarios (types sketched below)
- Fake executor simulating latencies, failures, recovery
- Runtime metrics collection (queue depth, staleness, goroutines)
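
The fake executor below reads a handful of fields from each instance config; from that usage, the generator's types look roughly like this (a sketch only, since the canonical definitions live in the harness and may carry extra fields such as test duration and base interval):

```go
// FailureType classifies a scripted poll outcome.
type FailureType int

const (
	FailureNone      FailureType = iota
	FailureTransient // connection-style error; circuit breaker should recover
	FailurePermanent // validation-style error; routed to the dead-letter queue
)

// InstanceConfig drives one synthetic instance.
type InstanceConfig struct {
	Type        string        // e.g. "pve"
	Name        string
	SuccessRate float64       // chance of success once FailureSeq is exhausted
	BaseLatency time.Duration // simulated poll latency (executor adds ~±10% jitter)
	FailureSeq  []FailureType // scripted outcomes for the first polls
}

// HarnessScenario bundles the instances for one integration run.
type HarnessScenario struct {
	Instances []InstanceConfig
}
```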

**Comprehensive Assertions:**
- Queue depth bounds: stays within 1.5× instance count (assertion shape sketched after this list)
- Staleness: healthy instances stay under 20s stale across multiple poll cycles
- Circuit breakers: transient failures recover, permanent stay blocked
- Dead-letter queue: only permanent failures routed
- Scheduler health: snapshot consistency validation
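
Translated into test code, the first two checks amount to assertions over collected runtime metrics; a condensed illustration follows, where maxQueueDepth, stalenessByInstance, and healthy are hypothetical variables standing in for the harness's own metric collection:

```go
// Queue depth must stay within 1.5x the number of configured instances.
bound := int(1.5 * float64(len(scenario.Instances)))
if maxQueueDepth > bound {
	t.Errorf("queue depth %d exceeded bound %d", maxQueueDepth, bound)
}

// Healthy instances must never go stale for more than 20s.
for name, age := range stalenessByInstance {
	if healthy[name] && age > 20*time.Second {
		t.Errorf("healthy instance %s stale for %s", name, age)
	}
}
```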

**Test Scenarios:**
- 10 healthy PVE instances (rapid polling; scenario construction sketched after this list)
- 1 transient failure instance (fail → recover)
- 1 permanent failure instance (DLQ routing)
- 55s test duration with 3s base intervals
- Validates full adaptive scheduler lifecycle
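
Concretely, that mix can be expressed as data for the generator; the instance names, latencies, and exact failure sequences below are illustrative assumptions:

```go
// buildScenario assembles 10 healthy, 1 transient, and 1 permanent instance.
func buildScenario() HarnessScenario {
	var instances []InstanceConfig
	for i := 0; i < 10; i++ {
		instances = append(instances, InstanceConfig{
			Type: "pve", Name: fmt.Sprintf("pve-healthy-%02d", i),
			SuccessRate: 1.0, BaseLatency: 150 * time.Millisecond,
		})
	}
	// Transient: fails twice, then recovers and stays healthy.
	instances = append(instances, InstanceConfig{
		Type: "pve", Name: "pve-transient", SuccessRate: 1.0,
		FailureSeq: []FailureType{FailureTransient, FailureTransient, FailureNone},
	})
	// Permanent: keeps failing for the whole 55s window so it is routed to
	// the dead-letter queue and the circuit stays open.
	perm := make([]FailureType, 64)
	for i := range perm {
		perm[i] = FailurePermanent
	}
	instances = append(instances, InstanceConfig{
		Type: "pve", Name: "pve-permanent", FailureSeq: perm,
	})
	return HarnessScenario{Instances: instances}
}
```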

Runs with: `go test -tags=integration ./internal/monitoring -run TestAdaptiveSchedulerIntegration`

Part of Phase 2 Task 9 (Integration/Soak Testing)
2025-10-20 15:13:38 +00:00

217 lines · 4.9 KiB · Go

//go:build integration

package monitoring

import (
	"context"
	"fmt"
	"math/rand"
	"strings"
	"sync"
	"time"

	internalerrors "github.com/rcourtman/pulse-go-rewrite/internal/errors"
)
// fakeExecutor is a PollExecutor that simulates latencies, failures, and
// recovery for the instances defined in a HarnessScenario.
type fakeExecutor struct {
	monitor *Monitor
	configs map[string]InstanceConfig
	mu      sync.Mutex
	state   map[string]*instanceState
	rng     *rand.Rand
}
// instanceState accumulates per-instance execution statistics and tracks the
// position in the instance's scripted failure sequence.
type instanceState struct {
	config       InstanceConfig
	seqIndex     int
	successes    int
	failures     int
	transient    int
	permanent    int
	totalLatency time.Duration
	executions   int
	lastError    string
	lastSuccess  time.Time
}
// newFakeExecutor builds a fake executor keyed by instance type and name from
// the scenario's instance configs.
func newFakeExecutor(m *Monitor, scenario HarnessScenario) *fakeExecutor {
	cfgs := make(map[string]InstanceConfig, len(scenario.Instances))
	for _, inst := range scenario.Instances {
		key := instanceKey(inst.Type, inst.Name)
		cfgs[key] = inst
	}
	return &fakeExecutor{
		monitor: m,
		configs: cfgs,
		state:   make(map[string]*instanceState, len(cfgs)),
		rng:     rand.New(rand.NewSource(time.Now().UnixNano())),
	}
}
// Execute simulates a single poll: it sleeps for a synthetic latency, decides
// whether the poll succeeds or fails, and feeds the result into the monitor's
// metrics, staleness tracker, and scheduler just like a real poll would.
func (f *fakeExecutor) Execute(ctx context.Context, task PollTask) {
	start := time.Now()
	key := instanceKey(task.InstanceType, task.InstanceName)
	cfg, found := f.configs[key]
	if !found {
		// Unknown instances default to always-successful polls.
		cfg = InstanceConfig{
			Type:        task.InstanceType,
			Name:        task.InstanceName,
			SuccessRate: 1.0,
		}
	}
	state := f.getState(key, cfg)
	latency := f.latencyFor(cfg)

	// Simulate the poll taking time; abort if the task context is cancelled.
	select {
	case <-ctx.Done():
		return
	case <-time.After(latency):
	}

	failType := f.nextFailure(state, cfg)
	success := failType == FailureNone
	var pollErr error
	if !success {
		// Transient failures map to connection errors (retryable); permanent
		// failures map to validation errors (dead-letter queue candidates).
		err := fmt.Errorf("synthetic failure on %s", task.InstanceName)
		switch failType {
		case FailureTransient:
			pollErr = internalerrors.NewMonitorError(internalerrors.ErrorTypeConnection, "fake_poll", task.InstanceName, err)
		case FailurePermanent:
			pollErr = internalerrors.NewMonitorError(internalerrors.ErrorTypeValidation, "fake_poll", task.InstanceName, err)
		default:
			pollErr = internalerrors.NewMonitorError(internalerrors.ErrorTypeInternal, "fake_poll", task.InstanceName, err)
		}
	}

	result := PollResult{
		InstanceName: task.InstanceName,
		InstanceType: task.InstanceType,
		Success:      success,
		Error:        pollErr,
		StartTime:    start,
		EndTime:      time.Now(),
	}
	if f.monitor.pollMetrics != nil {
		f.monitor.pollMetrics.RecordResult(result)
	}

	instanceType := toInstanceType(task.InstanceType)
	if f.monitor.stalenessTracker != nil {
		if success {
			f.monitor.stalenessTracker.UpdateSuccess(instanceType, task.InstanceName, nil)
		} else {
			f.monitor.stalenessTracker.UpdateError(instanceType, task.InstanceName)
		}
	}
	f.monitor.recordTaskResult(instanceType, task.InstanceName, pollErr)
	f.recordStats(state, latency, success, failType, pollErr)
}
// InstanceReport returns a snapshot of per-instance statistics for test
// assertions.
func (f *fakeExecutor) InstanceReport() map[string]InstanceStats {
	f.mu.Lock()
	defer f.mu.Unlock()
	report := make(map[string]InstanceStats, len(f.state))
	for key, st := range f.state {
		avgLatency := time.Duration(0)
		if st.executions > 0 {
			avgLatency = st.totalLatency / time.Duration(st.executions)
		}
		report[key] = InstanceStats{
			Total:             st.executions,
			Successes:         st.successes,
			Failures:          st.failures,
			TransientFailures: st.transient,
			PermanentFailures: st.permanent,
			AverageLatency:    avgLatency,
			LastError:         st.lastError,
			LastSuccessAt:     st.lastSuccess,
		}
	}
	return report
}
// getState returns the tracked state for key, creating it on first use.
func (f *fakeExecutor) getState(key string, cfg InstanceConfig) *instanceState {
	f.mu.Lock()
	defer f.mu.Unlock()
	if st, ok := f.state[key]; ok {
		return st
	}
	st := &instanceState{config: cfg}
	f.state[key] = st
	return st
}
// latencyFor derives a synthetic latency from the instance's base latency
// (default 200ms), applying roughly ±10% jitter.
func (f *fakeExecutor) latencyFor(cfg InstanceConfig) time.Duration {
	base := cfg.BaseLatency
	if base <= 0 {
		base = 200 * time.Millisecond
	}
	jitter := base / 5
	if jitter <= 0 {
		return base
	}
	offset := time.Duration(f.rng.Int63n(int64(jitter))) - jitter/2
	return base + offset
}
// nextFailure decides the outcome of the next poll: scripted entries in
// FailureSeq are consumed first, then the configured SuccessRate drives a
// random choice between success and a transient failure.
func (f *fakeExecutor) nextFailure(state *instanceState, cfg InstanceConfig) FailureType {
	if state.seqIndex < len(cfg.FailureSeq) {
		ft := cfg.FailureSeq[state.seqIndex]
		state.seqIndex++
		return ft
	}
	successRate := cfg.SuccessRate
	if successRate <= 0 {
		return FailureTransient
	}
	if successRate >= 1 {
		return FailureNone
	}
	if f.rng.Float64() <= successRate {
		return FailureNone
	}
	return FailureTransient
}
// recordStats updates the per-instance counters after a simulated poll.
func (f *fakeExecutor) recordStats(state *instanceState, latency time.Duration, success bool, failure FailureType, pollErr error) {
	f.mu.Lock()
	defer f.mu.Unlock()
	state.executions++
	state.totalLatency += latency
	if success {
		state.successes++
		state.lastError = ""
		state.lastSuccess = time.Now()
		return
	}
	state.failures++
	if failure == FailureTransient {
		state.transient++
	} else if failure == FailurePermanent {
		state.permanent++
	}
	if pollErr != nil {
		state.lastError = pollErr.Error()
	}
}
// instanceKey builds a map key of the form "type::name" with the type lowercased.
func instanceKey(typ, name string) string {
	return fmt.Sprintf("%s::%s", strings.ToLower(typ), name)
}