//go:build integration package monitoring import ( "context" "flag" "fmt" "math" "os" "strconv" "strings" "testing" "time" ) var soakFlag = flag.Bool("soak", false, "run adaptive polling soak test") func TestAdaptiveSchedulerIntegration(t *testing.T) { scenario := HarnessScenario{ Duration: 45 * time.Second, WarmupDuration: 10 * time.Second, } for i := 0; i < 10; i++ { scenario.Instances = append(scenario.Instances, InstanceConfig{ Type: "pve", Name: fmt.Sprintf("pve-%02d", i), SuccessRate: 1.0, BaseLatency: 150 * time.Millisecond, }) } scenario.Instances = append(scenario.Instances, InstanceConfig{ Type: "pve", Name: "pve-transient", SuccessRate: 1.0, FailureSeq: []FailureType{ FailureTransient, FailureTransient, FailureTransient, FailureNone, FailureNone, FailureNone, }, BaseLatency: 120 * time.Millisecond, }) scenario.Instances = append(scenario.Instances, InstanceConfig{ Type: "pve", Name: "pve-permanent", SuccessRate: 1.0, FailureSeq: []FailureType{ FailurePermanent, }, BaseLatency: 160 * time.Millisecond, }) harness := NewHarness(scenario) ctx, cancel := context.WithTimeout(context.Background(), scenario.Duration+scenario.WarmupDuration+10*time.Second) defer cancel() report := harness.Run(ctx) instanceCount := len(scenario.Instances) if len(report.PerInstanceStats) != instanceCount { t.Fatalf("expected stats for %d instances, got %d", instanceCount, len(report.PerInstanceStats)) } for key, stats := range report.PerInstanceStats { if stats.Total == 0 { t.Fatalf("instance %s executed zero tasks", key) } if stats.AverageLatency <= 0 { t.Fatalf("instance %s reported invalid latency %v", key, stats.AverageLatency) } if stats.Successes > 0 && stats.LastSuccessAt.IsZero() { t.Fatalf("instance %s recorded successes but missing last success timestamp", key) } if stats.PermanentFailures == 0 && stats.TransientFailures == 0 && stats.Successes < 3 { t.Fatalf("instance %s expected to execute at least 3 successful polls, got %d", key, stats.Successes) } } maxAllowedDepth := int(math.Ceil(float64(instanceCount) * 1.5)) if report.QueueStats.MaxDepth > maxAllowedDepth { t.Fatalf("queue depth exceeded threshold: max %d, allowed %d", report.QueueStats.MaxDepth, maxAllowedDepth) } if report.QueueStats.FinalDepth > instanceCount { t.Fatalf("final queue depth %d exceeds instance count %d", report.QueueStats.FinalDepth, instanceCount) } if report.QueueStats.AverageDepth <= 0 { t.Fatalf("expected average queue depth > 0, got %f", report.QueueStats.AverageDepth) } if report.QueueStats.MaxDepth == 0 { t.Fatal("expected queue depth to grow beyond zero") } if report.Health.Queue.Depth != report.QueueStats.FinalDepth { t.Fatalf("health queue depth %d does not match final depth %d", report.Health.Queue.Depth, report.QueueStats.FinalDepth) } if report.Health.Queue.Depth > report.QueueStats.MaxDepth { t.Fatalf("health queue depth %d exceeds observed max %d", report.Health.Queue.Depth, report.QueueStats.MaxDepth) } if len(report.RuntimeSamples) >= 2 { startSample := report.RuntimeSamples[0] finalSample := report.RuntimeSamples[len(report.RuntimeSamples)-1] if startSample.HeapAlloc > 0 { growthRatio := float64(finalSample.HeapAlloc) / float64(startSample.HeapAlloc) if growthRatio > 1.25 && finalSample.HeapAlloc > startSample.HeapAlloc+5*1024*1024 { t.Fatalf("heap allocation grew too much: start=%d final=%d ratio=%.2f", startSample.HeapAlloc, finalSample.HeapAlloc, growthRatio) } } if finalSample.Goroutines > startSample.Goroutines+20 { t.Fatalf("goroutine count grew too much: start=%d final=%d", startSample.Goroutines, finalSample.Goroutines) } } maxStaleness := report.MaxStaleness if maxStaleness <= 0 { t.Fatalf("invalid max staleness value: %v", maxStaleness) } for _, snap := range report.Health.Staleness { key := instanceKey(snap.Type, snap.Instance) stats, ok := report.PerInstanceStats[key] if !ok { t.Fatalf("missing stats for staleness snapshot %s", key) } if stats.Successes == 0 || stats.PermanentFailures > 0 { continue } if stats.LastSuccessAt.IsZero() { t.Fatalf("missing last success timestamp for %s", key) } age := time.Since(stats.LastSuccessAt) maxHealthyAge := 20 * time.Second if maxHealthyAge > scenario.Duration { maxHealthyAge = scenario.Duration } if age > maxHealthyAge { t.Fatalf("instance %s staleness age %v exceeds healthy threshold %v", key, age, maxHealthyAge) } observedScore := age.Seconds() / maxStaleness.Seconds() if snap.Score < 0 || snap.Score > 1.01 { t.Fatalf("invalid staleness score %.2f for %s", snap.Score, key) } if math.Abs(snap.Score-observedScore) > 0.5 { t.Fatalf("staleness score %.2f for %s diverges from observed %.2f", snap.Score, key, observedScore) } } transientKey := instanceKey("pve", "pve-transient") transientStats, ok := report.PerInstanceStats[transientKey] if !ok { t.Fatalf("missing transient instance stats for %s", transientKey) } if transientStats.TransientFailures < 3 { t.Fatalf("expected at least 3 transient failures for %s, got %d", transientKey, transientStats.TransientFailures) } if transientStats.Successes == 0 { t.Fatalf("expected transient instance to recover with successes, got 0") } dlqKeys := map[string]struct{}{} for _, task := range report.Health.DeadLetter.Tasks { dlqKeys[instanceKey(task.Type, task.Instance)] = struct{}{} } if len(report.Health.Breakers) > len(dlqKeys) { t.Fatalf("unexpected number of circuit breaker entries: got %d want <= %d", len(report.Health.Breakers), len(dlqKeys)) } for _, breaker := range report.Health.Breakers { key := instanceKey(breaker.Type, breaker.Instance) if _, ok := dlqKeys[key]; !ok { t.Fatalf("unexpected circuit breaker entry: %+v", breaker) } if breaker.Failures <= 0 { t.Fatalf("expected breaker %s to record failures, got %d", key, breaker.Failures) } } expectedDLQ := map[string]struct{}{} for _, inst := range scenario.Instances { for _, ft := range inst.FailureSeq { if ft == FailurePermanent { expectedDLQ[instanceKey(inst.Type, inst.Name)] = struct{}{} break } } } if report.Health.DeadLetter.Count != len(expectedDLQ) { t.Fatalf("expected %d dead-letter tasks, got %d", len(expectedDLQ), report.Health.DeadLetter.Count) } if len(report.Health.DeadLetter.Tasks) != len(expectedDLQ) { t.Fatalf("dead-letter task list mismatch: expected %d, got %d", len(expectedDLQ), len(report.Health.DeadLetter.Tasks)) } for _, task := range report.Health.DeadLetter.Tasks { key := instanceKey(task.Type, task.Instance) if _, ok := expectedDLQ[key]; !ok { t.Fatalf("unexpected dead-letter task: %s", key) } delete(expectedDLQ, key) } if len(expectedDLQ) != 0 { t.Fatalf("missing dead-letter entries for: %v", expectedDLQ) } if !report.Health.Enabled { t.Fatal("expected adaptive polling to be enabled in scheduler health response") } } func TestAdaptiveSchedulerSoak(t *testing.T) { minutesEnv := os.Getenv("HARNESS_SOAK_MINUTES") if !*soakFlag && minutesEnv == "" { t.Skip("skipping soak test (enable with -soak or HARNESS_SOAK_MINUTES)") } minutes := 15 if minutesEnv != "" { if parsed, err := strconv.Atoi(minutesEnv); err == nil && parsed > 0 { minutes = parsed } } duration := time.Duration(minutes) * time.Minute warmup := 2 * time.Minute scenario := HarnessScenario{Duration: duration, WarmupDuration: warmup} for i := 0; i < 60; i++ { scenario.Instances = append(scenario.Instances, InstanceConfig{ Type: "pve", Name: fmt.Sprintf("soak-healthy-%02d", i), SuccessRate: 0.98, BaseLatency: 200 * time.Millisecond, }) } for i := 0; i < 15; i++ { scenario.Instances = append(scenario.Instances, InstanceConfig{ Type: "pve", Name: fmt.Sprintf("soak-transient-%02d", i), SuccessRate: 0.85, FailureSeq: []FailureType{ FailureTransient, FailureTransient, FailureNone, FailureTransient, FailureNone, }, BaseLatency: 220 * time.Millisecond, }) } permanentCount := 5 for i := 0; i < permanentCount; i++ { scenario.Instances = append(scenario.Instances, InstanceConfig{ Type: "pve", Name: fmt.Sprintf("soak-permanent-%02d", i), SuccessRate: 0, FailureSeq: []FailureType{ FailureTransient, FailureTransient, FailurePermanent, }, BaseLatency: 250 * time.Millisecond, }) } harness := NewHarness(scenario) ctx, cancel := context.WithTimeout(context.Background(), duration+warmup+5*time.Minute) defer cancel() report := harness.Run(ctx) if len(report.RuntimeSamples) < 2 { t.Fatalf("expected runtime samples, got %d", len(report.RuntimeSamples)) } startSample := report.RuntimeSamples[0] warmupEnd := startSample.Timestamp.Add(report.Scenario.WarmupDuration) baseline := startSample for _, sample := range report.RuntimeSamples { if !sample.Timestamp.Before(warmupEnd) { baseline = sample break } } finalSample := report.RuntimeSamples[len(report.RuntimeSamples)-1] if baseline.HeapAlloc > 0 { allowed := float64(baseline.HeapAlloc)*1.10 + 10*1024*1024 if float64(finalSample.HeapAlloc) > allowed { t.Fatalf("heap allocation grew too much: baseline=%d final=%d", baseline.HeapAlloc, finalSample.HeapAlloc) } } if finalSample.Goroutines > baseline.Goroutines+20 { t.Fatalf("goroutine count grew too much: baseline=%d final=%d", baseline.Goroutines, finalSample.Goroutines) } if report.QueueStats.FinalDepth > len(scenario.Instances) { t.Fatalf("final queue depth %d exceeds instance count %d", report.QueueStats.FinalDepth, len(scenario.Instances)) } healthyThreshold := 45 * time.Second for key, stats := range report.PerInstanceStats { if stats.Successes == 0 || stats.PermanentFailures > 0 { continue } if stats.LastSuccessAt.IsZero() { t.Fatalf("missing last success timestamp for %s", key) } age := time.Since(stats.LastSuccessAt) if age > healthyThreshold { t.Fatalf("instance %s staleness age %v exceeds threshold %v", key, age, healthyThreshold) } } if len(report.Health.DeadLetter.Tasks) != permanentCount { t.Fatalf("expected %d dead-letter tasks, got %d", permanentCount, len(report.Health.DeadLetter.Tasks)) } for name, stats := range report.PerInstanceStats { if strings.Contains(name, "transient") { if stats.TransientFailures == 0 { t.Fatalf("expected transient failures for %s", name) } if stats.Successes == 0 { t.Fatalf("expected recoveries for %s", name) } } } if !report.Health.Enabled { t.Fatal("expected adaptive polling to be enabled during soak run") } t.Logf("soak run complete: instances=%d duration=%v samples=%d heap(start=%d end=%d) goroutines(start=%d end=%d)", len(scenario.Instances), duration, len(report.RuntimeSamples), baseline.HeapAlloc, finalSample.HeapAlloc, baseline.Goroutines, finalSample.Goroutines) }