package api import ( "encoding/json" "fmt" "net/http" "net/http/httptest" "os" "path/filepath" "sync" "sync/atomic" "testing" "time" "github.com/rcourtman/pulse-go-rewrite/internal/config" "github.com/rcourtman/pulse-go-rewrite/internal/models" "github.com/rcourtman/pulse-go-rewrite/internal/monitoring" "github.com/rcourtman/pulse-go-rewrite/pkg/metrics" ) // TestLoad_500Node_ConcurrentResources validates that the /api/resources // endpoint sustains acceptable latency under concurrent load with 500 nodes // and ~2500 VMs in state, simulating a large self-hosted deployment. // // The test launches 50 concurrent goroutines each making serial requests over // a 2-second window. It measures p50/p95/p99 and total throughput. func TestLoad_500Node_ConcurrentResources(t *testing.T) { skipUnderRace(t) suppressTestLogs(t) state := buildLargeDeploymentState(t, 500) cfg := &config.Config{DataPath: t.TempDir()} handlers := NewResourceHandlers(cfg) handlers.SetStateProvider(&loadTestStateProvider{state: state}) // Warm the cache with an initial request. req := httptest.NewRequest(http.MethodGet, "/api/resources", nil) rec := httptest.NewRecorder() handlers.HandleListResources(rec, req) if rec.Code != http.StatusOK { t.Fatalf("warmup failed: status %d, body: %s", rec.Code, rec.Body.String()) } var warmupResp map[string]interface{} if err := json.Unmarshal(rec.Body.Bytes(), &warmupResp); err != nil { t.Fatalf("warmup unmarshal: %v", err) } meta, _ := warmupResp["meta"].(map[string]interface{}) total, _ := meta["total"].(float64) // 500 nodes + 2500 VMs = 3000 resources exactly. if int(total) != 3000 { t.Fatalf("warmup: expected 3000 total resources, got %v", total) } t.Logf("state populated: %.0f total resources", total) const ( concurrency = 50 duration = 2 * time.Second ) var ( latencies = make([][]time.Duration, concurrency) errors int64 totalCount int64 ) // Use a barrier to start all goroutines simultaneously, ensuring wall-clock // timing starts after goroutine launch overhead. var ready sync.WaitGroup ready.Add(concurrency) var wg sync.WaitGroup for g := 0; g < concurrency; g++ { wg.Add(1) g := g latencies[g] = make([]time.Duration, 0, 200) go func() { defer wg.Done() ready.Done() ready.Wait() // All goroutines wait here until everyone is launched deadline := time.Now().Add(duration) for time.Now().Before(deadline) { reqStart := time.Now() req := httptest.NewRequest(http.MethodGet, "/api/resources?limit=50", nil) rec := httptest.NewRecorder() handlers.HandleListResources(rec, req) elapsed := time.Since(reqStart) if rec.Code != http.StatusOK { atomic.AddInt64(&errors, 1) continue } latencies[g] = append(latencies[g], elapsed) atomic.AddInt64(&totalCount, 1) } }() } ready.Wait() start := time.Now() wg.Wait() wallTime := time.Since(start) if errors > 0 { t.Errorf("got %d error responses", errors) } all := mergeLatencies(latencies) if len(all) == 0 { t.Fatal("no successful requests recorded") } p50 := percentile(all, 0.50) p95 := percentile(all, 0.95) p99 := percentile(all, 0.99) rps := float64(totalCount) / wallTime.Seconds() t.Logf("resources 500-node load: %d requests in %v (%.1f rps)", totalCount, wallTime, rps) t.Logf(" p50=%v p95=%v p99=%v", p50, p95, p99) // At 500-node scale (3000 resources) with 50 concurrent goroutines, the // resource handler processes snapshot comparison, filtering, sorting, // pagination, and JSON serialization under mutex contention. The p95 budget // accounts for this in-process overhead — it catches gross regressions // (e.g., O(n²) algorithms, lock contention bugs) without being flaky. // Budget set to 3s to accommodate -cpu=1 CI runners. if p95 > 3*time.Second { t.Errorf("p95 latency %v exceeds 3s budget for 500-node concurrent resources load", p95) } // Throughput includes in-flight request overrun past the 2s window, so // actual wall time may exceed 2s. RPS = completed requests / wall time, // which measures sustained throughput under contention (standard approach). // Floor is set conservatively for -cpu=1 CI runners (~47 rps observed). if rps < 30 { t.Errorf("throughput %.1f rps is below minimum 30 rps threshold", rps) } } // TestLoad_500Node_ConcurrentMetricsHistory validates that /api/metrics-store/history // sustains acceptable latency when 50 concurrent goroutines query chart data for // resources across a 500-node deployment. func TestLoad_500Node_ConcurrentMetricsHistory(t *testing.T) { skipUnderRace(t) suppressTestLogs(t) store := newLoadTestMetricsStore(t) const ( numResources = 100 // 100 VMs with history data (representative subset) numPoints = 200 ) metricTypes := []string{"cpu", "memory"} ids := seedLoadTestMetrics(t, store, "vm", metricTypes, numResources, numPoints) state := models.NewState() monitor := &monitoring.Monitor{} setTestUnexportedField(t, monitor, "state", state) setTestUnexportedField(t, monitor, "metricsHistory", monitoring.NewMetricsHistory(10, time.Hour)) setTestUnexportedField(t, monitor, "metricsStore", store) tempDir := t.TempDir() mtp := config.NewMultiTenantPersistence(tempDir) if _, err := mtp.GetPersistence("default"); err != nil { t.Fatalf("failed to init persistence: %v", err) } router := &Router{ monitor: monitor, licenseHandlers: NewLicenseHandlers(mtp, false), } // Sanity check: verify the store path is exercised and returns data. checkURL := "/api/metrics-store/history?resourceType=vm&resourceId=" + ids[0] + "&metric=cpu&range=1h" sanityReq := httptest.NewRequest(http.MethodGet, checkURL, nil) sanityRec := httptest.NewRecorder() router.handleMetricsHistory(sanityRec, sanityReq) if sanityRec.Code != http.StatusOK { t.Fatalf("sanity check failed: status %d, body: %s", sanityRec.Code, sanityRec.Body.String()) } var sanityResp metricsHistoryResponse if err := json.Unmarshal(sanityRec.Body.Bytes(), &sanityResp); err != nil { t.Fatalf("sanity check: unmarshal failed: %v", err) } if sanityResp.Source != "store" { t.Fatalf("sanity check: expected source=store, got %q", sanityResp.Source) } if len(sanityResp.Points) == 0 { t.Fatal("sanity check: expected non-empty points") } // Precompute URLs to avoid string building in timed loop. urls := make([]string, len(ids)) for i, id := range ids { urls[i] = "/api/metrics-store/history?resourceType=vm&resourceId=" + id + "&metric=cpu&range=1h" } const ( concurrency = 50 duration = 2 * time.Second ) var ( latencies = make([][]time.Duration, concurrency) errors int64 totalCount int64 ) var ready sync.WaitGroup ready.Add(concurrency) var wg sync.WaitGroup for g := 0; g < concurrency; g++ { wg.Add(1) g := g latencies[g] = make([]time.Duration, 0, 200) go func() { defer wg.Done() ready.Done() ready.Wait() deadline := time.Now().Add(duration) idx := g // Each goroutine starts with a different resource for time.Now().Before(deadline) { reqStart := time.Now() req := httptest.NewRequest(http.MethodGet, urls[idx%len(urls)], nil) rec := httptest.NewRecorder() router.handleMetricsHistory(rec, req) elapsed := time.Since(reqStart) if rec.Code != http.StatusOK { atomic.AddInt64(&errors, 1) continue } latencies[g] = append(latencies[g], elapsed) atomic.AddInt64(&totalCount, 1) idx++ } }() } ready.Wait() start := time.Now() wg.Wait() wallTime := time.Since(start) if errors > 0 { t.Errorf("got %d error responses", errors) } all := mergeLatencies(latencies) if len(all) == 0 { t.Fatal("no successful requests recorded") } p50 := percentile(all, 0.50) p95 := percentile(all, 0.95) p99 := percentile(all, 0.99) rps := float64(totalCount) / wallTime.Seconds() t.Logf("metrics-history 500-node load: %d requests in %v (%.1f rps)", totalCount, wallTime, rps) t.Logf(" p50=%v p95=%v p99=%v", p50, p95, p99) // Under concurrent SQLite reads (WAL mode), p95 should stay under 200ms. // This budget accounts for in-process test overhead with 50 goroutines // contending on a single SQLite file. if p95 > 200*time.Millisecond { t.Errorf("p95 latency %v exceeds 200ms budget for concurrent metrics-history load", p95) } if rps < 500 { t.Errorf("throughput %.0f rps is below minimum 500 rps threshold", rps) } } // TestLoad_500Node_MixedEndpoints simulates a realistic dashboard load pattern: // concurrent requests hitting /api/resources, /api/metrics-store/history, and // /api/metrics-store/stats simultaneously, as happens when a user opens a // dashboard showing a 500-node fleet. func TestLoad_500Node_MixedEndpoints(t *testing.T) { skipUnderRace(t) suppressTestLogs(t) state := buildLargeDeploymentState(t, 500) store := newLoadTestMetricsStore(t) metricTypes := []string{"cpu", "memory"} ids := seedLoadTestMetrics(t, store, "vm", metricTypes, 100, 200) monitor := &monitoring.Monitor{} setTestUnexportedField(t, monitor, "state", state) setTestUnexportedField(t, monitor, "metricsHistory", monitoring.NewMetricsHistory(10, time.Hour)) setTestUnexportedField(t, monitor, "metricsStore", store) tempDir := t.TempDir() mtp := config.NewMultiTenantPersistence(tempDir) if _, err := mtp.GetPersistence("default"); err != nil { t.Fatalf("failed to init persistence: %v", err) } routerObj := &Router{ monitor: monitor, licenseHandlers: NewLicenseHandlers(mtp, false), } cfg := &config.Config{DataPath: t.TempDir()} resourceHandlers := NewResourceHandlers(cfg) resourceHandlers.SetStateProvider(&loadTestStateProvider{state: state}) // Warm caches. warmReq := httptest.NewRequest(http.MethodGet, "/api/resources", nil) warmRec := httptest.NewRecorder() resourceHandlers.HandleListResources(warmRec, warmReq) if warmRec.Code != http.StatusOK { t.Fatalf("warmup resources failed: status %d", warmRec.Code) } // Precompute history URLs. historyURLs := make([]string, len(ids)) for i, id := range ids { historyURLs[i] = "/api/metrics-store/history?resourceType=vm&resourceId=" + id + "&metric=cpu&range=1h" } const duration = 2 * time.Second type endpointResult struct { name string latencies []time.Duration errors int64 count int64 } var histIdx int64 // test-local counter for round-robin var mu sync.Mutex // 3 endpoint groups, each with their own concurrency. groups := []struct { name string concurrency int fn func() int // returns HTTP status code }{ { name: "resources", concurrency: 20, fn: func() int { req := httptest.NewRequest(http.MethodGet, "/api/resources?limit=50", nil) rec := httptest.NewRecorder() resourceHandlers.HandleListResources(rec, req) return rec.Code }, }, { name: "metrics-history", concurrency: 25, fn: func() int { idx := int(atomic.AddInt64(&histIdx, 1)) req := httptest.NewRequest(http.MethodGet, historyURLs[idx%len(historyURLs)], nil) rec := httptest.NewRecorder() routerObj.handleMetricsHistory(rec, req) return rec.Code }, }, { name: "metrics-stats", concurrency: 5, fn: func() int { req := httptest.NewRequest(http.MethodGet, "/api/metrics-store/stats", nil) rec := httptest.NewRecorder() routerObj.handleMetricsStoreStats(rec, req) return rec.Code }, }, } results := make([]endpointResult, len(groups)) var ready sync.WaitGroup totalWorkers := 0 for _, grp := range groups { totalWorkers += grp.concurrency } ready.Add(totalWorkers) var wg sync.WaitGroup for gi, grp := range groups { results[gi].name = grp.name for c := 0; c < grp.concurrency; c++ { wg.Add(1) gi := gi fn := grp.fn go func() { defer wg.Done() ready.Done() ready.Wait() deadline := time.Now().Add(duration) var localLats []time.Duration var localErrs int64 for time.Now().Before(deadline) { reqStart := time.Now() code := fn() elapsed := time.Since(reqStart) if code != http.StatusOK { localErrs++ continue } localLats = append(localLats, elapsed) } mu.Lock() results[gi].latencies = append(results[gi].latencies, localLats...) results[gi].errors += localErrs results[gi].count += int64(len(localLats)) mu.Unlock() }() } } ready.Wait() start := time.Now() wg.Wait() wallTime := time.Since(start) t.Logf("mixed-endpoint 500-node load test completed in %v", wallTime) var totalErrors int64 for _, r := range results { if len(r.latencies) == 0 { t.Errorf("[%s] no successful requests", r.name) continue } p50 := percentile(r.latencies, 0.50) p95 := percentile(r.latencies, 0.95) p99 := percentile(r.latencies, 0.99) rps := float64(r.count) / wallTime.Seconds() t.Logf(" [%s] %d requests (%.1f rps) p50=%v p95=%v p99=%v errors=%d", r.name, r.count, rps, p50, p95, p99, r.errors) totalErrors += r.errors } if totalErrors > 0 { t.Errorf("total %d error responses across all endpoints", totalErrors) } // Under mixed concurrent load (50 goroutines across 3 endpoint types // sharing a single SQLite store + in-memory state), p95 per endpoint // should stay under 3s. This catches gross regressions (lock inversions, // O(n²) serialization) without being flaky on -cpu=1 CI runners. for _, r := range results { if len(r.latencies) == 0 { continue } p95 := percentile(r.latencies, 0.95) target := effectiveMixedLoadP95Budget(r.name, 3*time.Second) if p95 > target { t.Errorf("[%s] p95=%v exceeds %v budget under mixed load", r.name, p95, target) } } // Minimum request volume per endpoint group — ensures throughput didn't // collapse under contention. Counts include in-flight overrun past the // nominal 2s window (RPS = count / wallTime, standard approach). // The mixed-load metrics-history floor is lower because each request does a // real store query plus canonical target resolution while contending with // resource and stats endpoints on the same process. minCounts := map[string]int64{ "resources": effectiveLoadMinCount(50, 50), "metrics-history": effectiveLoadMinCount(30, 24), "metrics-stats": effectiveLoadMinCount(10, 10), } for _, r := range results { if minCount, ok := minCounts[r.name]; ok && r.count < minCount { t.Errorf("[%s] completed only %d requests, expected at least %d", r.name, r.count, minCount) } } } // --- Load test helpers --- // buildLargeDeploymentState creates a State with exactly numNodes nodes and // 5 VMs per node, simulating a large deployment. Nodes are distributed evenly // across 10 PVE instances; any remainder is spread one-per-instance to the // first N instances (where N = numNodes % 10). func buildLargeDeploymentState(t *testing.T, numNodes int) *models.State { t.Helper() state := models.NewState() const numInstances = 10 basePerInstance := numNodes / numInstances remainder := numNodes % numInstances nodesSoFar := 0 for inst := 0; inst < numInstances; inst++ { instanceName := fmt.Sprintf("pve%d", inst) count := basePerInstance if inst < remainder { count++ // Distribute remainder across first N instances } if count == 0 { continue } nodes := make([]models.Node, count) for n := 0; n < count; n++ { globalIdx := nodesSoFar + n nodes[n] = models.Node{ ID: fmt.Sprintf("%s:node%d", instanceName, n), Name: fmt.Sprintf("node-%d", globalIdx), Instance: instanceName, Status: "online", CPU: float64(globalIdx%80+10) / 100.0, Memory: models.Memory{Usage: float64(globalIdx%60 + 20), Total: 64 << 30, Used: 32 << 30}, Disk: models.Disk{Usage: float64(globalIdx%40 + 30), Total: 500 << 30, Used: 250 << 30}, } } state.UpdateNodesForInstance(instanceName, nodes) // 5 VMs per node. vms := make([]models.VM, count*5) for v := range vms { nodeIdx := v / 5 vmGlobalIdx := (nodesSoFar+nodeIdx)*5 + (v % 5) vms[v] = models.VM{ ID: fmt.Sprintf("%s:node%d:%d", instanceName, nodeIdx, 1000+vmGlobalIdx), VMID: 1000 + vmGlobalIdx, Name: fmt.Sprintf("vm-%d", vmGlobalIdx), Node: fmt.Sprintf("node%d", nodeIdx), Instance: instanceName, Status: "running", Type: "qemu", CPU: float64(vmGlobalIdx%80+10) / 100.0, Memory: models.Memory{Usage: float64(vmGlobalIdx%60 + 20), Total: 4 << 30, Used: 2 << 30}, Disk: models.Disk{Usage: float64(vmGlobalIdx%40 + 30), Total: 50 << 30, Used: 25 << 30}, } } state.UpdateVMsForInstance(instanceName, vms) nodesSoFar += count } return state } func effectiveLoadMinCount(localMinCount, githubActionsMinCount int64) int64 { if githubActionsMinCount > 0 && os.Getenv("GITHUB_ACTIONS") == "true" { return githubActionsMinCount } return localMinCount } func effectiveMixedLoadP95Budget(endpoint string, localTarget time.Duration) time.Duration { if os.Getenv("GITHUB_ACTIONS") != "true" { return localTarget } switch endpoint { case "metrics-history": return 8 * time.Second case "metrics-stats": return 4 * time.Second default: return localTarget } } // newLoadTestMetricsStore creates an ephemeral metrics store for load tests. func newLoadTestMetricsStore(t *testing.T) *metrics.Store { t.Helper() dir := t.TempDir() cfg := metrics.DefaultConfig(dir) cfg.DBPath = filepath.Join(dir, "load-test.db") cfg.FlushInterval = time.Hour cfg.WriteBufferSize = 50_000 store, err := metrics.NewStore(cfg) if err != nil { t.Fatalf("NewStore: %v", err) } t.Cleanup(func() { store.Close() }) return store } // seedLoadTestMetrics writes test metrics data (mirrors seedTestMetrics but // for load test scale). func seedLoadTestMetrics(t *testing.T, store *metrics.Store, resourceType string, metricTypes []string, numResources, numPoints int) []string { t.Helper() base := time.Now().Add(-50 * time.Minute) ids := make([]string, numResources) batch := make([]metrics.WriteMetric, 0, numResources*numPoints*len(metricTypes)) for r := 0; r < numResources; r++ { id := fmt.Sprintf("%s-load-%d", resourceType, r) ids[r] = id for _, mt := range metricTypes { for p := 0; p < numPoints; p++ { batch = append(batch, metrics.WriteMetric{ ResourceType: resourceType, ResourceID: id, MetricType: mt, Value: float64(p % 100), Timestamp: base.Add(time.Duration(p) * 15 * time.Second), Tier: metrics.TierRaw, }) } } } store.WriteBatchSync(batch) return ids } // mergeLatencies flattens per-goroutine latency slices into a single slice. func mergeLatencies(perGoroutine [][]time.Duration) []time.Duration { total := 0 for _, s := range perGoroutine { total += len(s) } merged := make([]time.Duration, 0, total) for _, s := range perGoroutine { merged = append(merged, s...) } return merged } // loadTestStateProvider implements StateProvider for load tests. type loadTestStateProvider struct { state *models.State } func (p *loadTestStateProvider) ReadSnapshot() models.StateSnapshot { return p.state.GetSnapshot() }