// Pulse/internal/api/slo_bench_test.go

package api

import (
	"encoding/json"
	"fmt"
	"net/http"
	"net/http/httptest"
	"os"
	"path/filepath"
	"reflect"
	"sort"
	"testing"
	"time"
	"unsafe"

	"github.com/rcourtman/pulse-go-rewrite/internal/config"
	"github.com/rcourtman/pulse-go-rewrite/internal/models"
	"github.com/rcourtman/pulse-go-rewrite/internal/monitoring"
	"github.com/rcourtman/pulse-go-rewrite/pkg/metrics"
	"github.com/rs/zerolog"
	"github.com/rs/zerolog/log"
)

const (
	// Shared GitHub runners usually keep the store-backed history path well
	// under 5ms p95, but single-core contention can spike it to ~10.6ms, as
	// seen on the April 9, 2026 RC stabilization pass. Keep the local budget
	// unchanged and allow a narrow hosted-runner envelope.
	sloMetricsHistoryStoreGitHubActionsP95 = 12 * time.Millisecond

	// Shared GitHub runners pushed the cached /api/resources hot path just over
	// the strict 3ms local target on the April 9, 2026 RC dry run (~3.05ms p95).
	// Keep the local budget unchanged and allow a small hosted-runner envelope.
	sloResourcesListGitHubActionsP95 = 5 * time.Millisecond

	// Shared runners remain materially slower than local serial proofs on the
	// current unified-resource chart paths. A serial local run on April 11, 2026
	// measured ~44.4ms p95, while the governed RC rehearsal on the same day hit
	// ~255.3ms p95. Keep the local endpoint budget unchanged and allow only a
	// narrow hosted-runner envelope above the observed rehearsal result.
	sloInfrastructureChartsGitHubActionsP95 = 275 * time.Millisecond

	// Shared runners also drifted above the April 9 hosted baseline for workload
	// charts. A serial local run on April 11, 2026 measured ~82.2ms p95, while
	// the governed RC rehearsal on the same day hit ~514.0ms p95. Keep the local
	// SLO strict and align the GitHub Actions ceiling to the current envelope.
	sloWorkloadChartsGitHubActionsP95         = 550 * time.Millisecond
	sloWorkloadsSummaryChartsGitHubActionsP95 = sloWorkloadChartsGitHubActionsP95
)
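
// Note that the hosted-runner envelopes above are opt-in: effectiveAPISLOTarget
// (defined with the helpers at the bottom of this file) returns them only when
// GITHUB_ACTIONS=true. Every other environment keeps the strict local SLO
// constants (SLOResourcesListP95, SLOWorkloadChartsP95, and friends).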

// suppressTestLogs disables zerolog for the duration of a test.
func suppressTestLogs(t *testing.T) {
	t.Helper()
	orig := log.Logger
	log.Logger = zerolog.Nop()
	t.Cleanup(func() { log.Logger = orig })
}
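
// Because log.Logger is package-global state, tests that use suppressTestLogs
// must stay serial; combining it with t.Parallel would let one test restore
// the logger while another still expects it to be silenced.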

// setTestUnexportedField sets an unexported field on a struct via reflection.
func setTestUnexportedField(t *testing.T, target interface{}, field string, value interface{}) {
	t.Helper()
	v := reflect.ValueOf(target).Elem()
	f := v.FieldByName(field)
	if !f.IsValid() {
		t.Fatalf("field %q not found", field)
	}
	// f.Set panics on values obtained through unexported fields, so rebuild an
	// addressable, writable value at the same address and set through that.
	ptr := unsafe.Pointer(f.UnsafeAddr())
	reflect.NewAt(f.Type(), ptr).Elem().Set(reflect.ValueOf(value))
}
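
// This depends on unsafe and on monitoring.Monitor keeping its current field
// names; the SLO tests below use it for exactly three fields (state,
// metricsHistory, metricsStore). A rename fails fast via the t.Fatalf above
// rather than silently benchmarking an unconfigured monitor.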

// TestSLO_MetricsHistoryStore validates that the metrics-store/history handler
// (SQLite path) meets SLOMetricsHistoryStoreP95 under benchmark conditions.
func TestSLO_MetricsHistoryStore(t *testing.T) {
	skipUnderRace(t)
	suppressTestLogs(t)

	store := newTestMetricsStore(t)
	const numPoints = 500
	metricTypes := []string{"cpu", "memory", "disk", "netin"}
	ids := seedTestMetrics(t, store, "vm", metricTypes, 10, numPoints)

	state := models.NewState()
	monitor := &monitoring.Monitor{}
	setTestUnexportedField(t, monitor, "state", state)
	setTestUnexportedField(t, monitor, "metricsHistory", monitoring.NewMetricsHistory(10, time.Hour))
	setTestUnexportedField(t, monitor, "metricsStore", store)

	tempDir := t.TempDir()
	mtp := config.NewMultiTenantPersistence(tempDir)
	if _, err := mtp.GetPersistence("default"); err != nil {
		t.Fatalf("failed to init persistence: %v", err)
	}
	router := &Router{
		monitor:         monitor,
		licenseHandlers: NewLicenseHandlers(mtp, false),
	}

	url := "/api/metrics-store/history?resourceType=vm&resourceId=" + ids[0] + "&metric=cpu&range=1h"

	// Sanity check: verify the store path is exercised and returns expected data.
	sanityReq := httptest.NewRequest(http.MethodGet, url, nil)
	sanityRec := httptest.NewRecorder()
	router.handleMetricsHistory(sanityRec, sanityReq)
	if sanityRec.Code != http.StatusOK {
		t.Fatalf("sanity check failed: status %d, body: %s", sanityRec.Code, sanityRec.Body.String())
	}
	var sanityResp metricsHistoryResponse
	if err := json.Unmarshal(sanityRec.Body.Bytes(), &sanityResp); err != nil {
		t.Fatalf("sanity check: unmarshal failed: %v", err)
	}
	if sanityResp.Source != "store" {
		t.Fatalf("sanity check: expected source=store, got %q", sanityResp.Source)
	}
	if len(sanityResp.Points) == 0 {
		t.Fatal("sanity check: expected non-empty points from store path")
	}

	latencies := measureEndpointLatencies(t, func() {
		req := httptest.NewRequest(http.MethodGet, url, nil)
		rec := httptest.NewRecorder()
		router.handleMetricsHistory(rec, req)
		if rec.Code != http.StatusOK {
			t.Fatalf("unexpected status %d", rec.Code)
		}
	})

	p95 := percentile(latencies, 0.95)
	target := effectiveAPISLOTarget(SLOMetricsHistoryStoreP95, sloMetricsHistoryStoreGitHubActionsP95)
	t.Logf("metrics-store/history (store) p50=%v p95=%v p99=%v SLO=%v",
		percentile(latencies, 0.50), p95, percentile(latencies, 0.99), target)
	if p95 > target {
		t.Errorf("SLO VIOLATION: p95=%v exceeds target %v", p95, target)
	}
}

// TestSLO_MetricsHistoryMemory validates the in-memory fallback path.
func TestSLO_MetricsHistoryMemory(t *testing.T) {
	skipUnderRace(t)
	suppressTestLogs(t)

	state := models.NewState()
	vms := make([]models.VM, 10)
	for i := range vms {
		vms[i] = models.VM{
			ID:       fmt.Sprintf("pve1:node1:%d", 100+i),
			VMID:     100 + i,
			Name:     fmt.Sprintf("vm-%d", 100+i),
			Node:     "node1",
			Instance: "pve1",
			Status:   "running",
			Type:     "qemu",
			CPU:      float64(i%80+10) / 100.0,
			Memory:   models.Memory{Usage: float64(i%60 + 20)},
			Disk:     models.Disk{Usage: float64(i%40 + 30)},
		}
	}
	state.UpdateVMsForInstance("pve1", vms)

	mh := monitoring.NewMetricsHistory(1000, time.Hour)
	now := time.Now()
	for _, vm := range vms {
		for j := 0; j < 60; j++ {
			ts := now.Add(time.Duration(-60+j) * time.Minute)
			mh.AddGuestMetric(vm.ID, "cpu", vm.CPU*100+float64(j%10), ts)
			mh.AddGuestMetric(vm.ID, "memory", vm.Memory.Usage+float64(j%5), ts)
		}
	}

	monitor := &monitoring.Monitor{}
	setTestUnexportedField(t, monitor, "state", state)
	setTestUnexportedField(t, monitor, "metricsHistory", mh)
	router := &Router{monitor: monitor}

	url := "/api/metrics-store/history?resourceType=vm&resourceId=pve1:node1:100&metric=cpu&range=1h"

	// Sanity check: verify the memory fallback path is exercised.
	sanityReq := httptest.NewRequest(http.MethodGet, url, nil)
	sanityRec := httptest.NewRecorder()
	router.handleMetricsHistory(sanityRec, sanityReq)
	if sanityRec.Code != http.StatusOK {
		t.Fatalf("sanity check failed: status %d, body: %s", sanityRec.Code, sanityRec.Body.String())
	}
	var sanityResp metricsHistoryResponse
	if err := json.Unmarshal(sanityRec.Body.Bytes(), &sanityResp); err != nil {
		t.Fatalf("sanity check: unmarshal failed: %v", err)
	}
	if sanityResp.Source != "memory" {
		t.Fatalf("sanity check: expected source=memory, got %q", sanityResp.Source)
	}
	if len(sanityResp.Points) == 0 {
		t.Fatal("sanity check: expected non-empty points from memory fallback")
	}

	latencies := measureEndpointLatencies(t, func() {
		req := httptest.NewRequest(http.MethodGet, url, nil)
		rec := httptest.NewRecorder()
		router.handleMetricsHistory(rec, req)
		if rec.Code != http.StatusOK {
			t.Fatalf("unexpected status %d", rec.Code)
		}
	})

	p95 := percentile(latencies, 0.95)
	t.Logf("metrics-store/history (memory) p50=%v p95=%v p99=%v SLO=%v",
		percentile(latencies, 0.50), p95, percentile(latencies, 0.99), SLOMetricsHistoryMemoryP95)
	if p95 > SLOMetricsHistoryMemoryP95 {
		t.Errorf("SLO VIOLATION: p95=%v exceeds target %v", p95, SLOMetricsHistoryMemoryP95)
	}
}

// TestSLO_MetricsStoreStats validates the /api/metrics-store/stats endpoint.
func TestSLO_MetricsStoreStats(t *testing.T) {
	skipUnderRace(t)
	suppressTestLogs(t)

	store := newTestMetricsStore(t)
	seedTestMetrics(t, store, "node", []string{"cpu"}, 5, 100)

	monitor := &monitoring.Monitor{}
	setTestUnexportedField(t, monitor, "state", models.NewState())
	setTestUnexportedField(t, monitor, "metricsHistory", monitoring.NewMetricsHistory(10, time.Hour))
	setTestUnexportedField(t, monitor, "metricsStore", store)
	router := &Router{monitor: monitor}

	// Sanity check: verify stats endpoint returns valid data.
	sanityReq := httptest.NewRequest(http.MethodGet, "/api/metrics-store/stats", nil)
	sanityRec := httptest.NewRecorder()
	router.handleMetricsStoreStats(sanityRec, sanityReq)
	if sanityRec.Code != http.StatusOK {
		t.Fatalf("sanity check failed: status %d", sanityRec.Code)
	}
	var statsCheck map[string]interface{}
	if err := json.Unmarshal(sanityRec.Body.Bytes(), &statsCheck); err != nil {
		t.Fatalf("sanity check: unmarshal failed: %v", err)
	}
	if enabled, _ := statsCheck["enabled"].(bool); !enabled {
		t.Fatal("sanity check: expected enabled=true in stats response")
	}

	latencies := measureEndpointLatencies(t, func() {
		req := httptest.NewRequest(http.MethodGet, "/api/metrics-store/stats", nil)
		rec := httptest.NewRecorder()
		router.handleMetricsStoreStats(rec, req)
		if rec.Code != http.StatusOK {
			t.Fatalf("unexpected status %d", rec.Code)
		}
	})

	p95 := percentile(latencies, 0.95)
	t.Logf("metrics-store/stats p50=%v p95=%v p99=%v SLO=%v",
		percentile(latencies, 0.50), p95, percentile(latencies, 0.99), SLOMetricsStoreStatsP95)
	if p95 > SLOMetricsStoreStatsP95 {
		t.Errorf("SLO VIOLATION: p95=%v exceeds target %v", p95, SLOMetricsStoreStatsP95)
	}
}

// TestSLO_ResourcesList validates the GET /api/resources endpoint with 85
// resources in state (5 nodes + 50 VMs + 30 containers). The handler uses
// default pagination (limit=50), so the response encodes at most 50 resources.
func TestSLO_ResourcesList(t *testing.T) {
	skipUnderRace(t)
	suppressTestLogs(t)

	state := models.NewState()
	nodes := make([]models.Node, 5)
	for i := range nodes {
		nodes[i] = models.Node{
			ID:       fmt.Sprintf("pve1:node%d", i),
			Name:     fmt.Sprintf("node%d", i),
			Instance: "pve1",
			Status:   "online",
			CPU:      float64(i%80+10) / 100.0,
			Memory:   models.Memory{Usage: float64(i%60 + 20), Total: 64 << 30, Used: 32 << 30},
			Disk:     models.Disk{Usage: float64(i%40 + 30), Total: 500 << 30, Used: 250 << 30},
		}
	}
	state.UpdateNodesForInstance("pve1", nodes)

	vms := make([]models.VM, 50)
	for i := range vms {
		vms[i] = models.VM{
			ID:       fmt.Sprintf("pve1:node%d:%d", i%5, 100+i),
			VMID:     100 + i,
			Name:     fmt.Sprintf("vm-%d", 100+i),
			Node:     fmt.Sprintf("node%d", i%5),
			Instance: "pve1",
			Status:   "running",
			Type:     "qemu",
			CPU:      float64(i%80+10) / 100.0,
			Memory:   models.Memory{Usage: float64(i%60 + 20), Total: 4 << 30, Used: 2 << 30},
			Disk:     models.Disk{Usage: float64(i%40 + 30), Total: 50 << 30, Used: 25 << 30},
		}
	}
	state.UpdateVMsForInstance("pve1", vms)

	containers := make([]models.Container, 30)
	for i := range containers {
		containers[i] = models.Container{
			ID:       fmt.Sprintf("pve1:node%d:%d", i%5, 200+i),
			VMID:     200 + i,
			Name:     fmt.Sprintf("ct-%d", 200+i),
			Node:     fmt.Sprintf("node%d", i%5),
			Instance: "pve1",
			Status:   "running",
			Type:     "lxc",
			CPU:      float64(i%80+10) / 100.0,
			Memory:   models.Memory{Usage: float64(i%60 + 20), Total: 2 << 30, Used: 1 << 30},
			Disk:     models.Disk{Usage: float64(i%40 + 30), Total: 20 << 30, Used: 10 << 30},
		}
	}
	state.UpdateContainersForInstance("pve1", containers)

	cfg := &config.Config{DataPath: t.TempDir()}
	handlers := NewResourceHandlers(cfg)
	handlers.SetStateProvider(&sloTestStateProvider{state: state})

	// Warm the cache: the first request populates it, and subsequent requests hit it.
	req := httptest.NewRequest(http.MethodGet, "/api/resources", nil)
	rec := httptest.NewRecorder()
	handlers.HandleListResources(rec, req)
	if rec.Code != http.StatusOK {
		t.Fatalf("warmup failed: status %d", rec.Code)
	}
	var checkResp map[string]interface{}
	if err := json.Unmarshal(rec.Body.Bytes(), &checkResp); err != nil {
		t.Fatalf("warmup unmarshal: %v", err)
	}
	data, _ := checkResp["data"].([]interface{})
	if len(data) == 0 {
		t.Fatalf("warmup: expected resources, got none")
	}
	// Verify the workload matches expectations: 50 items per page (default limit),
	// 85 total resources (5 nodes + 50 VMs + 30 containers).
	if len(data) != 50 {
		t.Fatalf("warmup: expected 50 resources in first page, got %d", len(data))
	}
	meta, _ := checkResp["meta"].(map[string]interface{})
	if total, _ := meta["total"].(float64); int(total) != 85 {
		t.Fatalf("warmup: expected total=85, got %v", total)
	}

	latencies := measureEndpointAmortizedLatencies(t, resourcesListLatencyBatchSize, func() {
		req := httptest.NewRequest(http.MethodGet, "/api/resources", nil)
		rec := httptest.NewRecorder()
		handlers.HandleListResources(rec, req)
		if rec.Code != http.StatusOK {
			t.Fatalf("unexpected status %d", rec.Code)
		}
	})

	target := effectiveAPISLOTarget(SLOResourcesListP95, sloResourcesListGitHubActionsP95)
	p95 := percentile(latencies, 0.95)
	t.Logf("resources/list p50=%v p95=%v p99=%v SLO=%v",
		percentile(latencies, 0.50), p95, percentile(latencies, 0.99), target)
	if p95 > target {
		t.Errorf("SLO VIOLATION: p95=%v exceeds target %v", p95, target)
	}
}

// TestSLO_InfrastructureCharts validates the lightweight infrastructure charts
// endpoint that drives infrastructure summary sparklines. The workload forces
// the store-backed batch path across nodes, docker hosts, and unified agents.
func TestSLO_InfrastructureCharts(t *testing.T) {
	skipUnderRace(t)
	suppressTestLogs(t)

	store := newTestMetricsStore(t)
	const (
		nodeCount       = 20
		dockerHostCount = 10
		agentCount      = 10
		pointsPerMetric = 240
	)
	base := time.Now().Add(-4 * time.Hour)
	seedBatchMetrics := func(resourceType string, ids []string, metricTypes []string) {
		batch := make([]metrics.WriteMetric, 0, len(ids)*len(metricTypes)*pointsPerMetric)
		for idx, id := range ids {
			for _, mt := range metricTypes {
				for p := 0; p < pointsPerMetric; p++ {
					batch = append(batch, metrics.WriteMetric{
						ResourceType: resourceType,
						ResourceID:   id,
						MetricType:   mt,
						Value:        float64((idx + p) % 100),
						Timestamp:    base.Add(time.Duration(p) * time.Minute),
						Tier:         metrics.TierMinute,
					})
				}
			}
		}
		store.WriteBatchSync(batch)
	}

	monitor, state, _ := newTestMonitor(t)
	setTestUnexportedField(t, monitor, "metricsStore", store)

	nodes := make([]models.Node, nodeCount)
	nodeIDs := make([]string, nodeCount)
	for i := range nodes {
		nodeIDs[i] = fmt.Sprintf("node-slo-%d", i)
		nodes[i] = models.Node{
			ID:       nodeIDs[i],
			Name:     fmt.Sprintf("node-%d", i),
			Instance: "pve1",
			Status:   "online",
			CPU:      float64(i%80+10) / 100.0,
			Memory:   models.Memory{Usage: float64(i%60 + 20), Total: 64 << 30, Used: 32 << 30},
			Disk:     models.Disk{Usage: float64(i%40 + 30), Total: 500 << 30, Used: 250 << 30},
		}
	}
	state.Nodes = nodes

	dockerHosts := make([]models.DockerHost, dockerHostCount)
	dockerHostIDs := make([]string, dockerHostCount)
	for i := range dockerHosts {
		dockerHostIDs[i] = fmt.Sprintf("docker-host-slo-%d", i)
		dockerHosts[i] = models.DockerHost{
			ID:       dockerHostIDs[i],
			Runtime:  "docker",
			Status:   "online",
			CPUUsage: float64(i%80 + 10),
			Memory:   models.Memory{Usage: float64(i%60 + 20), Total: 32 << 30, Used: 16 << 30},
			Disks:    []models.Disk{{Usage: float64(i%40 + 30), Total: 200 << 30, Used: 100 << 30}},
		}
	}
	state.DockerHosts = dockerHosts

	hosts := make([]models.Host, agentCount)
	agentIDs := make([]string, agentCount)
	for i := range hosts {
		agentIDs[i] = fmt.Sprintf("agent-slo-%d", i)
		hosts[i] = models.Host{
			ID:       agentIDs[i],
			Hostname: fmt.Sprintf("agent-host-%d", i),
			Status:   "online",
			CPUUsage: float64(i%80 + 10),
			Memory:   models.Memory{Usage: float64(i%60 + 20), Total: 32 << 30, Used: 16 << 30},
			Disks:    []models.Disk{{Usage: float64(i%40 + 30), Total: 200 << 30, Used: 100 << 30}},
		}
	}
	state.Hosts = hosts
	syncTestResourceStore(t, monitor, state)

	seedBatchMetrics("node", nodeIDs, []string{"cpu", "memory", "disk", "netin", "netout"})
	seedBatchMetrics("dockerHost", dockerHostIDs, []string{"cpu", "memory", "disk"})
	seedBatchMetrics("agent", agentIDs, []string{"cpu", "memory", "disk"})

	router := &Router{monitor: monitor}
	url := "/api/charts/infrastructure?range=4h"

	sanityReq := httptest.NewRequest(http.MethodGet, url, nil)
	sanityRec := httptest.NewRecorder()
	router.handleInfrastructureCharts(sanityRec, sanityReq)
	if sanityRec.Code != http.StatusOK {
		t.Fatalf("sanity check failed: status %d body=%s", sanityRec.Code, sanityRec.Body.String())
	}
	var sanityResp InfrastructureChartsResponse
	if err := json.Unmarshal(sanityRec.Body.Bytes(), &sanityResp); err != nil {
		t.Fatalf("sanity unmarshal: %v", err)
	}
	if len(sanityResp.NodeData) != nodeCount {
		t.Fatalf("sanity: expected %d nodes, got %d", nodeCount, len(sanityResp.NodeData))
	}
	if len(sanityResp.DockerHostData) != dockerHostCount {
		t.Fatalf("sanity: expected %d docker hosts, got %d", dockerHostCount, len(sanityResp.DockerHostData))
	}
	if len(sanityResp.AgentData) != agentCount {
		t.Fatalf("sanity: expected %d agents, got %d", agentCount, len(sanityResp.AgentData))
	}
	if sanityResp.Stats.PrimarySourceHint != "store_or_memory_fallback" {
		t.Fatalf("sanity: expected store-backed source hint, got %q", sanityResp.Stats.PrimarySourceHint)
	}
	// Alignment guards: the workloads-summary budgets are defined as aliases of
	// the workload-charts budgets; fail fast if they ever drift apart.
	if SLOWorkloadsSummaryChartsP95 != SLOWorkloadChartsP95 {
		t.Fatalf(
			"sanity: workloads-summary SLO=%v, want alignment with workload charts SLO=%v",
			SLOWorkloadsSummaryChartsP95,
			SLOWorkloadChartsP95,
		)
	}
	if sloWorkloadsSummaryChartsGitHubActionsP95 != sloWorkloadChartsGitHubActionsP95 {
		t.Fatalf(
			"sanity: workloads-summary GitHub Actions SLO=%v, want alignment with workload charts GitHub Actions SLO=%v",
			sloWorkloadsSummaryChartsGitHubActionsP95,
			sloWorkloadChartsGitHubActionsP95,
		)
	}

	latencies := measureEndpointLatencies(t, func() {
		req := httptest.NewRequest(http.MethodGet, url, nil)
		rec := httptest.NewRecorder()
		router.handleInfrastructureCharts(rec, req)
		if rec.Code != http.StatusOK {
			t.Fatalf("unexpected status %d", rec.Code)
		}
	})

	p95 := percentile(latencies, 0.95)
	target := effectiveAPISLOTarget(SLOInfrastructureChartsP95, sloInfrastructureChartsGitHubActionsP95)
	t.Logf("charts/infrastructure p50=%v p95=%v p99=%v SLO=%v",
		percentile(latencies, 0.50), p95, percentile(latencies, 0.99), target)
	if p95 > target {
		t.Errorf("SLO VIOLATION: p95=%v exceeds target %v", p95, target)
	}
}

// TestSLO_WorkloadCharts validates the workload charts endpoint that powers
// workload summary sparklines. The workload forces the store-backed batch path
// across VMs, system containers, and docker containers.
func TestSLO_WorkloadCharts(t *testing.T) {
	skipUnderRace(t)
	suppressTestLogs(t)

	store := newTestMetricsStore(t)
	const (
		vmCount           = 30
		containerCount    = 20
		dockerHostCount   = 10
		containersPerHost = 2
		pointsPerMetric   = 240
	)
	base := time.Now().Add(-4 * time.Hour).UTC().Truncate(time.Second)
	seedBatchMetrics := func(resourceType string, ids []string, metricTypes []string) {
		batch := make([]metrics.WriteMetric, 0, len(ids)*len(metricTypes)*pointsPerMetric)
		for idx, id := range ids {
			for _, mt := range metricTypes {
				for p := 0; p < pointsPerMetric; p++ {
					batch = append(batch, metrics.WriteMetric{
						ResourceType: resourceType,
						ResourceID:   id,
						MetricType:   mt,
						Value:        float64((idx + p) % 100),
						Timestamp:    base.Add(time.Duration(p) * time.Minute),
						Tier:         metrics.TierMinute,
					})
				}
			}
		}
		store.WriteBatchSync(batch)
	}

	monitor := &monitoring.Monitor{}
	state := models.NewState()
	setTestUnexportedField(t, monitor, "state", state)
	setTestUnexportedField(t, monitor, "metricsHistory", monitoring.NewMetricsHistory(10, time.Hour))
	setTestUnexportedField(t, monitor, "metricsStore", store)

	nodes := make([]models.Node, 5)
	for i := range nodes {
		nodes[i] = models.Node{
			ID:       fmt.Sprintf("node-slo-%d", i),
			Name:     fmt.Sprintf("node-%d", i),
			Instance: "pve1",
			Status:   "online",
		}
	}
	state.UpdateNodesForInstance("pve1", nodes)

	vms := make([]models.VM, vmCount)
	vmIDs := make([]string, vmCount)
	for i := range vms {
		vmIDs[i] = fmt.Sprintf("vm-slo-%d", i)
		vms[i] = models.VM{
			ID:       vmIDs[i],
			VMID:     100 + i,
			Name:     fmt.Sprintf("vm-%d", i),
			Node:     nodes[i%len(nodes)].Name,
			Instance: "pve1",
			Status:   "running",
			Type:     "qemu",
			CPU:      float64(i%80+10) / 100.0,
			Memory:   models.Memory{Usage: float64(i%60 + 20), Total: 4 << 30, Used: 2 << 30},
			Disk:     models.Disk{Usage: float64(i%40 + 30), Total: 50 << 30, Used: 25 << 30},
		}
	}
	state.UpdateVMsForInstance("pve1", vms)

	containers := make([]models.Container, containerCount)
	containerIDs := make([]string, containerCount)
	for i := range containers {
		containerIDs[i] = fmt.Sprintf("ct-slo-%d", i)
		containers[i] = models.Container{
			ID:       containerIDs[i],
			VMID:     200 + i,
			Name:     fmt.Sprintf("ct-%d", i),
			Node:     nodes[i%len(nodes)].Name,
			Instance: "pve1",
			Status:   "running",
			Type:     "lxc",
			CPU:      float64(i%80+10) / 100.0,
			Memory:   models.Memory{Usage: float64(i%60 + 20), Total: 2 << 30, Used: 1 << 30},
			Disk:     models.Disk{Usage: float64(i%40 + 30), Total: 20 << 30, Used: 10 << 30},
		}
	}
	state.UpdateContainersForInstance("pve1", containers)

	dockerHosts := make([]models.DockerHost, dockerHostCount)
	dockerContainerIDs := make([]string, 0, dockerHostCount*containersPerHost)
	for i := range dockerHosts {
		hostID := fmt.Sprintf("docker-host-slo-%d", i)
		hostContainers := make([]models.DockerContainer, containersPerHost)
		for j := range hostContainers {
			containerID := fmt.Sprintf("docker-container-slo-%d-%d", i, j)
			dockerContainerIDs = append(dockerContainerIDs, containerID)
			hostContainers[j] = models.DockerContainer{
				ID:            containerID,
				Name:          fmt.Sprintf("docker-%d-%d", i, j),
				State:         "running",
				Status:        "running",
				CPUPercent:    float64((i+j)%80 + 10),
				MemoryPercent: float64((i+j)%60 + 20),
				NetInRate:     float64((i+j)%50 + 5),
				NetOutRate:    float64((i+j)%50 + 7),
			}
		}
		dockerHosts[i] = models.DockerHost{
			ID:         hostID,
			AgentID:    hostID,
			Hostname:   nodes[i%len(nodes)].Name,
			Runtime:    "docker",
			Status:     "online",
			CPUUsage:   float64(i%80 + 10),
			Memory:     models.Memory{Usage: float64(i%60 + 20), Total: 32 << 30, Used: 16 << 30},
			Disks:      []models.Disk{{Usage: float64(i%40 + 30), Total: 200 << 30, Used: 100 << 30}},
			Containers: hostContainers,
		}
	}
	state.DockerHosts = dockerHosts
	syncTestResourceStore(t, monitor, state)

	seedBatchMetrics("vm", vmIDs, []string{"cpu", "memory", "disk", "netin", "netout"})
	seedBatchMetrics("container", containerIDs, []string{"cpu", "memory", "disk", "netin", "netout"})
	seedBatchMetrics("dockerContainer", dockerContainerIDs, []string{"cpu", "memory", "disk", "netin", "netout"})

	router := &Router{monitor: monitor}
	url := "/api/charts/workloads?range=4h&maxPoints=120"

	sanityReq := httptest.NewRequest(http.MethodGet, url, nil)
	sanityRec := httptest.NewRecorder()
	router.handleWorkloadCharts(sanityRec, sanityReq)
	if sanityRec.Code != http.StatusOK {
		t.Fatalf("sanity check failed: status %d body=%s", sanityRec.Code, sanityRec.Body.String())
	}
	var sanityResp WorkloadChartsResponse
	if err := json.Unmarshal(sanityRec.Body.Bytes(), &sanityResp); err != nil {
		t.Fatalf("sanity unmarshal: %v", err)
	}
	if len(sanityResp.ChartData) != vmCount+containerCount {
		t.Fatalf("sanity: expected %d guest chart entries, got %d", vmCount+containerCount, len(sanityResp.ChartData))
	}
	if len(sanityResp.DockerData) != len(dockerContainerIDs) {
		t.Fatalf("sanity: expected %d docker chart entries, got %d", len(dockerContainerIDs), len(sanityResp.DockerData))
	}
	if sanityResp.Stats.PrimarySourceHint != "store_or_memory_fallback" {
		t.Fatalf("sanity: expected store-backed source hint, got %q", sanityResp.Stats.PrimarySourceHint)
	}

	latencies := measureEndpointLatencies(t, func() {
		req := httptest.NewRequest(http.MethodGet, url, nil)
		rec := httptest.NewRecorder()
		router.handleWorkloadCharts(rec, req)
		if rec.Code != http.StatusOK {
			t.Fatalf("unexpected status %d", rec.Code)
		}
	})

	p95 := percentile(latencies, 0.95)
	target := effectiveAPISLOTarget(SLOWorkloadChartsP95, sloWorkloadChartsGitHubActionsP95)
	t.Logf("charts/workloads p50=%v p95=%v p99=%v SLO=%v",
		percentile(latencies, 0.50), p95, percentile(latencies, 0.99), target)
	if p95 > target {
		t.Errorf("SLO VIOLATION: p95=%v exceeds target %v", p95, target)
	}
}

// TestSLO_WorkloadsSummaryCharts validates the aggregate workload summary
// endpoint that powers top-card sparklines and blast-radius summaries. The
// workload forces the store-backed batch path across VMs, system containers,
// Kubernetes pods, and docker containers.
func TestSLO_WorkloadsSummaryCharts(t *testing.T) {
	skipUnderRace(t)
	suppressTestLogs(t)

	store := newTestMetricsStore(t)
	const (
		vmCount           = 30
		containerCount    = 20
		podCount          = 10
		dockerHostCount   = 10
		containersPerHost = 2
		pointsPerMetric   = 240
	)
	base := time.Now().Add(-4 * time.Hour).UTC().Truncate(time.Second)
	seedBatchMetrics := func(resourceType string, ids []string, metricTypes []string) {
		batch := make([]metrics.WriteMetric, 0, len(ids)*len(metricTypes)*pointsPerMetric)
		for idx, id := range ids {
			for _, mt := range metricTypes {
				for p := 0; p < pointsPerMetric; p++ {
					batch = append(batch, metrics.WriteMetric{
						ResourceType: resourceType,
						ResourceID:   id,
						MetricType:   mt,
						Value:        float64((idx + p) % 100),
						Timestamp:    base.Add(time.Duration(p) * time.Minute),
						Tier:         metrics.TierMinute,
					})
				}
			}
		}
		store.WriteBatchSync(batch)
	}

	monitor := &monitoring.Monitor{}
	state := models.NewState()
	setTestUnexportedField(t, monitor, "state", state)
	setTestUnexportedField(t, monitor, "metricsHistory", monitoring.NewMetricsHistory(10, time.Hour))
	setTestUnexportedField(t, monitor, "metricsStore", store)

	nodes := make([]models.Node, 5)
	for i := range nodes {
		nodes[i] = models.Node{
			ID:       fmt.Sprintf("node-summary-slo-%d", i),
			Name:     fmt.Sprintf("node-%d", i),
			Instance: "pve1",
			Status:   "online",
		}
	}
	state.UpdateNodesForInstance("pve1", nodes)

	vms := make([]models.VM, vmCount)
	vmIDs := make([]string, vmCount)
	for i := range vms {
		vmIDs[i] = fmt.Sprintf("vm-summary-slo-%d", i)
		vms[i] = models.VM{
			ID:       vmIDs[i],
			VMID:     100 + i,
			Name:     fmt.Sprintf("vm-%d", i),
			Node:     nodes[i%len(nodes)].Name,
			Instance: "pve1",
			Status:   "running",
			Type:     "qemu",
			CPU:      float64(i%80+10) / 100.0,
			Memory:   models.Memory{Usage: float64(i%60 + 20), Total: 4 << 30, Used: 2 << 30},
			Disk:     models.Disk{Usage: float64(i%40 + 30), Total: 50 << 30, Used: 25 << 30},
		}
	}
	state.UpdateVMsForInstance("pve1", vms)

	containers := make([]models.Container, containerCount)
	containerIDs := make([]string, containerCount)
	for i := range containers {
		containerIDs[i] = fmt.Sprintf("ct-summary-slo-%d", i)
		containers[i] = models.Container{
			ID:       containerIDs[i],
			VMID:     200 + i,
			Name:     fmt.Sprintf("ct-%d", i),
			Node:     nodes[i%len(nodes)].Name,
			Instance: "pve1",
			Status:   "running",
			Type:     "lxc",
			CPU:      float64(i%80+10) / 100.0,
			Memory:   models.Memory{Usage: float64(i%60 + 20), Total: 2 << 30, Used: 1 << 30},
			Disk:     models.Disk{Usage: float64(i%40 + 30), Total: 20 << 30, Used: 10 << 30},
		}
	}
	state.UpdateContainersForInstance("pve1", containers)

	clusters := []models.KubernetesCluster{{
		ID:   "k8s-summary-slo",
		Name: "k8s-summary-slo",
		Pods: make([]models.KubernetesPod, podCount),
	}}
	podIDs := make([]string, podCount)
	for i := 0; i < podCount; i++ {
		podIDs[i] = fmt.Sprintf("pod:default:pod-%d", i)
		clusters[0].Pods[i] = models.KubernetesPod{
			UID:                fmt.Sprintf("pod-summary-slo-%d", i),
			Name:               fmt.Sprintf("pod-%d", i),
			Namespace:          "default",
			NodeName:           nodes[i%len(nodes)].Name,
			Phase:              "Running",
			UsageCPUPercent:    float64((i % 80) + 10),
			UsageMemoryPercent: float64((i % 60) + 20),
			DiskUsagePercent:   float64((i % 40) + 30),
			NetInRate:          float64((i % 50) + 5),
			NetOutRate:         float64((i % 50) + 7),
		}
	}
	state.KubernetesClusters = clusters

	dockerHosts := make([]models.DockerHost, dockerHostCount)
	dockerContainerIDs := make([]string, 0, dockerHostCount*containersPerHost)
	for i := range dockerHosts {
		hostID := fmt.Sprintf("docker-host-summary-slo-%d", i)
		hostContainers := make([]models.DockerContainer, containersPerHost)
		for j := range hostContainers {
			containerID := fmt.Sprintf("docker-container-summary-slo-%d-%d", i, j)
			dockerContainerIDs = append(dockerContainerIDs, containerID)
			hostContainers[j] = models.DockerContainer{
				ID:            containerID,
				Name:          fmt.Sprintf("docker-%d-%d", i, j),
				State:         "running",
				Status:        "running",
				CPUPercent:    float64((i+j)%80 + 10),
				MemoryPercent: float64((i+j)%60 + 20),
				NetInRate:     float64((i+j)%50 + 5),
				NetOutRate:    float64((i+j)%50 + 7),
			}
		}
		dockerHosts[i] = models.DockerHost{
			ID:         hostID,
			AgentID:    hostID,
			Hostname:   nodes[i%len(nodes)].Name,
			Runtime:    "docker",
			Status:     "online",
			CPUUsage:   float64(i%80 + 10),
			Memory:     models.Memory{Usage: float64(i%60 + 20), Total: 32 << 30, Used: 16 << 30},
			Disks:      []models.Disk{{Usage: float64(i%40 + 30), Total: 200 << 30, Used: 100 << 30}},
			Containers: hostContainers,
		}
	}
	state.DockerHosts = dockerHosts
	syncTestResourceStore(t, monitor, state)

	seedBatchMetrics("vm", vmIDs, []string{"cpu", "memory", "disk", "netin", "netout"})
	seedBatchMetrics("container", containerIDs, []string{"cpu", "memory", "disk", "netin", "netout"})
	seedBatchMetrics("k8s", podIDs, []string{"cpu", "memory", "disk", "netin", "netout"})
	seedBatchMetrics("dockerContainer", dockerContainerIDs, []string{"cpu", "memory", "disk", "netin", "netout"})

	router := &Router{monitor: monitor}
	url := "/api/charts/workloads-summary?range=4h"

	sanityReq := httptest.NewRequest(http.MethodGet, url, nil)
	sanityRec := httptest.NewRecorder()
	router.handleWorkloadsSummaryCharts(sanityRec, sanityReq)
	if sanityRec.Code != http.StatusOK {
		t.Fatalf("sanity check failed: status %d body=%s", sanityRec.Code, sanityRec.Body.String())
	}
	var sanityResp WorkloadsSummaryChartsResponse
	if err := json.Unmarshal(sanityRec.Body.Bytes(), &sanityResp); err != nil {
		t.Fatalf("sanity unmarshal: %v", err)
	}
	if sanityResp.GuestCounts.Total != vmCount+containerCount+podCount+len(dockerContainerIDs) {
		t.Fatalf("sanity: expected %d guests, got %d", vmCount+containerCount+podCount+len(dockerContainerIDs), sanityResp.GuestCounts.Total)
	}
	if sanityResp.Stats.PrimarySourceHint != "store_or_memory_fallback" {
		t.Fatalf("sanity: expected store-backed source hint, got %q", sanityResp.Stats.PrimarySourceHint)
	}

	latencies := measureEndpointLatencies(t, func() {
		req := httptest.NewRequest(http.MethodGet, url, nil)
		rec := httptest.NewRecorder()
		router.handleWorkloadsSummaryCharts(rec, req)
		if rec.Code != http.StatusOK {
			t.Fatalf("unexpected status %d", rec.Code)
		}
	})

	p95 := percentile(latencies, 0.95)
	target := effectiveAPISLOTarget(SLOWorkloadsSummaryChartsP95, sloWorkloadsSummaryChartsGitHubActionsP95)
	t.Logf("charts/workloads-summary p50=%v p95=%v p99=%v SLO=%v",
		percentile(latencies, 0.50), p95, percentile(latencies, 0.99), target)
	if p95 > target {
		t.Errorf("SLO VIOLATION: p95=%v exceeds target %v", p95, target)
	}
}

// --- Test helpers ---

// skipUnderRace skips the test when the race detector is enabled, since the
// 2-10x overhead makes latency measurements meaningless.
func skipUnderRace(t *testing.T) {
	t.Helper()
	if raceEnabled {
		t.Skip("skipping SLO latency test under -race (overhead makes measurements unreliable)")
	}
}
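
// raceEnabled is not defined in this file; it is presumably provided elsewhere
// in the package by the usual build-tag pair (a file guarded by
// "//go:build race" setting it to true, and a "!race" twin setting it to
// false), which is the standard way to detect -race at test time.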

const (
	sloIterations                 = 200
	resourcesListLatencyBatchSize = 25
)

// effectiveAPISLOTarget returns the hosted-runner budget when the suite runs
// on GitHub Actions and a hosted envelope is defined, and the strict local
// target everywhere else.
func effectiveAPISLOTarget(localTarget, githubActionsTarget time.Duration) time.Duration {
	if githubActionsTarget > 0 && os.Getenv("GITHUB_ACTIONS") == "true" {
		return githubActionsTarget
	}
	return localTarget
}
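
// GitHub Actions sets GITHUB_ACTIONS=true in every step's environment, so no
// extra wiring is needed in CI; the same variable can be exported locally to
// rehearse the hosted budgets.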

// measureEndpointLatencies runs fn sloIterations times with a warmup phase and
// returns the measured latency durations.
func measureEndpointLatencies(t *testing.T, fn func()) []time.Duration {
	t.Helper()
	// Warmup: run 20 iterations to stabilize allocations and caches.
	for i := 0; i < 20; i++ {
		fn()
	}
	latencies := make([]time.Duration, sloIterations)
	for i := 0; i < sloIterations; i++ {
		start := time.Now()
		fn()
		latencies[i] = time.Since(start)
	}
	return latencies
}

// measureEndpointAmortizedLatencies captures per-request latency for extremely
// fast handlers by timing a small request batch and amortizing the wall time
// across that batch. This keeps micro-endpoint SLOs sensitive to real
// regressions while filtering unrelated scheduler and GC spikes from broad
// `go test ./...` runs.
func measureEndpointAmortizedLatencies(t *testing.T, batchSize int, fn func()) []time.Duration {
	t.Helper()
	if batchSize <= 0 {
		t.Fatalf("batchSize must be positive, got %d", batchSize)
	}
	for i := 0; i < 20; i++ {
		for j := 0; j < batchSize; j++ {
			fn()
		}
	}
	latencies := make([]time.Duration, sloIterations)
	for i := 0; i < sloIterations; i++ {
		start := time.Now()
		for j := 0; j < batchSize; j++ {
			fn()
		}
		latencies[i] = time.Since(start) / time.Duration(batchSize)
	}
	return latencies
}
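
// With the suite's settings (batchSize=25, sloIterations=200) this issues
// 25*20 = 500 warmup requests and 25*200 = 5,000 measured requests, and each
// of the 200 recorded samples is a batch's wall time divided by 25. The
// trade-off is that a single slow request inside a batch is diluted 25x, so
// this helper suits sub-millisecond cached paths, not tail-latency hunting.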

// percentile returns the value at the given percentile (0.0 to 1.0) from a
// slice of durations; the index is floor((len-1)*pct) on a sorted copy, with
// no interpolation.
func percentile(durations []time.Duration, pct float64) time.Duration {
	if len(durations) == 0 {
		return 0
	}
	sorted := make([]time.Duration, len(durations))
	copy(sorted, durations)
	sort.Slice(sorted, func(i, j int) bool { return sorted[i] < sorted[j] })
	idx := int(float64(len(sorted)-1) * pct)
	return sorted[idx]
}
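
// With the suite's 200 samples, pct=0.95 selects index int(199*0.95) = 189,
// i.e. the 190th-smallest latency, and pct=0.99 selects index 197. Because the
// index is truncated rather than interpolated, reported percentiles are
// slightly optimistic for small sample counts.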

// newTestMetricsStore creates an ephemeral metrics store for SLO tests.
func newTestMetricsStore(t *testing.T) *metrics.Store {
	t.Helper()
	dir := t.TempDir()
	cfg := metrics.DefaultConfig(dir)
	cfg.DBPath = filepath.Join(dir, "slo-test.db")
	cfg.FlushInterval = time.Hour
	cfg.WriteBufferSize = 10_000
	// API chart/contract tests seed multi-day minute-tier fixtures; keep every
	// tier well beyond those windows so deferred startup maintenance cannot race
	// the fixture and prune old points on slower CI runners.
	cfg.RetentionRaw = 90 * 24 * time.Hour
	cfg.RetentionMinute = 90 * 24 * time.Hour
	cfg.RetentionHourly = 90 * 24 * time.Hour
	cfg.RetentionDaily = 90 * 24 * time.Hour
	store, err := metrics.NewStore(cfg)
	if err != nil {
		t.Fatalf("NewStore: %v", err)
	}
	if err := store.WaitForMaintenance(5 * time.Second); err != nil {
		t.Fatalf("WaitForMaintenance: %v", err)
	}
	t.Cleanup(func() { store.Close() })
	return store
}
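
// The one-hour FlushInterval keeps the store's background flusher effectively
// idle for the lifetime of a test; fixtures land synchronously through
// WriteBatchSync instead, so measured latencies never overlap a flush.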

// seedTestMetrics writes test data to the store (mirrors seedBenchMetricsMulti).
func seedTestMetrics(t *testing.T, store *metrics.Store, resourceType string, metricTypes []string, numResources, numPoints int) []string {
	t.Helper()
	base := time.Now().Add(-50 * time.Minute)
	ids := make([]string, numResources)
	batch := make([]metrics.WriteMetric, 0, numResources*numPoints*len(metricTypes))
	for r := 0; r < numResources; r++ {
		id := fmt.Sprintf("%s-slo-%d", resourceType, r)
		ids[r] = id
		for _, mt := range metricTypes {
			for p := 0; p < numPoints; p++ {
				batch = append(batch, metrics.WriteMetric{
					ResourceType: resourceType,
					ResourceID:   id,
					MetricType:   mt,
					Value:        float64(p % 100),
					Timestamp:    base.Add(time.Duration(p) * 6 * time.Second),
					Tier:         metrics.TierRaw,
				})
			}
		}
	}
	store.WriteBatchSync(batch)
	return ids
}
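
// For TestSLO_MetricsHistoryStore this seeds 10 resources x 4 metric types x
// 500 points = 20,000 raw-tier rows at 6-second resolution, i.e. 50 minutes of
// history starting 50 minutes ago, which is why those tests query range=1h.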

// sloTestStateProvider implements StateProvider for SLO tests.
type sloTestStateProvider struct {
	state *models.State
}

func (p *sloTestStateProvider) ReadSnapshot() models.StateSnapshot {
	return p.state.GetSnapshot()
}