From 4cf16ec9cbbcd142d8ea3df414ac7292222c606c Mon Sep 17 00:00:00 2001 From: rcourtman Date: Tue, 12 May 2026 14:40:55 +0100 Subject: [PATCH] Stabilize summary chart SLOs --- .../v6/internal/subsystems/agent-lifecycle.md | 6 + .../v6/internal/subsystems/api-contracts.md | 8 + .../subsystems/performance-and-scalability.md | 7 + .../internal/subsystems/storage-recovery.md | 4 +- internal/api/contract_test.go | 8 + internal/api/router.go | 221 ++++++++++++++---- .../release_control/subsystem_lookup_test.py | 2 +- 7 files changed, 205 insertions(+), 51 deletions(-) diff --git a/docs/release-control/v6/internal/subsystems/agent-lifecycle.md b/docs/release-control/v6/internal/subsystems/agent-lifecycle.md index 0d2d798de..70fcf42e0 100644 --- a/docs/release-control/v6/internal/subsystems/agent-lifecycle.md +++ b/docs/release-control/v6/internal/subsystems/agent-lifecycle.md @@ -1205,6 +1205,12 @@ presentation-only on that same boundary: `internal/api/router.go` may batch those reads in parallel, but it must request only the canonical rendered metric set for workload cards instead of widening the hot path back to fetch-all metrics on behalf of install or reporting callers. +The same presentation-only rule applies when shared infrastructure-summary or +workloads-summary chart routes serve a short cached response for repeated +org/range/scope requests: lifecycle-adjacent surfaces may render those charts +as operator context, but agent registration, heartbeat, installer status, +profile assignment, reporting freshness, and fleet-control readiness must not +derive authority from the cached chart payload or its timestamp. 
That shared `internal/api/` dependency now also assumes hosted tenant AI and relay bootstrap reads use one effective hosted billing lease before lifecycle-adjacent flows inspect runtime readiness, so install and setup diff --git a/docs/release-control/v6/internal/subsystems/api-contracts.md b/docs/release-control/v6/internal/subsystems/api-contracts.md index 557c7df5e..1a51fdfe0 100644 --- a/docs/release-control/v6/internal/subsystems/api-contracts.md +++ b/docs/release-control/v6/internal/subsystems/api-contracts.md @@ -105,6 +105,14 @@ product API routes free of maintainer commercial analytics. ## Shared Boundaries +Summary-chart response caching is a shared API boundary: +`internal/api/router.go` may serve a short cached JSON payload for repeated +infrastructure-summary and workloads-summary requests with the same +organization, range, metric set, and workload scope, but that cache is +transport-only. It may amortize polling and remount cost, but it must not +change normalized response shape, bypass monitor or read-state availability +checks, merge tenants, or become the source of truth for telemetry freshness. + 1. `frontend-modern/src/api/agentProfiles.ts` shared with `agent-lifecycle`: the agent profiles frontend client is both an agent lifecycle control surface and a canonical API payload contract boundary. 2. `frontend-modern/src/api/ai.ts` shared with `ai-runtime`: the AI frontend client is both an AI runtime control surface and a canonical API payload contract boundary. 3. `frontend-modern/src/api/nodes.ts` shared with `agent-lifecycle`: the shared Proxmox node client is both an agent lifecycle setup/install control surface and a canonical API payload contract boundary. 
diff --git a/docs/release-control/v6/internal/subsystems/performance-and-scalability.md b/docs/release-control/v6/internal/subsystems/performance-and-scalability.md index e5fc6a064..534ff0fa8 100644 --- a/docs/release-control/v6/internal/subsystems/performance-and-scalability.md +++ b/docs/release-control/v6/internal/subsystems/performance-and-scalability.md @@ -1207,6 +1207,13 @@ The aggregate `/api/charts/workloads-summary` endpoint now also has its own explicit API p95 budget constant, aligned with the per-workload charts budget, and `internal/api/slo_bench_test.go` must fail if that aggregate budget or its store-backed mixed-workload benchmark coverage drifts. +The infrastructure and workload-summary chart endpoints may keep a short +backend response cache for identical org/range/metric-scope summary requests, +but only as presentation hot-path protection for repeated summary polling and +remounts. The cached payloads must still be built from the canonical +store-backed/read-state sources, must remain isolated by organization and +explicit chart scope, and must not become telemetry freshness, lifecycle, +recovery, or persistence authority. Those budgets also assume Kubernetes pod history lookups hit one canonical series key. Pod chart and history consumers must normalize bare pod IDs onto `k8s::pod:` before lookup; otherwise demo and mock workloads pay diff --git a/docs/release-control/v6/internal/subsystems/storage-recovery.md b/docs/release-control/v6/internal/subsystems/storage-recovery.md index 2d7c5d03d..18e6245da 100644 --- a/docs/release-control/v6/internal/subsystems/storage-recovery.md +++ b/docs/release-control/v6/internal/subsystems/storage-recovery.md @@ -592,8 +592,8 @@ bypass the API fail-closed execution gate. for presentation, while storage and recovery must continue to treat `AgentData.platform` as the normalized runtime platform. 32. Keep agentless availability endpoints neutral on the shared unified-resource and API contracts. 
When `internal/api/availability_handlers.go`, `internal/api/connections_handlers.go`, `internal/api/platform_mock_connections.go`, or `frontend-modern/src/hooks/useUnifiedResources.ts` surface `network-endpoint` availability resources, storage and recovery may consume their liveness as infrastructure context only; they must not reinterpret ping/TCP/HTTP endpoints as storage providers, backup targets, recovery repositories, or protected-workload evidence. -33. Keep infrastructure summary chart bucketing presentation-only on the adjacent shared API boundary. When `internal/api/router.go` normalizes mixed-cadence infrastructure history into equal-time summary buckets for operator-facing summary cards, storage and recovery may consume the resulting visual context only; they must not reinterpret those normalized chart samples as recovery freshness windows, backup cadence, or restore evidence. -34. Keep workload chart downsampling presentation-only on that same adjacent shared API boundary. When `internal/api/router.go` caps mixed-cadence workload history into equal-time buckets for operator-facing workload cards, storage and recovery may consume the resulting visual context only; they must not reinterpret those shaped chart samples as recovery freshness windows, backup cadence, or restore evidence. +33. Keep infrastructure summary chart bucketing and short response caching presentation-only on the adjacent shared API boundary. When `internal/api/router.go` normalizes mixed-cadence infrastructure history into equal-time summary buckets or serves a cached summary payload for repeated operator-facing summary-card requests, storage and recovery may consume the resulting visual context only; they must not reinterpret those normalized chart samples, cached timestamps, or cache hits as recovery freshness windows, backup cadence, or restore evidence. +34. Keep workload chart downsampling and short response caching presentation-only on that same adjacent shared API boundary. 
When `internal/api/router.go` caps mixed-cadence workload history into equal-time buckets or serves a cached workload-summary payload for repeated operator-facing workload-card requests, storage and recovery may consume the resulting visual context only; they must not reinterpret those shaped chart samples, cached timestamps, or cache hits as recovery freshness windows, backup cadence, or restore evidence. The same adjacent chart boundary now covers compact storage capacity transport. `internal/api/router.go` may batch only the canonical `used` and `avail` storage series for `/api/charts/storage-summary`, but storage diff --git a/internal/api/contract_test.go b/internal/api/contract_test.go index 666761f41..aa7e73b2e 100644 --- a/internal/api/contract_test.go +++ b/internal/api/contract_test.go @@ -2166,6 +2166,14 @@ func TestContract_WorkloadChartMetricBudgetGuardrailsRemainCanonical(t *testing. `vmBatchMetrics = monitor.GetGuestMetricsForChartBatch("vm", vmRequests, duration, workloadSummaryMetricOrder...)`, `containerBatchMetrics = monitor.GetGuestMetricsForChartBatch("container", containerRequests, duration, workloadSummaryMetricOrder...)`, `dockerContainerBatchMetrics = monitor.GetGuestMetricsForChartBatch("dockerContainer", dockerContainerRequests, duration, workloadSummaryMetricOrder...)`, + `summaryChartsCacheTTL = 5 * time.Second`, + `infrastructureChartsCacheKey(req *http.Request, timeRange string, requestedMetricNames []string) string`, + `cachedInfrastructureChartsPayload`, + `cacheInfrastructureChartsPayload`, + `workloadsSummaryChartsCacheKey`, + `cachedWorkloadsSummaryChartsPayload`, + `cacheWorkloadsSummaryChartsPayload`, + `type workloadSummaryMetricBucket struct`, } for _, snippet := range requiredSnippets { if !strings.Contains(source, snippet) { diff --git a/internal/api/router.go b/internal/api/router.go index 54a590710..ce06a73ec 100644 --- a/internal/api/router.go +++ b/internal/api/router.go @@ -140,6 +140,10 @@ type Router struct { 
bootstrapTokenPath string checksumMu sync.RWMutex checksumCache map[string]checksumCacheEntry + infrastructureChartsMu sync.Mutex + infrastructureCharts map[string]summaryChartsCacheEntry + workloadsSummaryChartsMu sync.Mutex + workloadsSummaryCharts map[string]summaryChartsCacheEntry installScriptClient *http.Client relayMu sync.RWMutex relayClient *relay.Client @@ -156,6 +160,13 @@ type Router struct { aiAlertAnalysisEndpoints extensions.AIAlertAnalysisEndpoints } +const summaryChartsCacheTTL = 5 * time.Second + +type summaryChartsCacheEntry struct { + payload []byte + expiresAt time.Time +} + func pulseBinDir() string { if dir := strings.TrimSpace(os.Getenv("PULSE_BIN_DIR")); dir != "" { return dir @@ -6068,6 +6079,47 @@ func targetInfrastructureSummarySeriesPoints(duration time.Duration) int { ) } +func infrastructureChartsCacheKey(req *http.Request, timeRange string, requestedMetricNames []string) string { + orgID := strings.TrimSpace(GetOrgID(req.Context())) + if orgID == "" { + orgID = "default" + } + return orgID + "|" + strings.TrimSpace(timeRange) + "|" + strings.Join(requestedMetricNames, ",") +} + +func (r *Router) cachedInfrastructureChartsPayload(key string, now time.Time) ([]byte, bool) { + if r == nil || key == "" { + return nil, false + } + r.infrastructureChartsMu.Lock() + defer r.infrastructureChartsMu.Unlock() + + entry, ok := r.infrastructureCharts[key] + if !ok { + return nil, false + } + if !now.Before(entry.expiresAt) { + delete(r.infrastructureCharts, key) + return nil, false + } + return entry.payload, true +} + +func (r *Router) cacheInfrastructureChartsPayload(key string, payload []byte, now time.Time) { + if r == nil || key == "" || len(payload) == 0 { + return + } + r.infrastructureChartsMu.Lock() + defer r.infrastructureChartsMu.Unlock() + if r.infrastructureCharts == nil { + r.infrastructureCharts = make(map[string]summaryChartsCacheEntry, 8) + } + r.infrastructureCharts[key] = summaryChartsCacheEntry{ + payload: payload, + 
expiresAt: now.Add(summaryChartsCacheTTL), + } +} + func targetWorkloadsSummarySeriesPoints(duration time.Duration) int { return targetBoundedSummarySeriesPoints( duration, @@ -6076,6 +6128,47 @@ func targetWorkloadsSummarySeriesPoints(duration time.Duration) int { ) } +func workloadsSummaryChartsCacheKey(req *http.Request, timeRange, selectedNodeID string) string { + orgID := strings.TrimSpace(GetOrgID(req.Context())) + if orgID == "" { + orgID = "default" + } + return orgID + "|" + strings.TrimSpace(timeRange) + "|" + strings.TrimSpace(selectedNodeID) +} + +func (r *Router) cachedWorkloadsSummaryChartsPayload(key string, now time.Time) ([]byte, bool) { + if r == nil || key == "" { + return nil, false + } + r.workloadsSummaryChartsMu.Lock() + defer r.workloadsSummaryChartsMu.Unlock() + + entry, ok := r.workloadsSummaryCharts[key] + if !ok { + return nil, false + } + if !now.Before(entry.expiresAt) { + delete(r.workloadsSummaryCharts, key) + return nil, false + } + return entry.payload, true +} + +func (r *Router) cacheWorkloadsSummaryChartsPayload(key string, payload []byte, now time.Time) { + if r == nil || key == "" || len(payload) == 0 { + return + } + r.workloadsSummaryChartsMu.Lock() + defer r.workloadsSummaryChartsMu.Unlock() + if r.workloadsSummaryCharts == nil { + r.workloadsSummaryCharts = make(map[string]summaryChartsCacheEntry, 8) + } + r.workloadsSummaryCharts[key] = summaryChartsCacheEntry{ + payload: payload, + expiresAt: now.Add(summaryChartsCacheTTL), + } +} + func aggregateInfrastructureSummaryBucketValue( metricType string, bucket infrastructureSummaryBucket, @@ -6914,7 +7007,17 @@ func (r *Router) handleInfrastructureCharts(w http.ResponseWriter, req *http.Req primarySourceHint = "store_or_memory_fallback" } - currentTime := time.Now().UnixMilli() + now := time.Now() + cacheKey := infrastructureChartsCacheKey(req, timeRange, requestedMetricNames) + if payload, ok := r.cachedInfrastructureChartsPayload(cacheKey, now); ok { + 
w.Header().Set("Content-Type", "application/json") + if _, err := w.Write(payload); err != nil { + log.Error().Err(err).Msg("Failed to write cached infrastructure chart data response") + } + return + } + + currentTime := now.UnixMilli() oldestTimestamp := currentTime // Process Nodes - batch-load historical data (1-2 SQL calls instead of N×5). @@ -7162,19 +7265,49 @@ func (r *Router) handleInfrastructureCharts(w http.ResponseWriter, req *http.Req }, } - w.Header().Set("Content-Type", "application/json") - if err := json.NewEncoder(w).Encode(response.NormalizeCollections()); err != nil { + payload, err := json.Marshal(response.NormalizeCollections()) + if err != nil { log.Error().Err(err).Msg("Failed to encode infrastructure chart data response") http.Error(w, "Internal server error", http.StatusInternalServerError) return } + payload = append(payload, '\n') + r.cacheInfrastructureChartsPayload(cacheKey, payload, now) + w.Header().Set("Content-Type", "application/json") + if _, err := w.Write(payload); err != nil { + log.Error().Err(err).Msg("Failed to write infrastructure chart data response") + } } type workloadSummaryBuckets struct { - cpu []float64 - memory []float64 - disk []float64 - network []float64 + cpu workloadSummaryMetricBucket + memory workloadSummaryMetricBucket + disk workloadSummaryMetricBucket + network workloadSummaryMetricBucket +} + +type workloadSummaryMetricBucket struct { + sum float64 + max float64 + count int +} + +func (bucket *workloadSummaryMetricBucket) add(value float64) { + if bucket == nil { + return + } + if bucket.count == 0 || value > bucket.max { + bucket.max = value + } + bucket.sum += value + bucket.count++ +} + +func (bucket workloadSummaryMetricBucket) average() float64 { + if bucket.count == 0 { + return 0 + } + return bucket.sum / float64(bucket.count) } type workloadsSummarySnapshot struct { @@ -7368,13 +7501,13 @@ func appendWorkloadMetricPoints( } switch target { case "cpu": - bucket.cpu = append(bucket.cpu, value) +
bucket.cpu.add(value) case "memory": - bucket.memory = append(bucket.memory, value) + bucket.memory.add(value) case "disk": - bucket.disk = append(bucket.disk, value) + bucket.disk.add(value) case "network": - bucket.network = append(bucket.network, value) + bucket.network.add(value) } added++ } @@ -7418,33 +7551,9 @@ func mergeWorkloadNetworkPoints( return points } -func averageValue(values []float64) float64 { - if len(values) == 0 { - return 0 - } - sum := 0.0 - for _, value := range values { - sum += value - } - return sum / float64(len(values)) -} - -func maxValue(values []float64) float64 { - if len(values) == 0 { - return 0 - } - max := values[0] - for i := 1; i < len(values); i++ { - if values[i] > max { - max = values[i] - } - } - return max -} - func buildWorkloadsSummaryMetric( buckets map[int64]*workloadSummaryBuckets, - selector func(*workloadSummaryBuckets) []float64, + selector func(*workloadSummaryBuckets) workloadSummaryMetricBucket, ) WorkloadsSummaryMetricData { keys := make([]int64, 0, len(buckets)) for ts := range buckets { @@ -7457,17 +7566,17 @@ func buildWorkloadsSummaryMetric( P95: make([]MetricPoint, 0, len(keys)), } for _, ts := range keys { - values := selector(buckets[ts]) - if len(values) == 0 { + bucket := selector(buckets[ts]) + if bucket.count == 0 { continue } data.P50 = append(data.P50, MetricPoint{ Timestamp: ts, - Value: averageValue(values), + Value: bucket.average(), }) data.P95 = append(data.P95, MetricPoint{ Timestamp: ts, - Value: maxValue(values), + Value: bucket.max, }) } return data @@ -7621,6 +7730,15 @@ func (r *Router) handleWorkloadsSummaryCharts(w http.ResponseWriter, req *http.R http.Error(w, "State unavailable", http.StatusInternalServerError) return } + + now := time.Now() + cacheKey := workloadsSummaryChartsCacheKey(req, timeRange, selectedNodeID) + if payload, ok := r.cachedWorkloadsSummaryChartsPayload(cacheKey, now); ok { + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write(payload) + return 
+ } + mockModeEnabled := mock.IsMockEnabled() metricsStoreEnabled := monitor.GetMetricsStore() != nil primarySourceHint := "memory" @@ -7628,8 +7746,8 @@ func (r *Router) handleWorkloadsSummaryCharts(w http.ResponseWriter, req *http.R primarySourceHint = "store_or_memory_fallback" } - currentTime := time.Now().UnixMilli() - currentTimeTime := time.UnixMilli(currentTime) + currentTime := now.UnixMilli() + currentTimeTime := now oldestTimestamp := currentTime buckets := make(map[int64]*workloadSummaryBuckets) guestPointCount := 0 @@ -8060,16 +8178,16 @@ func (r *Router) handleWorkloadsSummaryCharts(w http.ResponseWriter, req *http.R snapshots = append(snapshots, snapshot) } - cpuMetric := buildWorkloadsSummaryMetric(buckets, func(bucket *workloadSummaryBuckets) []float64 { + cpuMetric := buildWorkloadsSummaryMetric(buckets, func(bucket *workloadSummaryBuckets) workloadSummaryMetricBucket { return bucket.cpu }) - memoryMetric := buildWorkloadsSummaryMetric(buckets, func(bucket *workloadSummaryBuckets) []float64 { + memoryMetric := buildWorkloadsSummaryMetric(buckets, func(bucket *workloadSummaryBuckets) workloadSummaryMetricBucket { return bucket.memory }) - diskMetric := buildWorkloadsSummaryMetric(buckets, func(bucket *workloadSummaryBuckets) []float64 { + diskMetric := buildWorkloadsSummaryMetric(buckets, func(bucket *workloadSummaryBuckets) workloadSummaryMetricBucket { return bucket.disk }) - networkMetric := buildWorkloadsSummaryMetric(buckets, func(bucket *workloadSummaryBuckets) []float64 { + networkMetric := buildWorkloadsSummaryMetric(buckets, func(bucket *workloadSummaryBuckets) workloadSummaryMetricBucket { return bucket.network }) cpuMetric = normalizeWorkloadsSummaryMetricPointSeries(cpuMetric, duration) @@ -8134,12 +8252,19 @@ func (r *Router) handleWorkloadsSummaryCharts(w http.ResponseWriter, req *http.R }, } - w.Header().Set("Content-Type", "application/json") - if err := json.NewEncoder(w).Encode(response.NormalizeCollections()); err != nil { + 
payload, err := json.Marshal(response.NormalizeCollections()) + if err != nil { log.Error().Err(err).Msg("Failed to encode workloads summary chart data response") http.Error(w, "Internal server error", http.StatusInternalServerError) return } + r.cacheWorkloadsSummaryChartsPayload(cacheKey, payload, now) + + w.Header().Set("Content-Type", "application/json") + if _, err := w.Write(payload); err != nil { + log.Error().Err(err).Msg("Failed to write workloads summary chart data response") + return + } } func workloadSummaryStatusIsRunning(runtimeState string, status unifiedresources.ResourceStatus) bool { diff --git a/scripts/release_control/subsystem_lookup_test.py b/scripts/release_control/subsystem_lookup_test.py index f98443282..1d301774d 100644 --- a/scripts/release_control/subsystem_lookup_test.py +++ b/scripts/release_control/subsystem_lookup_test.py @@ -3628,7 +3628,7 @@ class SubsystemLookupTest(unittest.TestCase): { "heading": "## Shared Boundaries", "path": "internal/api/access_control_handlers.go", - "line": 195, + "line": 203, "heading_line": 106, } ],