From 1816e2dbb8a11e4f383da4b7716e7e1c90577761 Mon Sep 17 00:00:00 2001
From: rcourtman <courtmanr@gmail.com>
Date: Sat, 10 Jan 2026 15:25:28 +0000
Subject: [PATCH] fix(agent): use dataset used capacity for RAIDZ pools instead
 of zpool alloc

For RAIDZ pools, zpool ALLOC includes parity overhead, but users expect
to see actual data usage. The agent now reports the dataset's Used value
(from statfs) whenever the dataset's Total is smaller than the zpool
SIZE, the same condition the existing total-capacity fix relies on.

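Fixes the second part of #1052, where used capacity was inflated.

Also included: the frontend WebSocket store now skips transient empty
kubernetesClusters payloads (same protection as dockerHosts/hosts). An
empty update is applied only on initial state, when no non-empty payload
has been seen yet, when all known clusters are stale (>60s), or after
three consecutive empty payloads.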
---
 frontend-modern/src/stores/websocket.ts | 57 +++++++++++++++++++++++--
 internal/hostmetrics/zfs.go             | 10 +++--
 internal/hostmetrics/zfs_test.go        | 20 +++++----
 3 files changed, 71 insertions(+), 16 deletions(-)

diff --git a/frontend-modern/src/stores/websocket.ts b/frontend-modern/src/stores/websocket.ts
index 1d613d530..efbb51251 100644
--- a/frontend-modern/src/stores/websocket.ts
+++ b/frontend-modern/src/stores/websocket.ts
@@ -99,6 +99,11 @@ export function createWebSocketStore(url: string) {
   let consecutiveEmptyHostUpdates = 0;
   let hasReceivedNonEmptyHosts = false;
 
+  // Track consecutive empty Kubernetes clusters payloads (same protection as dockerHosts/hosts)
+  // This prevents clusters from disappearing when transient empty arrays are received.
+  let consecutiveEmptyK8sUpdates = 0;
+  let hasReceivedNonEmptyK8sClusters = false;
+
   const mergeDockerHostRevocations = (incomingHosts: DockerHost[]) => {
     if (!Array.isArray(incomingHosts) || incomingHosts.length === 0) {
       return incomingHosts;
@@ -311,6 +316,8 @@ export function createWebSocketStore(url: string) {
       hasReceivedNonEmptyDockerHosts = false;
       consecutiveEmptyHostUpdates = 0;
       hasReceivedNonEmptyHosts = false;
+      consecutiveEmptyK8sUpdates = 0;
+      hasReceivedNonEmptyK8sClusters = false;
 
       // Start heartbeat to keep connection alive
       if (heartbeatInterval) {
@@ -597,11 +604,53 @@ export function createWebSocketStore(url: string) {
                 : [];
               setState('removedDockerHosts', reconcile(removed, { key: 'id' }));
             }
+            // Process Kubernetes clusters with transient empty payload protection
+            // (same logic as dockerHosts/hosts to prevent UI flapping)
             if (message.data.kubernetesClusters !== undefined) {
-              const clusters = Array.isArray(message.data.kubernetesClusters)
-                ? (message.data.kubernetesClusters as KubernetesCluster[])
-                : [];
-              setState('kubernetesClusters', reconcile(clusters, { key: 'id' }));
+              if (Array.isArray(message.data.kubernetesClusters)) {
+                const incomingClusters = message.data.kubernetesClusters as KubernetesCluster[];
+                if (incomingClusters.length === 0) {
+                  consecutiveEmptyK8sUpdates += 1;
+
+                  // Check if all existing clusters are stale (>60s since lastSeen)
+                  // If so, they're probably really gone - apply the empty update immediately
+                  const now = Date.now();
+                  const staleThresholdMs = 60_000; // 60 seconds
+                  const existingClusters = state.kubernetesClusters || [];
+                  const allStale = existingClusters.length === 0 || existingClusters.every(
+                    (c) => !c.lastSeen || (now - c.lastSeen) > staleThresholdMs
+                  );
+
+                  const shouldApply =
+                    !hasReceivedNonEmptyK8sClusters ||
+                    allStale ||
+                    consecutiveEmptyK8sUpdates >= 3 ||
+                    message.type === WEBSOCKET.MESSAGE_TYPES.INITIAL_STATE;
+
+                  if (shouldApply) {
+                    logger.debug('[WebSocket] Updating kubernetesClusters', {
+                      count: incomingClusters.length,
+                      reason: allStale ? 'allStale' : 'threshold',
+                    });
+                    setState('kubernetesClusters', reconcile(incomingClusters, { key: 'id' }));
+                  } else {
+                    logger.debug('[WebSocket] Skipping transient empty kubernetesClusters payload', {
+                      streak: consecutiveEmptyK8sUpdates,
+                    });
+                  }
+                } else {
+                  consecutiveEmptyK8sUpdates = 0;
+                  hasReceivedNonEmptyK8sClusters = true;
+                  logger.debug('[WebSocket] Updating kubernetesClusters', {
+                    count: incomingClusters.length,
+                  });
+                  setState('kubernetesClusters', reconcile(incomingClusters, { key: 'id' }));
+                }
+              } else {
+                logger.warn('[WebSocket] Received non-array kubernetesClusters payload', {
+                  type: typeof message.data.kubernetesClusters,
+                });
+              }
             }
             if (message.data.removedKubernetesClusters !== undefined) {
               const removed = Array.isArray(message.data.removedKubernetesClusters)
diff --git a/internal/hostmetrics/zfs.go b/internal/hostmetrics/zfs.go
index bb57b419b..7062b190f 100644
--- a/internal/hostmetrics/zfs.go
+++ b/internal/hostmetrics/zfs.go
@@ -74,22 +74,26 @@ func disksFromZpoolStats(
 			// For RAIDZ/mirror pools, zpool SIZE is raw capacity (sum of all disks),
 			// but users expect usable capacity (accounting for parity/redundancy).
 			// The dataset's Total (from statfs) gives usable capacity.
-			// Use dataset stats when available and smaller than zpool size. (issue #1052)
+			// Similarly, zpool ALLOC includes parity overhead, but dataset Used gives
+			// actual data usage. Use dataset stats when available and smaller than
+			// zpool size. (issue #1052)
 			totalBytes := stat.Size
+			usedBytes := stat.Alloc
 			freeBytes := stat.Free
 			if ds.Total > 0 && ds.Total < stat.Size {
 				totalBytes = ds.Total
+				usedBytes = ds.Used
 				freeBytes = ds.Free
 			}
 
-			usage := clampPercent(calculatePercent(totalBytes, stat.Alloc))
+			usage := clampPercent(calculatePercent(totalBytes, usedBytes))
 			disks = append(disks, agentshost.Disk{
 				Device:     pool,
 				Mountpoint: mp,
 				Filesystem: "zfs",
 				Type:       "zfs",
 				TotalBytes: int64(totalBytes),
-				UsedBytes:  int64(stat.Alloc),
+				UsedBytes:  int64(usedBytes),
 				FreeBytes:  int64(freeBytes),
 				Usage:      usage,
 			})
diff --git a/internal/hostmetrics/zfs_test.go b/internal/hostmetrics/zfs_test.go
index 56b6cd2c7..f5b722db2 100644
--- a/internal/hostmetrics/zfs_test.go
+++ b/internal/hostmetrics/zfs_test.go
@@ -53,15 +53,17 @@ func TestSummarizeZFSPoolsRAIDZCapacity(t *testing.T) {
 	// Simulate a RAIDZ1 pool with 3 disks:
 	// - Raw SIZE from zpool list: 43.6 TB (sum of all disks)
 	// - Usable capacity from statfs: 29 TB (after RAIDZ1 parity overhead)
+	// - zpool ALLOC: 7 GB (includes parity data)
+	// - zfs USED: 4.6 GB (actual user data)
 	queryZpoolStats = func(ctx context.Context, pools []string) (map[string]zpoolStats, error) {
 		return map[string]zpoolStats{
-			"Main": {Size: 43600000000000, Alloc: 962000000, Free: 43599038000000},
+			"Main": {Size: 43600000000000, Alloc: 7000000000, Free: 43593000000000},
 		}, nil
 	}
 
-	// Dataset stats from statfs reflect usable capacity (29 TB)
+	// Dataset stats from statfs reflect usable capacity (29 TB) and actual data usage (4.6 GB)
 	datasets := []zfsDatasetUsage{
-		{Pool: "Main", Dataset: "Main", Mountpoint: "/mnt/Main", Total: 29000000000000, Used: 962000000, Free: 28999038000000},
+		{Pool: "Main", Dataset: "Main", Mountpoint: "/mnt/Main", Total: 29000000000000, Used: 4600000000, Free: 28995400000000},
 	}
 
 	disks := summarizeZFSPools(context.Background(), datasets)
@@ -80,20 +82,20 @@ func TestSummarizeZFSPoolsRAIDZCapacity(t *testing.T) {
 		t.Errorf("expected TotalBytes %d (usable capacity), got %d (might be using raw capacity)", expectedTotal, main.TotalBytes)
 	}
 
-	// Used should come from zpool stats (accurate allocation)
-	expectedUsed := int64(962000000)
+	// Used should come from dataset stats (4.6 GB actual data), not zpool alloc (7 GB with parity)
+	expectedUsed := int64(4600000000)
 	if main.UsedBytes != expectedUsed {
-		t.Errorf("expected UsedBytes %d, got %d", expectedUsed, main.UsedBytes)
+		t.Errorf("expected UsedBytes %d (dataset used), got %d (might be using zpool alloc which includes parity)", expectedUsed, main.UsedBytes)
 	}
 
 	// Free should use dataset stats when we're using dataset Total
-	expectedFree := int64(28999038000000)
+	expectedFree := int64(28995400000000)
 	if main.FreeBytes != expectedFree {
 		t.Errorf("expected FreeBytes %d, got %d", expectedFree, main.FreeBytes)
 	}
 
-	// Usage should be calculated against usable capacity
-	// 962000000 / 29000000000000 * 100 ≈ 0.003%
+	// Usage should be calculated against usable capacity with actual used data
+	// 4600000000 / 29000000000000 * 100 ≈ 0.016%
 	if main.Usage > 0.1 {
 		t.Errorf("expected usage ~0%%, got %.2f%% (might be calculated against wrong total)", main.Usage)
 	}