Add disk usage threshold support for Docker containers

Extends the Docker monitoring and alerting system to track writable layer
usage as a percentage of the container's root filesystem. This helps
identify containers with bloated copy-on-write layers before they
consume excessive disk space.

- Add disk threshold to DockerThresholdConfig (default: 85% trigger, 80% clear)
- Evaluate disk alerts for running containers when RootFilesystemBytes > 0
- Include disk metadata (writable layer, total filesystem, block I/O stats)
- Update frontend to display and configure disk thresholds
- Add test coverage for disk usage alert hysteresis
- Document disk monitoring in DOCKER_MONITORING.md

Per-container and per-host overrides apply to disk thresholds the same
way they do for CPU and memory.
This commit is contained in:
rcourtman 2025-10-29 14:52:25 +00:00
parent 6b670a7af3
commit fb22469eb0
8 changed files with 121 additions and 2 deletions

View file

@ -223,6 +223,7 @@ PROXY_AUTH_LOGOUT_URL=/logout # URL for SSO logout
"storageDefault": { "trigger": 85, "clear": 75 },
"dockerDefaults": {
"cpu": { "trigger": 75, "clear": 60 },
"disk": { "trigger": 85, "clear": 75 },
"restartCount": 3,
"restartWindow": 300,
"memoryWarnPct": 90,
@ -288,6 +289,7 @@ PROXY_AUTH_LOGOUT_URL=/logout # URL for SSO logout
- `dockerIgnoredContainerPrefixes` lets you silence state/metric/restart alerts for ephemeral containers whose names or IDs share a common, case-insensitive prefix. The Docker tab in the UI keeps this list in sync.
- Swarm service alerts track missing replicas: `serviceWarnGapPercent` defines when a warning fires, and `serviceCriticalGapPercent` must be greater than or equal to the warning gap (Pulse automatically clamps the critical value upward if an older client submits something smaller).
- Docker container state controls live in `dockerDefaults`: flip `stateDisableConnectivity` to silence exit/offline alerts globally, or change `statePoweredOffSeverity` to `critical` when you want exiting containers to page immediately. Per-container overrides still take precedence.
- `dockerDefaults.disk` defines the writable-layer usage threshold (the container's writable layer size as a percentage of its root filesystem). Defaults trigger at 85% and clear at 80%, and can be overridden per container or host when noisy workloads need a different window.
- Quiet hours, escalation, deduplication, and restart loop detection are all managed here, and the UI keeps the JSON in sync automatically.
> Tip: Back up `alerts.json` alongside `.env` during exports. Restoring it preserves all overrides, quiet-hour schedules, and webhook routing.

View file

@ -165,6 +165,10 @@ docker run -d \
The agent automatically discovers the Docker socket via the usual environment variables. To use SSH tunnels or TCP sockets, export `DOCKER_HOST` as you would for the Docker CLI.
### Disk usage monitoring & alerts
When `--collect-disk` is enabled (the default), Pulse records each container's writable layer and root filesystem sizes. The Alerts engine treats the proportion of writable data to total filesystem as the disk usage percentage for that container. A fleet-wide threshold lives under **Alerts → Thresholds → Docker Containers** and defaults to 85% trigger / 80% clear; adjust or disable it per host/container when your workload makes heavy use of copy-on-write layers. Containers that stop reporting disk metrics (for example when size queries are disabled) automatically skip the disk alert evaluation.
### Suppressing ephemeral containers
CI runners and short-lived build containers can generate noisy state alerts when they exit on schedule. In Pulse v4.24.0 and later you can provide a list of prefixes to ignore under **Alerts → Thresholds → Docker → Ignored container prefixes**. Any container whose name *or* ID begins with a configured prefix is skipped for state, health, metric, restart-loop, and OOM alerts. Matching is case-insensitive and the list is saved as `dockerIgnoredContainerPrefixes` inside `alerts.json`. Use one entry per family of ephemeral containers (for example, `runner-` or `gitlab-job-`).

View file

@ -181,6 +181,7 @@ interface ThresholdsTableProps {
dockerDefaults: {
cpu: number;
memory: number;
disk: number;
restartCount: number;
restartWindow: number;
memoryWarnPct: number;
@ -197,6 +198,7 @@ interface ThresholdsTableProps {
| {
cpu: number;
memory: number;
disk: number;
restartCount: number;
restartWindow: number;
memoryWarnPct: number;
@ -207,6 +209,7 @@ interface ThresholdsTableProps {
| ((prev: {
cpu: number;
memory: number;
disk: number;
restartCount: number;
restartWindow: number;
memoryWarnPct: number;
@ -216,6 +219,7 @@ interface ThresholdsTableProps {
}) => {
cpu: number;
memory: number;
disk: number;
restartCount: number;
restartWindow: number;
memoryWarnPct: number;
@ -2850,6 +2854,7 @@ export function ThresholdsTable(props: ThresholdsTableProps) {
columns={[
'CPU %',
'Memory %',
'Disk %',
'Restart Count',
'Restart Window (s)',
'Memory Warn %',
@ -2871,6 +2876,7 @@ export function ThresholdsTable(props: ThresholdsTableProps) {
globalDefaults={{
cpu: props.dockerDefaults.cpu,
memory: props.dockerDefaults.memory,
disk: props.dockerDefaults.disk,
restartCount: props.dockerDefaults.restartCount,
restartWindow: props.dockerDefaults.restartWindow,
memoryWarnPct: props.dockerDefaults.memoryWarnPct,
@ -2880,6 +2886,7 @@ export function ThresholdsTable(props: ThresholdsTableProps) {
const current = {
cpu: props.dockerDefaults.cpu,
memory: props.dockerDefaults.memory,
disk: props.dockerDefaults.disk,
restartCount: props.dockerDefaults.restartCount,
restartWindow: props.dockerDefaults.restartWindow,
memoryWarnPct: props.dockerDefaults.memoryWarnPct,
@ -2892,6 +2899,7 @@ export function ThresholdsTable(props: ThresholdsTableProps) {
...prev,
cpu: next.cpu ?? prev.cpu,
memory: next.memory ?? prev.memory,
disk: next.disk ?? prev.disk,
restartCount: next.restartCount ?? prev.restartCount,
restartWindow: next.restartWindow ?? prev.restartWindow,
memoryWarnPct: next.memoryWarnPct ?? prev.memoryWarnPct,

View file

@ -51,6 +51,7 @@ const DEFAULT_PMG_THRESHOLDS: PMGThresholdDefaults = {
const DEFAULT_DOCKER_DEFAULTS = {
cpu: 80,
memory: 85,
disk: 85,
restartCount: 3,
restartWindow: 300,
memoryWarnPct: 90,

View file

@ -842,6 +842,7 @@ const [appriseConfig, setAppriseConfig] = createSignal<UIAppriseConfig>(
setDockerDefaults({
cpu: getTriggerValue(config.dockerDefaults.cpu) ?? 80,
memory: getTriggerValue(config.dockerDefaults.memory) ?? 85,
disk: getTriggerValue(config.dockerDefaults.disk) ?? FACTORY_DOCKER_DEFAULTS.disk,
restartCount: config.dockerDefaults.restartCount ?? 3,
restartWindow: config.dockerDefaults.restartWindow ?? 300,
memoryWarnPct: config.dockerDefaults.memoryWarnPct ?? 90,
@ -1205,6 +1206,7 @@ const [appriseConfig, setAppriseConfig] = createSignal<UIAppriseConfig>(
const FACTORY_DOCKER_DEFAULTS = {
cpu: 80,
memory: 85,
disk: 85,
restartCount: 3,
restartWindow: 300,
memoryWarnPct: 90,
@ -1526,6 +1528,7 @@ const [appriseConfig, setAppriseConfig] = createSignal<UIAppriseConfig>(
dockerDefaults: {
cpu: createHysteresisThreshold(dockerDefaultsValue.cpu),
memory: createHysteresisThreshold(dockerDefaultsValue.memory),
disk: createHysteresisThreshold(dockerDefaultsValue.disk),
restartCount: dockerDefaultsValue.restartCount,
restartWindow: dockerDefaultsValue.restartWindow,
memoryWarnPct: dockerDefaultsValue.memoryWarnPct,

View file

@ -60,6 +60,7 @@ export interface CustomAlertRule {
export interface DockerThresholdConfig {
cpu?: HysteresisThreshold;
memory?: HysteresisThreshold;
disk?: HysteresisThreshold;
restartCount?: number;
restartWindow?: number;
memoryWarnPct?: number;

View file

@ -266,6 +266,7 @@ type CustomAlertRule struct {
type DockerThresholdConfig struct {
CPU HysteresisThreshold `json:"cpu"` // CPU usage % threshold (default: 80%)
Memory HysteresisThreshold `json:"memory"` // Memory usage % threshold (default: 85%)
Disk HysteresisThreshold `json:"disk"` // Writable layer usage % threshold (default: 85%)
RestartCount int `json:"restartCount"` // Number of restarts to trigger alert (default: 3)
RestartWindow int `json:"restartWindow"` // Time window in seconds for restart loop detection (default: 300 = 5min)
MemoryWarnPct int `json:"memoryWarnPct"` // Memory limit % to trigger warning (default: 90)
@ -504,6 +505,7 @@ func NewManager() *Manager {
DockerDefaults: DockerThresholdConfig{
CPU: HysteresisThreshold{Trigger: 80, Clear: 75},
Memory: HysteresisThreshold{Trigger: 85, Clear: 80},
Disk: HysteresisThreshold{Trigger: 85, Clear: 80},
RestartCount: 3,
RestartWindow: 300, // 5 minutes
MemoryWarnPct: 90,
@ -807,6 +809,9 @@ func (m *Manager) UpdateConfig(config AlertConfig) {
if config.DockerDefaults.Memory.Trigger <= 0 {
config.DockerDefaults.Memory = HysteresisThreshold{Trigger: 85, Clear: 80}
}
if config.DockerDefaults.Disk.Trigger <= 0 {
config.DockerDefaults.Disk = HysteresisThreshold{Trigger: 85, Clear: 80}
}
if config.DockerDefaults.RestartCount <= 0 {
config.DockerDefaults.RestartCount = 3
}
@ -2780,7 +2785,7 @@ func (m *Manager) evaluateDockerContainer(host models.DockerHost, container mode
if state != "running" {
m.checkDockerContainerState(host, container, resourceID, containerName, instanceName, nodeName)
m.clearDockerContainerMetricAlerts(resourceID, "cpu", "memory")
m.clearDockerContainerMetricAlerts(resourceID, "cpu", "memory", "disk")
} else {
m.clearDockerContainerStateAlert(resourceID)
@ -2788,6 +2793,7 @@ func (m *Manager) evaluateDockerContainer(host models.DockerHost, container mode
thresholds := ThresholdConfig{
CPU: &m.config.DockerDefaults.CPU,
Memory: &m.config.DockerDefaults.Memory,
Disk: &m.config.DockerDefaults.Disk,
}
if hasOverride {
thresholds = m.applyThresholdOverride(thresholds, overrideConfig)
@ -2832,6 +2838,38 @@ func (m *Manager) evaluateDockerContainer(host models.DockerHost, container mode
}
m.checkMetric(resourceID, containerName, nodeName, instanceName, resourceType, "memory", container.MemoryPercent, thresholds.Memory, &metricOptions{Metadata: memMetadata})
}
if thresholds.Disk != nil {
totalBytes := container.RootFilesystemBytes
usedBytes := container.WritableLayerBytes
if totalBytes > 0 && usedBytes >= 0 {
diskPercent := (float64(usedBytes) / float64(totalBytes)) * 100
diskMetadata := map[string]interface{}{
"resourceType": resourceType,
"hostId": host.ID,
"hostName": host.DisplayName,
"hostHostname": host.Hostname,
"containerId": container.ID,
"containerName": containerName,
"image": container.Image,
"state": container.State,
"status": container.Status,
"restartCount": container.RestartCount,
"metric": "disk",
"diskPercent": diskPercent,
"writableLayerBytes": usedBytes,
"rootFilesystemBytes": totalBytes,
"mountCount": len(container.Mounts),
}
if container.BlockIO != nil {
diskMetadata["blockIoReadBytes"] = container.BlockIO.ReadBytes
diskMetadata["blockIoWriteBytes"] = container.BlockIO.WriteBytes
}
m.checkMetric(resourceID, containerName, nodeName, instanceName, resourceType, "disk", diskPercent, thresholds.Disk, &metricOptions{Metadata: diskMetadata})
} else {
m.clearDockerContainerMetricAlerts(resourceID, "disk")
}
}
}
m.checkDockerContainerHealth(host, container, resourceID, containerName, instanceName, nodeName)
@ -3552,7 +3590,7 @@ func (m *Manager) checkDockerContainerMemoryLimit(host models.DockerHost, contai
func (m *Manager) clearDockerContainerMetricAlerts(resourceID string, metrics ...string) {
if len(metrics) == 0 {
metrics = []string{"cpu", "memory"}
metrics = []string{"cpu", "memory", "disk"}
}
for _, metric := range metrics {
alertID := fmt.Sprintf("%s-%s", resourceID, metric)

View file

@ -1146,6 +1146,68 @@ func TestDockerContainerMemoryLimitHysteresis(t *testing.T) {
}
}
// TestDockerContainerDiskUsageAlert verifies the disk-usage hysteresis for a
// running container: 80% writable-layer usage against a 75/65 threshold must
// raise a warning alert carrying disk metadata, and dropping usage below the
// clear threshold must resolve that alert.
func TestDockerContainerDiskUsageAlert(t *testing.T) {
	mgr := NewManager()

	// Configure immediate (zero-delay) evaluation with a 75% trigger / 65% clear window.
	conf := mgr.GetConfig()
	conf.Enabled = true
	conf.TimeThreshold = 0
	if conf.TimeThresholds == nil {
		conf.TimeThresholds = make(map[string]int)
	}
	conf.TimeThresholds["docker"] = 0
	conf.TimeThresholds["guest"] = 0
	conf.DockerDefaults.Disk = HysteresisThreshold{Trigger: 75, Clear: 65}
	mgr.UpdateConfig(conf)

	const gib = 1 << 30

	// 8 GiB writable layer over a 10 GiB root filesystem = 80% usage (above trigger).
	container := models.DockerContainer{
		ID:                  "container-disk",
		Name:                "disk-hog",
		State:               "running",
		Status:              "Up 5 minutes",
		WritableLayerBytes:  int64(8 * gib),
		RootFilesystemBytes: int64(10 * gib),
	}
	host := models.DockerHost{
		ID:          "host-disk",
		DisplayName: "Docker Host",
		Hostname:    "docker.disk",
		Containers:  []models.DockerContainer{container},
	}

	mgr.CheckDockerHost(host)

	alertID := fmt.Sprintf("%s-%s", dockerResourceID(host.ID, container.ID), "disk")
	alert, exists := mgr.activeAlerts[alertID]
	if !exists {
		t.Fatalf("expected docker container disk alert %s to be raised", alertID)
	}
	if alert.Level != AlertLevelWarning {
		t.Fatalf("expected warning severity for disk usage alert, got %s", alert.Level)
	}
	if alert.Metadata == nil {
		t.Fatalf("expected disk alert metadata to be populated")
	}
	percent, ok := alert.Metadata["diskPercent"].(float64)
	if !ok || percent < 79.5 || percent > 80.5 {
		t.Fatalf("expected diskPercent metadata to be ~80%%, got %v", alert.Metadata["diskPercent"])
	}
	used, ok := alert.Metadata["writableLayerBytes"].(int64)
	if !ok || used != int64(8*gib) {
		t.Fatalf("expected writableLayerBytes metadata to be %d, got %v", int64(8*gib), alert.Metadata["writableLayerBytes"])
	}

	// Shrink usage to 40%, well under the 65% clear threshold, and re-evaluate:
	// the hysteresis must resolve the active alert.
	host.Containers[0].WritableLayerBytes = int64(4 * gib)
	mgr.CheckDockerHost(host)
	if _, stillActive := mgr.activeAlerts[alertID]; stillActive {
		t.Fatalf("expected docker container disk alert %s to clear after usage dropped", alertID)
	}
}
func TestUpdateConfigClampsDockerServiceCriticalGap(t *testing.T) {
t.Parallel()