mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-04-28 03:20:11 +00:00
Add disk usage threshold support for Docker containers
Extends the Docker monitoring and alerting system to track writable layer usage as a percentage of the container's root filesystem. This helps identify containers with bloated copy-on-write layers before they consume excessive disk space. - Add disk threshold to DockerThresholdConfig (default: 85% trigger, 80% clear) - Evaluate disk alerts for running containers when RootFilesystemBytes > 0 - Include disk metadata (writable layer, total filesystem, block I/O stats) - Update frontend to display and configure disk thresholds - Add test coverage for disk usage alert hysteresis - Document disk monitoring in DOCKER_MONITORING.md Per-container and per-host overrides apply to disk thresholds the same way they do for CPU and memory.
This commit is contained in:
parent
6b670a7af3
commit
fb22469eb0
8 changed files with 121 additions and 2 deletions
|
|
@ -223,6 +223,7 @@ PROXY_AUTH_LOGOUT_URL=/logout # URL for SSO logout
|
|||
"storageDefault": { "trigger": 85, "clear": 75 },
|
||||
"dockerDefaults": {
|
||||
"cpu": { "trigger": 75, "clear": 60 },
|
||||
"disk": { "trigger": 85, "clear": 80 },
|
||||
"restartCount": 3,
|
||||
"restartWindow": 300,
|
||||
"memoryWarnPct": 90,
|
||||
|
|
@ -288,6 +289,7 @@ PROXY_AUTH_LOGOUT_URL=/logout # URL for SSO logout
|
|||
- `dockerIgnoredContainerPrefixes` lets you silence state/metric/restart alerts for ephemeral containers whose names or IDs share a common, case-insensitive prefix. The Docker tab in the UI keeps this list in sync.
|
||||
- Swarm service alerts track missing replicas: `serviceWarnGapPercent` defines when a warning fires, and `serviceCriticalGapPercent` must be greater than or equal to the warning gap (Pulse automatically clamps the critical value upward if an older client submits something smaller).
|
||||
- Docker container state controls live in `dockerDefaults`: flip `stateDisableConnectivity` to silence exit/offline alerts globally, or change `statePoweredOffSeverity` to `critical` when you want exiting containers to page immediately. Per-container overrides still take precedence.
|
||||
- `dockerDefaults.disk` defines the writable-layer usage threshold (% of the container's upper filesystem compared to its base image). Defaults trigger at 85% and clear at 80%, and can be overridden per container or host when noisy workloads need a different window.
|
||||
- Quiet hours, escalation, deduplication, and restart loop detection are all managed here, and the UI keeps the JSON in sync automatically.
|
||||
|
||||
> Tip: Back up `alerts.json` alongside `.env` during exports. Restoring it preserves all overrides, quiet-hour schedules, and webhook routing.
|
||||
|
|
|
|||
|
|
@ -165,6 +165,10 @@ docker run -d \
|
|||
|
||||
The agent automatically discovers the Docker socket via the usual environment variables. To use SSH tunnels or TCP sockets, export `DOCKER_HOST` as you would for the Docker CLI.
|
||||
|
||||
### Disk usage monitoring & alerts
|
||||
|
||||
When `--collect-disk` is enabled (the default), Pulse records each container’s writable layer and root filesystem sizes. The Alerts engine treats the proportion of writable data to total filesystem as the disk usage percentage for that container. A fleet-wide threshold lives under **Alerts → Thresholds → Docker Containers** and defaults to 85% trigger / 80% clear; adjust or disable it per host/container when your workload makes heavy use of copy-on-write layers. Containers that stop reporting disk metrics (for example when size queries are disabled) automatically skip the disk alert evaluation.
|
||||
|
||||
### Suppressing ephemeral containers
|
||||
|
||||
CI runners and short-lived build containers can generate noisy state alerts when they exit on schedule. In Pulse v4.24.0 and later you can provide a list of prefixes to ignore under **Alerts → Thresholds → Docker → Ignored container prefixes**. Any container whose name *or* ID begins with a configured prefix is skipped for state, health, metric, restart-loop, and OOM alerts. Matching is case-insensitive and the list is saved as `dockerIgnoredContainerPrefixes` inside `alerts.json`. Use one entry per family of ephemeral containers (for example, `runner-` or `gitlab-job-`).
|
||||
|
|
|
|||
|
|
@ -181,6 +181,7 @@ interface ThresholdsTableProps {
|
|||
dockerDefaults: {
|
||||
cpu: number;
|
||||
memory: number;
|
||||
disk: number;
|
||||
restartCount: number;
|
||||
restartWindow: number;
|
||||
memoryWarnPct: number;
|
||||
|
|
@ -197,6 +198,7 @@ interface ThresholdsTableProps {
|
|||
| {
|
||||
cpu: number;
|
||||
memory: number;
|
||||
disk: number;
|
||||
restartCount: number;
|
||||
restartWindow: number;
|
||||
memoryWarnPct: number;
|
||||
|
|
@ -207,6 +209,7 @@ interface ThresholdsTableProps {
|
|||
| ((prev: {
|
||||
cpu: number;
|
||||
memory: number;
|
||||
disk: number;
|
||||
restartCount: number;
|
||||
restartWindow: number;
|
||||
memoryWarnPct: number;
|
||||
|
|
@ -216,6 +219,7 @@ interface ThresholdsTableProps {
|
|||
}) => {
|
||||
cpu: number;
|
||||
memory: number;
|
||||
disk: number;
|
||||
restartCount: number;
|
||||
restartWindow: number;
|
||||
memoryWarnPct: number;
|
||||
|
|
@ -2850,6 +2854,7 @@ export function ThresholdsTable(props: ThresholdsTableProps) {
|
|||
columns={[
|
||||
'CPU %',
|
||||
'Memory %',
|
||||
'Disk %',
|
||||
'Restart Count',
|
||||
'Restart Window (s)',
|
||||
'Memory Warn %',
|
||||
|
|
@ -2871,6 +2876,7 @@ export function ThresholdsTable(props: ThresholdsTableProps) {
|
|||
globalDefaults={{
|
||||
cpu: props.dockerDefaults.cpu,
|
||||
memory: props.dockerDefaults.memory,
|
||||
disk: props.dockerDefaults.disk,
|
||||
restartCount: props.dockerDefaults.restartCount,
|
||||
restartWindow: props.dockerDefaults.restartWindow,
|
||||
memoryWarnPct: props.dockerDefaults.memoryWarnPct,
|
||||
|
|
@ -2880,6 +2886,7 @@ export function ThresholdsTable(props: ThresholdsTableProps) {
|
|||
const current = {
|
||||
cpu: props.dockerDefaults.cpu,
|
||||
memory: props.dockerDefaults.memory,
|
||||
disk: props.dockerDefaults.disk,
|
||||
restartCount: props.dockerDefaults.restartCount,
|
||||
restartWindow: props.dockerDefaults.restartWindow,
|
||||
memoryWarnPct: props.dockerDefaults.memoryWarnPct,
|
||||
|
|
@ -2892,6 +2899,7 @@ export function ThresholdsTable(props: ThresholdsTableProps) {
|
|||
...prev,
|
||||
cpu: next.cpu ?? prev.cpu,
|
||||
memory: next.memory ?? prev.memory,
|
||||
disk: next.disk ?? prev.disk,
|
||||
restartCount: next.restartCount ?? prev.restartCount,
|
||||
restartWindow: next.restartWindow ?? prev.restartWindow,
|
||||
memoryWarnPct: next.memoryWarnPct ?? prev.memoryWarnPct,
|
||||
|
|
|
|||
|
|
@ -51,6 +51,7 @@ const DEFAULT_PMG_THRESHOLDS: PMGThresholdDefaults = {
|
|||
const DEFAULT_DOCKER_DEFAULTS = {
|
||||
cpu: 80,
|
||||
memory: 85,
|
||||
disk: 85,
|
||||
restartCount: 3,
|
||||
restartWindow: 300,
|
||||
memoryWarnPct: 90,
|
||||
|
|
|
|||
|
|
@ -842,6 +842,7 @@ const [appriseConfig, setAppriseConfig] = createSignal<UIAppriseConfig>(
|
|||
setDockerDefaults({
|
||||
cpu: getTriggerValue(config.dockerDefaults.cpu) ?? 80,
|
||||
memory: getTriggerValue(config.dockerDefaults.memory) ?? 85,
|
||||
disk: getTriggerValue(config.dockerDefaults.disk) ?? FACTORY_DOCKER_DEFAULTS.disk,
|
||||
restartCount: config.dockerDefaults.restartCount ?? 3,
|
||||
restartWindow: config.dockerDefaults.restartWindow ?? 300,
|
||||
memoryWarnPct: config.dockerDefaults.memoryWarnPct ?? 90,
|
||||
|
|
@ -1205,6 +1206,7 @@ const [appriseConfig, setAppriseConfig] = createSignal<UIAppriseConfig>(
|
|||
const FACTORY_DOCKER_DEFAULTS = {
|
||||
cpu: 80,
|
||||
memory: 85,
|
||||
disk: 85,
|
||||
restartCount: 3,
|
||||
restartWindow: 300,
|
||||
memoryWarnPct: 90,
|
||||
|
|
@ -1526,6 +1528,7 @@ const [appriseConfig, setAppriseConfig] = createSignal<UIAppriseConfig>(
|
|||
dockerDefaults: {
|
||||
cpu: createHysteresisThreshold(dockerDefaultsValue.cpu),
|
||||
memory: createHysteresisThreshold(dockerDefaultsValue.memory),
|
||||
disk: createHysteresisThreshold(dockerDefaultsValue.disk),
|
||||
restartCount: dockerDefaultsValue.restartCount,
|
||||
restartWindow: dockerDefaultsValue.restartWindow,
|
||||
memoryWarnPct: dockerDefaultsValue.memoryWarnPct,
|
||||
|
|
|
|||
|
|
@ -60,6 +60,7 @@ export interface CustomAlertRule {
|
|||
export interface DockerThresholdConfig {
|
||||
cpu?: HysteresisThreshold;
|
||||
memory?: HysteresisThreshold;
|
||||
disk?: HysteresisThreshold;
|
||||
restartCount?: number;
|
||||
restartWindow?: number;
|
||||
memoryWarnPct?: number;
|
||||
|
|
|
|||
|
|
@ -266,6 +266,7 @@ type CustomAlertRule struct {
|
|||
type DockerThresholdConfig struct {
|
||||
CPU HysteresisThreshold `json:"cpu"` // CPU usage % threshold (default: 80%)
|
||||
Memory HysteresisThreshold `json:"memory"` // Memory usage % threshold (default: 85%)
|
||||
Disk HysteresisThreshold `json:"disk"` // Writable layer usage % threshold (default: 85%)
|
||||
RestartCount int `json:"restartCount"` // Number of restarts to trigger alert (default: 3)
|
||||
RestartWindow int `json:"restartWindow"` // Time window in seconds for restart loop detection (default: 300 = 5min)
|
||||
MemoryWarnPct int `json:"memoryWarnPct"` // Memory limit % to trigger warning (default: 90)
|
||||
|
|
@ -504,6 +505,7 @@ func NewManager() *Manager {
|
|||
DockerDefaults: DockerThresholdConfig{
|
||||
CPU: HysteresisThreshold{Trigger: 80, Clear: 75},
|
||||
Memory: HysteresisThreshold{Trigger: 85, Clear: 80},
|
||||
Disk: HysteresisThreshold{Trigger: 85, Clear: 80},
|
||||
RestartCount: 3,
|
||||
RestartWindow: 300, // 5 minutes
|
||||
MemoryWarnPct: 90,
|
||||
|
|
@ -807,6 +809,9 @@ func (m *Manager) UpdateConfig(config AlertConfig) {
|
|||
if config.DockerDefaults.Memory.Trigger <= 0 {
|
||||
config.DockerDefaults.Memory = HysteresisThreshold{Trigger: 85, Clear: 80}
|
||||
}
|
||||
if config.DockerDefaults.Disk.Trigger <= 0 {
|
||||
config.DockerDefaults.Disk = HysteresisThreshold{Trigger: 85, Clear: 80}
|
||||
}
|
||||
if config.DockerDefaults.RestartCount <= 0 {
|
||||
config.DockerDefaults.RestartCount = 3
|
||||
}
|
||||
|
|
@ -2780,7 +2785,7 @@ func (m *Manager) evaluateDockerContainer(host models.DockerHost, container mode
|
|||
|
||||
if state != "running" {
|
||||
m.checkDockerContainerState(host, container, resourceID, containerName, instanceName, nodeName)
|
||||
m.clearDockerContainerMetricAlerts(resourceID, "cpu", "memory")
|
||||
m.clearDockerContainerMetricAlerts(resourceID, "cpu", "memory", "disk")
|
||||
} else {
|
||||
m.clearDockerContainerStateAlert(resourceID)
|
||||
|
||||
|
|
@ -2788,6 +2793,7 @@ func (m *Manager) evaluateDockerContainer(host models.DockerHost, container mode
|
|||
thresholds := ThresholdConfig{
|
||||
CPU: &m.config.DockerDefaults.CPU,
|
||||
Memory: &m.config.DockerDefaults.Memory,
|
||||
Disk: &m.config.DockerDefaults.Disk,
|
||||
}
|
||||
if hasOverride {
|
||||
thresholds = m.applyThresholdOverride(thresholds, overrideConfig)
|
||||
|
|
@ -2832,6 +2838,38 @@ func (m *Manager) evaluateDockerContainer(host models.DockerHost, container mode
|
|||
}
|
||||
m.checkMetric(resourceID, containerName, nodeName, instanceName, resourceType, "memory", container.MemoryPercent, thresholds.Memory, &metricOptions{Metadata: memMetadata})
|
||||
}
|
||||
|
||||
if thresholds.Disk != nil {
|
||||
totalBytes := container.RootFilesystemBytes
|
||||
usedBytes := container.WritableLayerBytes
|
||||
if totalBytes > 0 && usedBytes >= 0 {
|
||||
diskPercent := (float64(usedBytes) / float64(totalBytes)) * 100
|
||||
diskMetadata := map[string]interface{}{
|
||||
"resourceType": resourceType,
|
||||
"hostId": host.ID,
|
||||
"hostName": host.DisplayName,
|
||||
"hostHostname": host.Hostname,
|
||||
"containerId": container.ID,
|
||||
"containerName": containerName,
|
||||
"image": container.Image,
|
||||
"state": container.State,
|
||||
"status": container.Status,
|
||||
"restartCount": container.RestartCount,
|
||||
"metric": "disk",
|
||||
"diskPercent": diskPercent,
|
||||
"writableLayerBytes": usedBytes,
|
||||
"rootFilesystemBytes": totalBytes,
|
||||
"mountCount": len(container.Mounts),
|
||||
}
|
||||
if container.BlockIO != nil {
|
||||
diskMetadata["blockIoReadBytes"] = container.BlockIO.ReadBytes
|
||||
diskMetadata["blockIoWriteBytes"] = container.BlockIO.WriteBytes
|
||||
}
|
||||
m.checkMetric(resourceID, containerName, nodeName, instanceName, resourceType, "disk", diskPercent, thresholds.Disk, &metricOptions{Metadata: diskMetadata})
|
||||
} else {
|
||||
m.clearDockerContainerMetricAlerts(resourceID, "disk")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
m.checkDockerContainerHealth(host, container, resourceID, containerName, instanceName, nodeName)
|
||||
|
|
@ -3552,7 +3590,7 @@ func (m *Manager) checkDockerContainerMemoryLimit(host models.DockerHost, contai
|
|||
|
||||
func (m *Manager) clearDockerContainerMetricAlerts(resourceID string, metrics ...string) {
|
||||
if len(metrics) == 0 {
|
||||
metrics = []string{"cpu", "memory"}
|
||||
metrics = []string{"cpu", "memory", "disk"}
|
||||
}
|
||||
for _, metric := range metrics {
|
||||
alertID := fmt.Sprintf("%s-%s", resourceID, metric)
|
||||
|
|
|
|||
|
|
@ -1146,6 +1146,68 @@ func TestDockerContainerMemoryLimitHysteresis(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestDockerContainerDiskUsageAlert(t *testing.T) {
|
||||
m := NewManager()
|
||||
|
||||
cfg := m.GetConfig()
|
||||
cfg.Enabled = true
|
||||
cfg.TimeThreshold = 0
|
||||
if cfg.TimeThresholds == nil {
|
||||
cfg.TimeThresholds = make(map[string]int)
|
||||
}
|
||||
cfg.TimeThresholds["docker"] = 0
|
||||
cfg.TimeThresholds["guest"] = 0
|
||||
cfg.DockerDefaults.Disk = HysteresisThreshold{Trigger: 75, Clear: 65}
|
||||
m.UpdateConfig(cfg)
|
||||
|
||||
const gib = 1024 * 1024 * 1024
|
||||
|
||||
host := models.DockerHost{
|
||||
ID: "host-disk",
|
||||
DisplayName: "Docker Host",
|
||||
Hostname: "docker.disk",
|
||||
Containers: []models.DockerContainer{
|
||||
{
|
||||
ID: "container-disk",
|
||||
Name: "disk-hog",
|
||||
State: "running",
|
||||
Status: "Up 5 minutes",
|
||||
WritableLayerBytes: int64(8 * gib),
|
||||
RootFilesystemBytes: int64(10 * gib),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
m.CheckDockerHost(host)
|
||||
|
||||
resourceID := dockerResourceID(host.ID, host.Containers[0].ID)
|
||||
alertID := fmt.Sprintf("%s-%s", resourceID, "disk")
|
||||
alert, exists := m.activeAlerts[alertID]
|
||||
if !exists {
|
||||
t.Fatalf("expected docker container disk alert %s to be raised", alertID)
|
||||
}
|
||||
if alert.Level != AlertLevelWarning {
|
||||
t.Fatalf("expected warning severity for disk usage alert, got %s", alert.Level)
|
||||
}
|
||||
if alert.Metadata == nil {
|
||||
t.Fatalf("expected disk alert metadata to be populated")
|
||||
}
|
||||
if percent, ok := alert.Metadata["diskPercent"].(float64); !ok || percent < 79.5 || percent > 80.5 {
|
||||
t.Fatalf("expected diskPercent metadata to be ~80%%, got %v", alert.Metadata["diskPercent"])
|
||||
}
|
||||
if used, ok := alert.Metadata["writableLayerBytes"].(int64); !ok || used != int64(8*gib) {
|
||||
t.Fatalf("expected writableLayerBytes metadata to be %d, got %v", int64(8*gib), alert.Metadata["writableLayerBytes"])
|
||||
}
|
||||
|
||||
// Drop usage below the clear threshold and ensure the alert resolves.
|
||||
host.Containers[0].WritableLayerBytes = int64(4 * gib)
|
||||
m.CheckDockerHost(host)
|
||||
|
||||
if _, stillActive := m.activeAlerts[alertID]; stillActive {
|
||||
t.Fatalf("expected docker container disk alert %s to clear after usage dropped", alertID)
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateConfigClampsDockerServiceCriticalGap(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue