Consolidate pending changes

- Add Docker metadata test comment
- Update alerts configuration and thresholds
- Enhance config file watcher
- Update documentation
- Refine settings UI
This commit is contained in:
rcourtman 2025-10-28 23:20:44 +00:00
parent bd4f12c98f
commit b3285c05c8
14 changed files with 311 additions and 106 deletions

View file

@ -228,7 +228,9 @@ PROXY_AUTH_LOGOUT_URL=/logout # URL for SSO logout
"memoryWarnPct": 90,
"memoryCriticalPct": 95,
"serviceWarnGapPercent": 10,
"serviceCriticalGapPercent": 50
"serviceCriticalGapPercent": 50,
"stateDisableConnectivity": false,
"statePoweredOffSeverity": "warning"
},
"dockerIgnoredContainerPrefixes": [
"runner-",
@ -285,6 +287,7 @@ PROXY_AUTH_LOGOUT_URL=/logout # URL for SSO logout
- `overrides` are indexed by the stable resource ID returned from `/api/state` (VMs: `instance/qemu/vmid`, containers: `instance/lxc/ctid`, nodes: `instance/node`).
- `dockerIgnoredContainerPrefixes` lets you silence state/metric/restart alerts for ephemeral containers whose names or IDs share a common, case-insensitive prefix. The Docker tab in the UI keeps this list in sync.
- Swarm service alerts track missing replicas: `serviceWarnGapPercent` defines when a warning fires, and `serviceCriticalGapPercent` must be greater than or equal to the warning gap (Pulse automatically clamps the critical value upward if an older client submits something smaller).
- Docker container state controls live in `dockerDefaults`: set `stateDisableConnectivity` to `true` (default `false`) to silence exit/offline alerts globally, or change `statePoweredOffSeverity` from its default of `warning` to `critical` when you want exiting containers to page immediately. Per-container overrides still take precedence.
- Quiet hours, escalation, deduplication, and restart loop detection are all managed here, and the UI keeps the JSON in sync automatically.
> Tip: Back up `alerts.json` alongside `.env` during exports. Restoring it preserves all overrides, quiet-hour schedules, and webhook routing.

View file

@ -167,6 +167,8 @@ The agent automatically discovers the Docker socket via the usual environment va
CI runners and short-lived build containers can generate noisy state alerts when they exit on schedule. In Pulse v4.24.0 and later you can provide a list of prefixes to ignore under **Alerts → Thresholds → Docker → Ignored container prefixes**. Any container whose name *or* ID begins with a configured prefix is skipped for state, health, metric, restart-loop, and OOM alerts. Matching is case-insensitive and the list is saved as `dockerIgnoredContainerPrefixes` inside `alerts.json`. Use one entry per family of ephemeral containers (for example, `runner-` or `gitlab-job-`).
Want to keep the alerts but change their severity? The same Docker tab exposes global controls for the container state detector. Enable **Disable container state alerts** (`stateDisableConnectivity`) to mute powered-off/offline warnings across the fleet, or change **Default severity** (`statePoweredOffSeverity`) to `critical` so unexpected exits page immediately. Individual host/container overrides still win when you need exceptions.
## Testing and troubleshooting
- Run with `--interval 15s --insecure` in a terminal to see log output while testing.

View file

@ -188,6 +188,10 @@ interface ThresholdsTableProps {
serviceWarnGapPercent: number;
serviceCriticalGapPercent: number;
};
dockerDisableConnectivity: () => boolean;
setDockerDisableConnectivity: (value: boolean) => void;
dockerPoweredOffSeverity: () => 'warning' | 'critical';
setDockerPoweredOffSeverity: (value: 'warning' | 'critical') => void;
setDockerDefaults: (
value:
| {
@ -1982,8 +1986,13 @@ export function ThresholdsTable(props: ThresholdsTableProps) {
const resource = [...guests, ...dockerContainers].find((r) => r.id === resourceId);
if (!resource) return;
const defaultDisabled = props.guestDisableConnectivity();
const defaultSeverity = props.guestPoweredOffSeverity();
const isDockerContainer = resource.type === 'dockerContainer';
const defaultDisabled = isDockerContainer
? props.dockerDisableConnectivity()
: props.guestDisableConnectivity();
const defaultSeverity = isDockerContainer
? props.dockerPoweredOffSeverity()
: props.guestPoweredOffSeverity();
const existingOverride = props.overrides().find((o) => o.id === resourceId);
const cleanThresholds: Record<string, number> = { ...(existingOverride?.thresholds || {}) };
@ -2407,7 +2416,7 @@ export function ThresholdsTable(props: ThresholdsTableProps) {
onSaveEdit={saveEdit}
onCancelEdit={cancelEdit}
onRemoveOverride={removeOverride}
showOfflineAlertsColumn={false}
showOfflineAlertsColumn={true}
editingId={editingId}
editingThresholds={editingThresholds}
setEditingThresholds={setEditingThresholds}
@ -2479,7 +2488,7 @@ export function ThresholdsTable(props: ThresholdsTableProps) {
onSaveEdit={saveEdit}
onCancelEdit={cancelEdit}
onRemoveOverride={removeOverride}
showOfflineAlertsColumn={false}
showOfflineAlertsColumn={true}
editingId={editingId}
editingThresholds={editingThresholds}
setEditingThresholds={setEditingThresholds}
@ -2894,15 +2903,24 @@ export function ThresholdsTable(props: ThresholdsTableProps) {
onToggleGlobalDisable={() =>
props.setDisableAllDockerContainers(!props.disableAllDockerContainers())
}
globalDisableOfflineFlag={() => props.guestDisableConnectivity()}
globalDisableOfflineFlag={() => props.dockerDisableConnectivity()}
onToggleGlobalDisableOffline={() =>
props.setGuestDisableConnectivity(!props.guestDisableConnectivity())
props.setDockerDisableConnectivity(!props.dockerDisableConnectivity())
}
showDelayColumn={true}
globalDelaySeconds={props.timeThresholds().guest}
metricDelaySeconds={props.metricTimeThresholds().guest ?? {}}
onMetricDelayChange={(metric, value) => updateMetricDelay('guest', metric, value)}
globalOfflineSeverity={props.guestPoweredOffSeverity()}
globalOfflineSeverity={props.dockerPoweredOffSeverity()}
onSetGlobalOfflineState={(state) => {
if (state === 'off') {
props.setDockerDisableConnectivity(true);
} else {
props.setDockerDisableConnectivity(false);
props.setDockerPoweredOffSeverity(state === 'critical' ? 'critical' : 'warning');
}
props.setHasUnsavedChanges(true);
}}
onSetOfflineState={setOfflineState}
factoryDefaults={props.factoryDockerDefaults}
onResetDefaults={props.resetDockerDefaults}

View file

@ -84,6 +84,10 @@ const baseProps = () => ({
hostDefaults: { cpu: 80, memory: 85, disk: 90 },
setHostDefaults: vi.fn(),
dockerDefaults: DEFAULT_DOCKER_DEFAULTS,
dockerDisableConnectivity: () => false,
setDockerDisableConnectivity: vi.fn(),
dockerPoweredOffSeverity: () => 'warning' as const,
setDockerPoweredOffSeverity: vi.fn(),
setDockerDefaults: vi.fn(),
storageDefault: () => 85,
setStorageDefault: vi.fn(),

View file

@ -1030,6 +1030,7 @@ export function Dashboard(props: DashboardProps) {
activeSearch={search()}
parentNodeOnline={parentNodeOnline}
onCustomUrlUpdate={handleCustomUrlUpdate}
isGroupedView={groupingMode() === 'grouped'}
/>
</ComponentErrorBoundary>
);

View file

@ -53,6 +53,7 @@ interface GuestRowProps {
activeSearch?: string;
parentNodeOnline?: boolean;
onCustomUrlUpdate?: (guestId: string, url: string) => void;
isGroupedView?: boolean;
}
export function GuestRow(props: GuestRowProps) {
@ -410,7 +411,9 @@ export function GuestRow(props: GuestRowProps) {
const firstCellClass = createMemo(() => {
const base =
'py-0.5 pr-2 whitespace-nowrap relative w-[160px] sm:w-[200px] lg:w-[240px] xl:w-[280px] 2xl:w-[340px]';
const indent = 'pl-4';
const indent = props.isGroupedView
? 'pl-5 sm:pl-6 lg:pl-8'
: 'pl-4';
return `${base} ${indent}`;
});

View file

@ -36,7 +36,6 @@ import Lock from 'lucide-solid/icons/lock';
import Key from 'lucide-solid/icons/key';
import Activity from 'lucide-solid/icons/activity';
import Loader from 'lucide-solid/icons/loader';
import Boxes from 'lucide-solid/icons/boxes';
import Network from 'lucide-solid/icons/network';
import Monitor from 'lucide-solid/icons/monitor';
import Sliders from 'lucide-solid/icons/sliders-horizontal';
@ -237,8 +236,6 @@ type SettingsTab =
| 'proxmox'
| 'docker'
| 'hosts'
| 'podman'
| 'kubernetes'
| 'system-general'
| 'system-network'
| 'system-updates'
@ -250,7 +247,7 @@ type SettingsTab =
| 'diagnostics'
| 'updates';
type AgentKey = 'pve' | 'pbs' | 'pmg' | 'docker' | 'host' | 'podman' | 'kubernetes';
type AgentKey = 'pve' | 'pbs' | 'pmg' | 'docker' | 'host';
const SETTINGS_HEADER_META: Record<SettingsTab, { title: string; description: string }> = {
proxmox: {
@ -265,14 +262,6 @@ const SETTINGS_HEADER_META: Record<SettingsTab, { title: string; description: st
title: 'Hosts',
description: 'Monitor Linux, macOS, and Windows machines—servers, desktops, and laptops.',
},
podman: {
title: 'Podman',
description: 'Container monitoring for Podman hosts is coming soon.',
},
kubernetes: {
title: 'Kubernetes',
description: 'Cluster-wide monitoring via Pulse is coming soon. Watch this space for Helm charts and operators.',
},
'system-general': {
title: 'General Settings',
description: 'Configure appearance preferences and UI behavior.',
@ -349,8 +338,6 @@ const Settings: Component<SettingsProps> = (props) => {
if (path.includes('/settings/agent-hub')) return 'proxmox';
if (path.includes('/settings/docker')) return 'docker';
if (path.includes('/settings/hosts') || path.includes('/settings/host-agents') || path.includes('/settings/servers')) return 'hosts';
if (path.includes('/settings/podman')) return 'podman';
if (path.includes('/settings/kubernetes')) return 'kubernetes';
if (path.includes('/settings/system-general')) return 'system-general';
if (path.includes('/settings/system-network')) return 'system-network';
if (path.includes('/settings/system-updates')) return 'system-updates';
@ -392,8 +379,6 @@ const Settings: Component<SettingsProps> = (props) => {
) {
return 'host';
}
if (path.includes('/settings/podman')) return 'podman';
if (path.includes('/settings/kubernetes')) return 'kubernetes';
return null;
};
@ -408,9 +393,7 @@ const Settings: Component<SettingsProps> = (props) => {
pmg: '/settings/pmg',
docker: '/settings/docker',
host: '/settings/host-agents',
podman: '/settings/podman',
kubernetes: '/settings/kubernetes',
} as Record<AgentKey, string>;
};
const handleSelectAgent = (agent: AgentKey) => {
setSelectedAgent(agent);
@ -424,22 +407,10 @@ const Settings: Component<SettingsProps> = (props) => {
};
const setActiveTab = (tab: SettingsTab) => {
if (tab === 'proxmox' && selectedAgent() === 'podman') {
setSelectedAgent('pve');
} else if (tab === 'proxmox' && selectedAgent() === 'kubernetes') {
setSelectedAgent('pve');
}
if (tab === 'proxmox' && !['pve', 'pbs', 'pmg', 'docker', 'host'].includes(selectedAgent())) {
setSelectedAgent('pve');
}
if (tab === 'podman') {
setSelectedAgent('podman');
} else if (tab === 'kubernetes') {
setSelectedAgent('kubernetes');
}
const targetPath = `/settings/${tab}`;
if (location.pathname !== targetPath) {
navigate(targetPath, { scroll: false });
@ -500,10 +471,6 @@ const Settings: Component<SettingsProps> = (props) => {
} else if (!['pve', 'pbs', 'pmg', 'docker', 'host'].includes(selectedAgent())) {
setSelectedAgent('pve');
}
} else if (resolved === 'podman') {
setSelectedAgent('podman');
} else if (resolved === 'kubernetes') {
setSelectedAgent('kubernetes');
}
},
),
@ -868,8 +835,6 @@ const Settings: Component<SettingsProps> = (props) => {
{ id: 'proxmox', label: 'Proxmox', icon: <ProxmoxIcon class="w-4 h-4" /> },
{ id: 'docker', label: 'Docker', icon: <DockerIcon class="w-4 h-4" /> },
{ id: 'hosts', label: 'Hosts', icon: <Monitor class="w-4 h-4" strokeWidth={2} /> },
{ id: 'podman', label: 'Podman', icon: <Boxes class="w-4 h-4" strokeWidth={2} />, disabled: true },
{ id: 'kubernetes', label: 'Kubernetes', icon: <Network class="w-4 h-4" strokeWidth={2} />, disabled: true },
],
},
{
@ -2954,22 +2919,6 @@ const Settings: Component<SettingsProps> = (props) => {
<HostAgents />
</Show>
{/* Podman Tab */}
<Show when={activeTab() === 'podman'}>
<PlatformComingSoon
name="Podman"
description="Pulse will support Podman hosts via the same lightweight agent workflow as Docker. Keep an eye on the release notes for availability."
/>
</Show>
{/* Kubernetes Tab */}
<Show when={activeTab() === 'kubernetes'}>
<PlatformComingSoon
name="Kubernetes"
description="Native Kubernetes monitoring (agents, Helm chart, RBAC templates) is in design. Join the GitHub discussions to weigh in."
/>
</Show>
{/* Host Agents */}
<Show when={activeTab() === 'proxmox' && selectedAgent() === 'host'}>
<div class="space-y-6 mt-6">
@ -6589,22 +6538,4 @@ const Settings: Component<SettingsProps> = (props) => {
);
};
/**
 * Placeholder panel for a platform integration that is not available yet.
 *
 * Renders a section header titled "<name> integration" followed by a card
 * inviting feedback. When `description` is omitted, a generic roadmap blurb
 * built from `name` is shown instead.
 */
const PlatformComingSoon: Component<{ name: string; description?: string }> = (props) => {
  // Accessor instead of a one-off constant: reading props inside a function
  // keeps the fallback text in sync if `name` or `description` change later.
  const description = () =>
    props.description ??
    `Support for ${props.name} is on the roadmap. Track progress on GitHub or join our community discussions to weigh in on requirements.`;

  return (
    <div class="space-y-6">
      <SectionHeader title={`${props.name} integration`} description={description()} />
      <Card padding="lg">
        <p class="text-sm text-gray-600 dark:text-gray-400">
          We're collecting feedback and prioritising the engineering work for this platform. If you'd like to influence
          the roadmap or volunteer for early testing, please open a discussion on GitHub or reach out via Discord.
        </p>
      </Card>
    </div>
  );
};
export default Settings;

View file

@ -4,7 +4,7 @@ import Server from 'lucide-solid/icons/server';
import HardDrive from 'lucide-solid/icons/hard-drive';
import Mail from 'lucide-solid/icons/mail';
type SettingsSection = 'pve' | 'pbs' | 'pmg' | 'docker' | 'host' | 'podman' | 'kubernetes';
type SettingsSection = 'pve' | 'pbs' | 'pmg' | 'docker' | 'host';
interface SettingsSectionNavProps {
current: SettingsSection;

View file

@ -849,8 +849,14 @@ const [appriseConfig, setAppriseConfig] = createSignal<UIAppriseConfig>(
serviceWarnGapPercent: serviceWarnGap,
serviceCriticalGapPercent: serviceCriticalGap,
});
setDockerDisableConnectivity(Boolean(config.dockerDefaults.stateDisableConnectivity));
setDockerPoweredOffSeverity(
config.dockerDefaults.statePoweredOffSeverity === 'critical' ? 'critical' : 'warning',
);
} else {
setDockerDefaults({ ...FACTORY_DOCKER_DEFAULTS });
setDockerDisableConnectivity(FACTORY_DOCKER_STATE_DISABLE_CONNECTIVITY);
setDockerPoweredOffSeverity(FACTORY_DOCKER_STATE_SEVERITY);
}
setDockerIgnoredPrefixes(config.dockerIgnoredContainerPrefixes ?? []);
@ -1206,6 +1212,8 @@ const [appriseConfig, setAppriseConfig] = createSignal<UIAppriseConfig>(
serviceWarnGapPercent: 10,
serviceCriticalGapPercent: 50,
};
const FACTORY_DOCKER_STATE_DISABLE_CONNECTIVITY = false;
const FACTORY_DOCKER_STATE_SEVERITY: 'warning' | 'critical' = 'warning';
const FACTORY_STORAGE_DEFAULT = 85;
const FACTORY_SNAPSHOT_DEFAULTS: SnapshotAlertConfig = {
@ -1228,6 +1236,12 @@ const [appriseConfig, setAppriseConfig] = createSignal<UIAppriseConfig>(
const [hostDefaults, setHostDefaults] = createSignal<Record<string, number | undefined>>({ ...FACTORY_HOST_DEFAULTS });
const [dockerDefaults, setDockerDefaults] = createSignal({ ...FACTORY_DOCKER_DEFAULTS });
const [dockerDisableConnectivity, setDockerDisableConnectivity] = createSignal(
FACTORY_DOCKER_STATE_DISABLE_CONNECTIVITY,
);
const [dockerPoweredOffSeverity, setDockerPoweredOffSeverity] = createSignal<'warning' | 'critical'>(
FACTORY_DOCKER_STATE_SEVERITY,
);
const [dockerIgnoredPrefixes, setDockerIgnoredPrefixes] = createSignal<string[]>([]);
const [storageDefault, setStorageDefault] = createSignal(FACTORY_STORAGE_DEFAULT);
@ -1253,6 +1267,8 @@ const [appriseConfig, setAppriseConfig] = createSignal<UIAppriseConfig>(
const resetDockerDefaults = () => {
setDockerDefaults({ ...FACTORY_DOCKER_DEFAULTS });
setDockerDisableConnectivity(FACTORY_DOCKER_STATE_DISABLE_CONNECTIVITY);
setDockerPoweredOffSeverity(FACTORY_DOCKER_STATE_SEVERITY);
setHasUnsavedChanges(true);
};
@ -1516,6 +1532,8 @@ const [appriseConfig, setAppriseConfig] = createSignal<UIAppriseConfig>(
memoryCriticalPct: dockerDefaultsValue.memoryCriticalPct,
serviceWarnGapPercent: dockerDefaultsValue.serviceWarnGapPercent,
serviceCriticalGapPercent: dockerDefaultsValue.serviceCriticalGapPercent,
stateDisableConnectivity: dockerDisableConnectivity(),
statePoweredOffSeverity: dockerPoweredOffSeverity(),
},
dockerIgnoredContainerPrefixes: dockerIgnoredPrefixes()
.map((prefix) => prefix.trim())
@ -1766,6 +1784,10 @@ const [appriseConfig, setAppriseConfig] = createSignal<UIAppriseConfig>(
hostDefaults={hostDefaults}
setHostDefaults={setHostDefaults}
dockerDefaults={dockerDefaults}
dockerDisableConnectivity={dockerDisableConnectivity}
setDockerDisableConnectivity={setDockerDisableConnectivity}
dockerPoweredOffSeverity={dockerPoweredOffSeverity}
setDockerPoweredOffSeverity={setDockerPoweredOffSeverity}
setDockerDefaults={setDockerDefaults}
dockerIgnoredPrefixes={dockerIgnoredPrefixes}
setDockerIgnoredPrefixes={setDockerIgnoredPrefixes}
@ -2294,6 +2316,8 @@ interface ThresholdsTabProps {
serviceWarnGapPercent: number;
serviceCriticalGapPercent: number;
};
dockerDisableConnectivity: () => boolean;
dockerPoweredOffSeverity: () => 'warning' | 'critical';
dockerIgnoredPrefixes: () => string[];
storageDefault: () => number;
timeThresholds: () => { guest: number; node: number; storage: number; pbs: number };
@ -2357,6 +2381,8 @@ interface ThresholdsTabProps {
serviceCriticalGapPercent: number;
}),
) => void;
setDockerDisableConnectivity: (value: boolean) => void;
setDockerPoweredOffSeverity: (value: 'warning' | 'critical') => void;
setDockerIgnoredPrefixes: (value: string[] | ((prev: string[]) => string[])) => void;
setStorageDefault: (value: number) => void;
setMetricTimeThresholds: (
@ -2463,7 +2489,11 @@ function ThresholdsTab(props: ThresholdsTabProps) {
setNodeDefaults={props.setNodeDefaults}
setHostDefaults={props.setHostDefaults}
dockerDefaults={props.dockerDefaults()}
dockerDisableConnectivity={props.dockerDisableConnectivity}
dockerPoweredOffSeverity={props.dockerPoweredOffSeverity}
setDockerDefaults={props.setDockerDefaults}
setDockerDisableConnectivity={props.setDockerDisableConnectivity}
setDockerPoweredOffSeverity={props.setDockerPoweredOffSeverity}
dockerIgnoredPrefixes={props.dockerIgnoredPrefixes}
setDockerIgnoredPrefixes={props.setDockerIgnoredPrefixes}
storageDefault={props.storageDefault}

View file

@ -66,6 +66,8 @@ export interface DockerThresholdConfig {
memoryCriticalPct?: number;
serviceWarnGapPercent?: number;
serviceCriticalGapPercent?: number;
stateDisableConnectivity?: boolean;
statePoweredOffSeverity?: 'warning' | 'critical';
}
export interface PMGThresholdDefaults {

View file

@ -264,14 +264,16 @@ type CustomAlertRule struct {
// DockerThresholdConfig represents Docker-specific alert thresholds
type DockerThresholdConfig struct {
CPU HysteresisThreshold `json:"cpu"` // CPU usage % threshold (default: 80%)
Memory HysteresisThreshold `json:"memory"` // Memory usage % threshold (default: 85%)
RestartCount int `json:"restartCount"` // Number of restarts to trigger alert (default: 3)
RestartWindow int `json:"restartWindow"` // Time window in seconds for restart loop detection (default: 300 = 5min)
MemoryWarnPct int `json:"memoryWarnPct"` // Memory limit % to trigger warning (default: 90)
MemoryCriticalPct int `json:"memoryCriticalPct"` // Memory limit % to trigger critical (default: 95)
ServiceWarnGapPct int `json:"serviceWarnGapPercent"` // % of desired tasks missing to trigger warning (default: 10)
ServiceCritGapPct int `json:"serviceCriticalGapPercent"` // % of desired tasks missing to trigger critical (default: 50)
CPU HysteresisThreshold `json:"cpu"` // CPU usage % threshold (default: 80%)
Memory HysteresisThreshold `json:"memory"` // Memory usage % threshold (default: 85%)
RestartCount int `json:"restartCount"` // Number of restarts to trigger alert (default: 3)
RestartWindow int `json:"restartWindow"` // Time window in seconds for restart loop detection (default: 300 = 5min)
MemoryWarnPct int `json:"memoryWarnPct"` // Memory limit % to trigger warning (default: 90)
MemoryCriticalPct int `json:"memoryCriticalPct"` // Memory limit % to trigger critical (default: 95)
ServiceWarnGapPct int `json:"serviceWarnGapPercent"` // % of desired tasks missing to trigger warning (default: 10)
ServiceCritGapPct int `json:"serviceCriticalGapPercent"` // % of desired tasks missing to trigger critical (default: 50)
StateDisableConnectivity bool `json:"stateDisableConnectivity,omitempty"` // Disable container offline/state alerts globally
StatePoweredOffSeverity AlertLevel `json:"statePoweredOffSeverity,omitempty"` // Default severity for container state/offline alerts
}
// PMGThresholdConfig represents Proxmox Mail Gateway-specific alert thresholds
@ -500,12 +502,13 @@ func NewManager() *Manager {
Disk: &HysteresisThreshold{Trigger: 90, Clear: 85},
},
DockerDefaults: DockerThresholdConfig{
CPU: HysteresisThreshold{Trigger: 80, Clear: 75},
Memory: HysteresisThreshold{Trigger: 85, Clear: 80},
RestartCount: 3,
RestartWindow: 300, // 5 minutes
MemoryWarnPct: 90,
MemoryCriticalPct: 95,
CPU: HysteresisThreshold{Trigger: 80, Clear: 75},
Memory: HysteresisThreshold{Trigger: 85, Clear: 80},
RestartCount: 3,
RestartWindow: 300, // 5 minutes
MemoryWarnPct: 90,
MemoryCriticalPct: 95,
StatePoweredOffSeverity: AlertLevelWarning,
},
PMGDefaults: PMGThresholdConfig{
QueueTotalWarning: 500, // Warning at 500 total queued messages
@ -830,6 +833,10 @@ func (m *Manager) UpdateConfig(config AlertConfig) {
Msg("Adjusting Docker service critical gap to match warning gap")
config.DockerDefaults.ServiceCritGapPct = config.DockerDefaults.ServiceWarnGapPct
}
if config.DockerDefaults.StatePoweredOffSeverity == "" {
config.DockerDefaults.StatePoweredOffSeverity = AlertLevelWarning
}
config.DockerDefaults.StatePoweredOffSeverity = normalizePoweredOffSeverity(config.DockerDefaults.StatePoweredOffSeverity)
// Initialize PMG defaults if missing/zero
if config.PMGDefaults.QueueTotalWarning <= 0 {
@ -3117,14 +3124,25 @@ func (m *Manager) checkDockerContainerState(host models.DockerHost, container mo
stateKey := resourceID
m.mu.RLock()
thresholds, exists := m.config.Overrides[resourceID]
if !exists {
thresholds = m.config.GuestDefaults
}
disableConnectivity := thresholds.DisableConnectivity
severity := normalizePoweredOffSeverity(thresholds.PoweredOffSeverity)
override, hasOverride := m.config.Overrides[resourceID]
defaultDisable := m.config.DockerDefaults.StateDisableConnectivity
defaultSeverity := normalizePoweredOffSeverity(m.config.DockerDefaults.StatePoweredOffSeverity)
m.mu.RUnlock()
disableConnectivity := defaultDisable
severity := defaultSeverity
if hasOverride {
if defaultDisable && !override.DisableConnectivity {
disableConnectivity = false
} else if override.DisableConnectivity {
disableConnectivity = true
}
if override.PoweredOffSeverity != "" {
severity = normalizePoweredOffSeverity(override.PoweredOffSeverity)
}
}
if disableConnectivity {
m.clearDockerContainerStateAlert(resourceID)
return

View file

@ -990,6 +990,162 @@ func TestDockerServiceReplicaAlerts(t *testing.T) {
}
}
// TestDockerServiceUpdateStateAlert checks that a Swarm service whose update
// status reports "rollback_failed" produces a critical docker-service-health
// alert carrying the update state in its metadata.
func TestDockerServiceUpdateStateAlert(t *testing.T) {
	mgr := NewManager()
	alertCfg := mgr.GetConfig()
	alertCfg.Enabled = true
	mgr.UpdateConfig(alertCfg)

	completedAt := time.Now()
	failedRollback := &models.DockerServiceUpdate{
		State:       "rollback_failed",
		Message:     "Rollback failed",
		CompletedAt: &completedAt,
	}
	service := models.DockerService{
		ID:           "svc-update",
		Name:         "api",
		DesiredTasks: 1,
		RunningTasks: 1,
		UpdateStatus: failedRollback,
	}
	swarmHost := models.DockerHost{
		ID:          "host-update",
		DisplayName: "Swarm",
		Hostname:    "swarm.local",
		Services:    []models.DockerService{service},
	}

	mgr.CheckDockerHost(swarmHost)

	alertID := fmt.Sprintf(
		"docker-service-health-%s",
		dockerServiceResourceID(swarmHost.ID, "svc-update", "api"),
	)

	raised, found := mgr.activeAlerts[alertID]
	if !found {
		t.Fatalf("expected docker service alert %s to be raised", alertID)
	}

	if raised.Level != AlertLevelCritical {
		t.Fatalf("expected critical severity for rollback failure, got %s", raised.Level)
	}

	// The metadata value must be a string and must match the reported state.
	gotState, isString := raised.Metadata["updateState"].(string)
	if !isString || gotState != "rollback_failed" {
		t.Fatalf("expected updateState metadata to be rollback_failed, got %v", raised.Metadata["updateState"])
	}
}
// TestDockerContainerStateUsesDockerDefaults checks that an exited container
// with no override is alerted at the severity configured in
// DockerDefaults.StatePoweredOffSeverity.
func TestDockerContainerStateUsesDockerDefaults(t *testing.T) {
	mgr := NewManager()
	alertCfg := mgr.GetConfig()
	alertCfg.DockerDefaults.StatePoweredOffSeverity = AlertLevelCritical
	mgr.UpdateConfig(alertCfg)

	exited := models.DockerContainer{
		ID:     "container-1",
		Name:   "web",
		State:  "exited",
		Status: "Exited (1) seconds ago",
	}
	dockerHost := models.DockerHost{
		ID:          "host-1",
		DisplayName: "Docker Host",
		Hostname:    "docker.local",
		Containers:  []models.DockerContainer{exited},
	}

	// Two identical evaluations of the same host snapshot.
	// NOTE(review): presumably the state alert needs a confirming second
	// observation before it fires — verify against checkDockerContainerState.
	for pass := 0; pass < 2; pass++ {
		mgr.CheckDockerHost(dockerHost)
	}

	alertID := fmt.Sprintf("docker-container-state-%s", dockerResourceID(dockerHost.ID, exited.ID))

	raised, found := mgr.activeAlerts[alertID]
	if !found {
		t.Fatalf("expected docker container state alert %s to be raised", alertID)
	}
	if raised.Level != AlertLevelCritical {
		t.Fatalf("expected critical severity from docker defaults, got %s", raised.Level)
	}
}
// TestDockerContainerStateRespectsDisableDefault checks that no container
// state alert is raised for an exited container when
// DockerDefaults.StateDisableConnectivity is enabled.
func TestDockerContainerStateRespectsDisableDefault(t *testing.T) {
	mgr := NewManager()
	alertCfg := mgr.GetConfig()
	alertCfg.DockerDefaults.StateDisableConnectivity = true
	mgr.UpdateConfig(alertCfg)

	exited := models.DockerContainer{
		ID:     "container-2",
		Name:   "batch",
		State:  "exited",
		Status: "Exited (0) seconds ago",
	}
	dockerHost := models.DockerHost{
		ID:          "host-2",
		DisplayName: "Docker Host",
		Hostname:    "docker.example",
		Containers:  []models.DockerContainer{exited},
	}

	// Evaluate the same snapshot twice, matching the sibling state tests.
	for pass := 0; pass < 2; pass++ {
		mgr.CheckDockerHost(dockerHost)
	}

	alertID := fmt.Sprintf("docker-container-state-%s", dockerResourceID(dockerHost.ID, exited.ID))

	_, found := mgr.activeAlerts[alertID]
	if found {
		t.Fatalf("did not expect docker container state alert when defaults disable connectivity")
	}
}
// TestDockerContainerMemoryLimitHysteresis checks that a memory-limit alert is
// raised when a running container uses 96 MiB of a 100 MiB limit, and that the
// alert is gone after a later check reports 80 MiB of the same limit.
func TestDockerContainerMemoryLimitHysteresis(t *testing.T) {
	// Untyped constant so it adapts to whatever numeric type the fields use.
	const mib = 1024 * 1024

	mgr := NewManager()

	hostID := "host-mem"
	containerID := "container-mem"

	overLimit := models.DockerContainer{
		ID:          containerID,
		Name:        "memory-hog",
		State:       "running",
		Status:      "Up 10 minutes",
		MemoryUsage: 96 * mib,
		MemoryLimit: 100 * mib,
	}
	hostHigh := models.DockerHost{
		ID:          hostID,
		DisplayName: "Docker Host",
		Hostname:    "docker.mem",
		Containers:  []models.DockerContainer{overLimit},
	}

	mgr.CheckDockerHost(hostHigh)

	alertID := fmt.Sprintf("docker-container-memory-limit-%s", dockerResourceID(hostID, containerID))
	if _, raised := mgr.activeAlerts[alertID]; !raised {
		t.Fatalf("expected memory limit alert to be raised")
	}

	// Same container a little later, with usage dropped to 80% of the limit.
	recovered := models.DockerContainer{
		ID:          containerID,
		Name:        "memory-hog",
		State:       "running",
		Status:      "Up 12 minutes",
		MemoryUsage: 80 * mib,
		MemoryLimit: 100 * mib,
	}
	hostLow := models.DockerHost{
		ID:          hostID,
		DisplayName: "Docker Host",
		Hostname:    "docker.mem",
		Containers:  []models.DockerContainer{recovered},
	}

	mgr.CheckDockerHost(hostLow)

	if _, stillActive := mgr.activeAlerts[alertID]; stillActive {
		t.Fatalf("expected memory limit alert to clear after usage dropped below hysteresis threshold")
	}
}
func TestUpdateConfigClampsDockerServiceCriticalGap(t *testing.T) {
t.Parallel()

View file

@ -159,3 +159,4 @@ func (h *DockerMetadataHandler) HandleDeleteMetadata(w http.ResponseWriter, r *h
w.WriteHeader(http.StatusNoContent)
}
// test comment

View file

@ -162,8 +162,9 @@ func (cw *ConfigWatcher) watchForChanges() {
}
if cw.apiTokensPath != "" && (filepath.Base(event.Name) == filepath.Base(cw.apiTokensPath) || event.Name == cw.apiTokensPath) {
// Debounce
time.Sleep(100 * time.Millisecond)
// Debounce - wait longer for atomic file operations to complete
// (write to .tmp, rename to final file)
time.Sleep(250 * time.Millisecond)
if event.Op&(fsnotify.Write|fsnotify.Create) != 0 {
log.Info().Str("event", event.Op.String()).Msg("Detected API token file change")
@ -399,12 +400,44 @@ func (cw *ConfigWatcher) reloadAPITokens() {
return
}
tokens, err := globalPersistence.LoadAPITokens()
// Preserve existing tokens in case reload fails
existingTokens := cw.config.APITokens
existingCount := len(existingTokens)
// Retry logic to handle temporary file system issues
var tokens []APITokenRecord
var err error
maxRetries := 3
retryDelay := 50 * time.Millisecond
for attempt := 1; attempt <= maxRetries; attempt++ {
tokens, err = globalPersistence.LoadAPITokens()
if err == nil {
break
}
if attempt < maxRetries {
log.Warn().
Err(err).
Int("attempt", attempt).
Int("maxRetries", maxRetries).
Dur("retryDelay", retryDelay).
Msg("Failed to reload API tokens, retrying...")
time.Sleep(retryDelay)
retryDelay *= 2 // Exponential backoff
}
}
if err != nil {
log.Error().Err(err).Msg("Failed to reload API tokens")
log.Error().
Err(err).
Int("existingTokens", existingCount).
Msg("Failed to reload API tokens after retries - preserving existing tokens")
// CRITICAL: Keep existing tokens rather than clearing them
return
}
// Only update if we successfully loaded tokens
cw.config.APITokens = tokens
cw.config.SortAPITokens()
cw.config.APITokenEnabled = len(tokens) > 0
@ -415,7 +448,10 @@ func (cw *ConfigWatcher) reloadAPITokens() {
}
}
log.Info().Int("count", len(tokens)).Msg("Reloaded API tokens from disk")
log.Info().
Int("count", len(tokens)).
Int("previousCount", existingCount).
Msg("Reloaded API tokens from disk")
}
// reloadMockConfig handles mock.env file changes