mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-04-28 03:20:11 +00:00
Consolidate pending changes
- Add Docker metadata test comment
- Update alerts configuration and thresholds
- Enhance config file watcher
- Update documentation
- Refine settings UI
This commit is contained in:
parent
bd4f12c98f
commit
b3285c05c8
14 changed files with 311 additions and 106 deletions
|
|
@ -228,7 +228,9 @@ PROXY_AUTH_LOGOUT_URL=/logout # URL for SSO logout
|
|||
"memoryWarnPct": 90,
|
||||
"memoryCriticalPct": 95,
|
||||
"serviceWarnGapPercent": 10,
|
||||
"serviceCriticalGapPercent": 50
|
||||
"serviceCriticalGapPercent": 50,
|
||||
"stateDisableConnectivity": false,
|
||||
"statePoweredOffSeverity": "warning"
|
||||
},
|
||||
"dockerIgnoredContainerPrefixes": [
|
||||
"runner-",
|
||||
|
|
@ -285,6 +287,7 @@ PROXY_AUTH_LOGOUT_URL=/logout # URL for SSO logout
|
|||
- `overrides` are indexed by the stable resource ID returned from `/api/state` (VMs: `instance/qemu/vmid`, containers: `instance/lxc/ctid`, nodes: `instance/node`).
|
||||
- `dockerIgnoredContainerPrefixes` lets you silence state/metric/restart alerts for ephemeral containers whose names or IDs share a common, case-insensitive prefix. The Docker tab in the UI keeps this list in sync.
|
||||
- Swarm service alerts track missing replicas: `serviceWarnGapPercent` is the percentage of desired tasks that must be missing before a warning fires, and `serviceCriticalGapPercent` must be greater than or equal to the warning gap (if an older client submits a smaller critical value, Pulse automatically raises it to match the warning gap).
|
||||
- Docker container state controls live in `dockerDefaults`: set `stateDisableConnectivity` to `true` to silence exited/offline container alerts globally, or set `statePoweredOffSeverity` (`warning` by default) to `critical` when you want exited containers to page immediately. Per-container overrides still take precedence.
|
||||
- Quiet hours, escalation, deduplication, and restart loop detection are all managed here, and the UI keeps the JSON in sync automatically.
|
||||
|
||||
> Tip: Back up `alerts.json` alongside `.env` during exports. Restoring it preserves all overrides, quiet-hour schedules, and webhook routing.
|
||||
|
|
|
|||
|
|
@ -167,6 +167,8 @@ The agent automatically discovers the Docker socket via the usual environment va
|
|||
|
||||
CI runners and short-lived build containers can generate noisy state alerts when they exit on schedule. In Pulse v4.24.0 and later you can provide a list of prefixes to ignore under **Alerts → Thresholds → Docker → Ignored container prefixes**. Any container whose name *or* ID begins with a configured prefix is skipped for state, health, metric, restart-loop, and OOM alerts. Matching is case-insensitive and the list is saved as `dockerIgnoredContainerPrefixes` inside `alerts.json`. Use one entry per family of ephemeral containers (for example, `runner-` or `gitlab-job-`).
|
||||
|
||||
Want to keep the alerts but change how loudly they fire? The same Docker tab exposes global controls for the container state detector. Turn on **Disable container state alerts** (`stateDisableConnectivity`) to mute powered-off/offline alerts across the fleet, or set **Default severity** (`statePoweredOffSeverity`) to `critical` so unexpected exits page immediately. Individual host/container overrides still take precedence when you need exceptions.
|
||||
|
||||
## Testing and troubleshooting
|
||||
|
||||
- Run with `--interval 15s --insecure` in a terminal to see log output while testing.
|
||||
|
|
|
|||
|
|
@ -188,6 +188,10 @@ interface ThresholdsTableProps {
|
|||
serviceWarnGapPercent: number;
|
||||
serviceCriticalGapPercent: number;
|
||||
};
|
||||
dockerDisableConnectivity: () => boolean;
|
||||
setDockerDisableConnectivity: (value: boolean) => void;
|
||||
dockerPoweredOffSeverity: () => 'warning' | 'critical';
|
||||
setDockerPoweredOffSeverity: (value: 'warning' | 'critical') => void;
|
||||
setDockerDefaults: (
|
||||
value:
|
||||
| {
|
||||
|
|
@ -1982,8 +1986,13 @@ export function ThresholdsTable(props: ThresholdsTableProps) {
|
|||
const resource = [...guests, ...dockerContainers].find((r) => r.id === resourceId);
|
||||
if (!resource) return;
|
||||
|
||||
const defaultDisabled = props.guestDisableConnectivity();
|
||||
const defaultSeverity = props.guestPoweredOffSeverity();
|
||||
const isDockerContainer = resource.type === 'dockerContainer';
|
||||
const defaultDisabled = isDockerContainer
|
||||
? props.dockerDisableConnectivity()
|
||||
: props.guestDisableConnectivity();
|
||||
const defaultSeverity = isDockerContainer
|
||||
? props.dockerPoweredOffSeverity()
|
||||
: props.guestPoweredOffSeverity();
|
||||
|
||||
const existingOverride = props.overrides().find((o) => o.id === resourceId);
|
||||
const cleanThresholds: Record<string, number> = { ...(existingOverride?.thresholds || {}) };
|
||||
|
|
@ -2407,7 +2416,7 @@ export function ThresholdsTable(props: ThresholdsTableProps) {
|
|||
onSaveEdit={saveEdit}
|
||||
onCancelEdit={cancelEdit}
|
||||
onRemoveOverride={removeOverride}
|
||||
showOfflineAlertsColumn={false}
|
||||
showOfflineAlertsColumn={true}
|
||||
editingId={editingId}
|
||||
editingThresholds={editingThresholds}
|
||||
setEditingThresholds={setEditingThresholds}
|
||||
|
|
@ -2479,7 +2488,7 @@ export function ThresholdsTable(props: ThresholdsTableProps) {
|
|||
onSaveEdit={saveEdit}
|
||||
onCancelEdit={cancelEdit}
|
||||
onRemoveOverride={removeOverride}
|
||||
showOfflineAlertsColumn={false}
|
||||
showOfflineAlertsColumn={true}
|
||||
editingId={editingId}
|
||||
editingThresholds={editingThresholds}
|
||||
setEditingThresholds={setEditingThresholds}
|
||||
|
|
@ -2894,15 +2903,24 @@ export function ThresholdsTable(props: ThresholdsTableProps) {
|
|||
onToggleGlobalDisable={() =>
|
||||
props.setDisableAllDockerContainers(!props.disableAllDockerContainers())
|
||||
}
|
||||
globalDisableOfflineFlag={() => props.guestDisableConnectivity()}
|
||||
globalDisableOfflineFlag={() => props.dockerDisableConnectivity()}
|
||||
onToggleGlobalDisableOffline={() =>
|
||||
props.setGuestDisableConnectivity(!props.guestDisableConnectivity())
|
||||
props.setDockerDisableConnectivity(!props.dockerDisableConnectivity())
|
||||
}
|
||||
showDelayColumn={true}
|
||||
globalDelaySeconds={props.timeThresholds().guest}
|
||||
metricDelaySeconds={props.metricTimeThresholds().guest ?? {}}
|
||||
onMetricDelayChange={(metric, value) => updateMetricDelay('guest', metric, value)}
|
||||
globalOfflineSeverity={props.guestPoweredOffSeverity()}
|
||||
globalOfflineSeverity={props.dockerPoweredOffSeverity()}
|
||||
onSetGlobalOfflineState={(state) => {
|
||||
if (state === 'off') {
|
||||
props.setDockerDisableConnectivity(true);
|
||||
} else {
|
||||
props.setDockerDisableConnectivity(false);
|
||||
props.setDockerPoweredOffSeverity(state === 'critical' ? 'critical' : 'warning');
|
||||
}
|
||||
props.setHasUnsavedChanges(true);
|
||||
}}
|
||||
onSetOfflineState={setOfflineState}
|
||||
factoryDefaults={props.factoryDockerDefaults}
|
||||
onResetDefaults={props.resetDockerDefaults}
|
||||
|
|
|
|||
|
|
@ -84,6 +84,10 @@ const baseProps = () => ({
|
|||
hostDefaults: { cpu: 80, memory: 85, disk: 90 },
|
||||
setHostDefaults: vi.fn(),
|
||||
dockerDefaults: DEFAULT_DOCKER_DEFAULTS,
|
||||
dockerDisableConnectivity: () => false,
|
||||
setDockerDisableConnectivity: vi.fn(),
|
||||
dockerPoweredOffSeverity: () => 'warning' as const,
|
||||
setDockerPoweredOffSeverity: vi.fn(),
|
||||
setDockerDefaults: vi.fn(),
|
||||
storageDefault: () => 85,
|
||||
setStorageDefault: vi.fn(),
|
||||
|
|
|
|||
|
|
@ -1030,6 +1030,7 @@ export function Dashboard(props: DashboardProps) {
|
|||
activeSearch={search()}
|
||||
parentNodeOnline={parentNodeOnline}
|
||||
onCustomUrlUpdate={handleCustomUrlUpdate}
|
||||
isGroupedView={groupingMode() === 'grouped'}
|
||||
/>
|
||||
</ComponentErrorBoundary>
|
||||
);
|
||||
|
|
|
|||
|
|
@ -53,6 +53,7 @@ interface GuestRowProps {
|
|||
activeSearch?: string;
|
||||
parentNodeOnline?: boolean;
|
||||
onCustomUrlUpdate?: (guestId: string, url: string) => void;
|
||||
isGroupedView?: boolean;
|
||||
}
|
||||
|
||||
export function GuestRow(props: GuestRowProps) {
|
||||
|
|
@ -410,7 +411,9 @@ export function GuestRow(props: GuestRowProps) {
|
|||
const firstCellClass = createMemo(() => {
|
||||
const base =
|
||||
'py-0.5 pr-2 whitespace-nowrap relative w-[160px] sm:w-[200px] lg:w-[240px] xl:w-[280px] 2xl:w-[340px]';
|
||||
const indent = 'pl-4';
|
||||
const indent = props.isGroupedView
|
||||
? 'pl-5 sm:pl-6 lg:pl-8'
|
||||
: 'pl-4';
|
||||
return `${base} ${indent}`;
|
||||
});
|
||||
|
||||
|
|
|
|||
|
|
@ -36,7 +36,6 @@ import Lock from 'lucide-solid/icons/lock';
|
|||
import Key from 'lucide-solid/icons/key';
|
||||
import Activity from 'lucide-solid/icons/activity';
|
||||
import Loader from 'lucide-solid/icons/loader';
|
||||
import Boxes from 'lucide-solid/icons/boxes';
|
||||
import Network from 'lucide-solid/icons/network';
|
||||
import Monitor from 'lucide-solid/icons/monitor';
|
||||
import Sliders from 'lucide-solid/icons/sliders-horizontal';
|
||||
|
|
@ -237,8 +236,6 @@ type SettingsTab =
|
|||
| 'proxmox'
|
||||
| 'docker'
|
||||
| 'hosts'
|
||||
| 'podman'
|
||||
| 'kubernetes'
|
||||
| 'system-general'
|
||||
| 'system-network'
|
||||
| 'system-updates'
|
||||
|
|
@ -250,7 +247,7 @@ type SettingsTab =
|
|||
| 'diagnostics'
|
||||
| 'updates';
|
||||
|
||||
type AgentKey = 'pve' | 'pbs' | 'pmg' | 'docker' | 'host' | 'podman' | 'kubernetes';
|
||||
type AgentKey = 'pve' | 'pbs' | 'pmg' | 'docker' | 'host';
|
||||
|
||||
const SETTINGS_HEADER_META: Record<SettingsTab, { title: string; description: string }> = {
|
||||
proxmox: {
|
||||
|
|
@ -265,14 +262,6 @@ const SETTINGS_HEADER_META: Record<SettingsTab, { title: string; description: st
|
|||
title: 'Hosts',
|
||||
description: 'Monitor Linux, macOS, and Windows machines—servers, desktops, and laptops.',
|
||||
},
|
||||
podman: {
|
||||
title: 'Podman',
|
||||
description: 'Container monitoring for Podman hosts is coming soon.',
|
||||
},
|
||||
kubernetes: {
|
||||
title: 'Kubernetes',
|
||||
description: 'Cluster-wide monitoring via Pulse is coming soon. Watch this space for Helm charts and operators.',
|
||||
},
|
||||
'system-general': {
|
||||
title: 'General Settings',
|
||||
description: 'Configure appearance preferences and UI behavior.',
|
||||
|
|
@ -349,8 +338,6 @@ const Settings: Component<SettingsProps> = (props) => {
|
|||
if (path.includes('/settings/agent-hub')) return 'proxmox';
|
||||
if (path.includes('/settings/docker')) return 'docker';
|
||||
if (path.includes('/settings/hosts') || path.includes('/settings/host-agents') || path.includes('/settings/servers')) return 'hosts';
|
||||
if (path.includes('/settings/podman')) return 'podman';
|
||||
if (path.includes('/settings/kubernetes')) return 'kubernetes';
|
||||
if (path.includes('/settings/system-general')) return 'system-general';
|
||||
if (path.includes('/settings/system-network')) return 'system-network';
|
||||
if (path.includes('/settings/system-updates')) return 'system-updates';
|
||||
|
|
@ -392,8 +379,6 @@ const Settings: Component<SettingsProps> = (props) => {
|
|||
) {
|
||||
return 'host';
|
||||
}
|
||||
if (path.includes('/settings/podman')) return 'podman';
|
||||
if (path.includes('/settings/kubernetes')) return 'kubernetes';
|
||||
return null;
|
||||
};
|
||||
|
||||
|
|
@ -408,9 +393,7 @@ const Settings: Component<SettingsProps> = (props) => {
|
|||
pmg: '/settings/pmg',
|
||||
docker: '/settings/docker',
|
||||
host: '/settings/host-agents',
|
||||
podman: '/settings/podman',
|
||||
kubernetes: '/settings/kubernetes',
|
||||
} as Record<AgentKey, string>;
|
||||
};
|
||||
|
||||
const handleSelectAgent = (agent: AgentKey) => {
|
||||
setSelectedAgent(agent);
|
||||
|
|
@ -424,22 +407,10 @@ const Settings: Component<SettingsProps> = (props) => {
|
|||
};
|
||||
|
||||
const setActiveTab = (tab: SettingsTab) => {
|
||||
if (tab === 'proxmox' && selectedAgent() === 'podman') {
|
||||
setSelectedAgent('pve');
|
||||
} else if (tab === 'proxmox' && selectedAgent() === 'kubernetes') {
|
||||
setSelectedAgent('pve');
|
||||
}
|
||||
|
||||
if (tab === 'proxmox' && !['pve', 'pbs', 'pmg', 'docker', 'host'].includes(selectedAgent())) {
|
||||
setSelectedAgent('pve');
|
||||
}
|
||||
|
||||
if (tab === 'podman') {
|
||||
setSelectedAgent('podman');
|
||||
} else if (tab === 'kubernetes') {
|
||||
setSelectedAgent('kubernetes');
|
||||
}
|
||||
|
||||
const targetPath = `/settings/${tab}`;
|
||||
if (location.pathname !== targetPath) {
|
||||
navigate(targetPath, { scroll: false });
|
||||
|
|
@ -500,10 +471,6 @@ const Settings: Component<SettingsProps> = (props) => {
|
|||
} else if (!['pve', 'pbs', 'pmg', 'docker', 'host'].includes(selectedAgent())) {
|
||||
setSelectedAgent('pve');
|
||||
}
|
||||
} else if (resolved === 'podman') {
|
||||
setSelectedAgent('podman');
|
||||
} else if (resolved === 'kubernetes') {
|
||||
setSelectedAgent('kubernetes');
|
||||
}
|
||||
},
|
||||
),
|
||||
|
|
@ -868,8 +835,6 @@ const Settings: Component<SettingsProps> = (props) => {
|
|||
{ id: 'proxmox', label: 'Proxmox', icon: <ProxmoxIcon class="w-4 h-4" /> },
|
||||
{ id: 'docker', label: 'Docker', icon: <DockerIcon class="w-4 h-4" /> },
|
||||
{ id: 'hosts', label: 'Hosts', icon: <Monitor class="w-4 h-4" strokeWidth={2} /> },
|
||||
{ id: 'podman', label: 'Podman', icon: <Boxes class="w-4 h-4" strokeWidth={2} />, disabled: true },
|
||||
{ id: 'kubernetes', label: 'Kubernetes', icon: <Network class="w-4 h-4" strokeWidth={2} />, disabled: true },
|
||||
],
|
||||
},
|
||||
{
|
||||
|
|
@ -2954,22 +2919,6 @@ const Settings: Component<SettingsProps> = (props) => {
|
|||
<HostAgents />
|
||||
</Show>
|
||||
|
||||
{/* Podman Tab */}
|
||||
<Show when={activeTab() === 'podman'}>
|
||||
<PlatformComingSoon
|
||||
name="Podman"
|
||||
description="Pulse will support Podman hosts via the same lightweight agent workflow as Docker. Keep an eye on the release notes for availability."
|
||||
/>
|
||||
</Show>
|
||||
|
||||
{/* Kubernetes Tab */}
|
||||
<Show when={activeTab() === 'kubernetes'}>
|
||||
<PlatformComingSoon
|
||||
name="Kubernetes"
|
||||
description="Native Kubernetes monitoring (agents, Helm chart, RBAC templates) is in design. Join the GitHub discussions to weigh in."
|
||||
/>
|
||||
</Show>
|
||||
|
||||
{/* Host Agents */}
|
||||
<Show when={activeTab() === 'proxmox' && selectedAgent() === 'host'}>
|
||||
<div class="space-y-6 mt-6">
|
||||
|
|
@ -6589,22 +6538,4 @@ const Settings: Component<SettingsProps> = (props) => {
|
|||
);
|
||||
};
|
||||
|
||||
const PlatformComingSoon: Component<{ name: string; description?: string }> = (props) => {
|
||||
const description =
|
||||
props.description ??
|
||||
`Support for ${props.name} is on the roadmap. Track progress on GitHub or join our community discussions to weigh in on requirements.`;
|
||||
|
||||
return (
|
||||
<div class="space-y-6">
|
||||
<SectionHeader title={`${props.name} integration`} description={description} />
|
||||
<Card padding="lg">
|
||||
<p class="text-sm text-gray-600 dark:text-gray-400">
|
||||
We’re collecting feedback and prioritising the engineering work for this platform. If you’d like to influence
|
||||
the roadmap or volunteer for early testing, please open a discussion on GitHub or reach out via Discord.
|
||||
</p>
|
||||
</Card>
|
||||
</div>
|
||||
);
|
||||
};
|
||||
|
||||
export default Settings;
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ import Server from 'lucide-solid/icons/server';
|
|||
import HardDrive from 'lucide-solid/icons/hard-drive';
|
||||
import Mail from 'lucide-solid/icons/mail';
|
||||
|
||||
type SettingsSection = 'pve' | 'pbs' | 'pmg' | 'docker' | 'host' | 'podman' | 'kubernetes';
|
||||
type SettingsSection = 'pve' | 'pbs' | 'pmg' | 'docker' | 'host';
|
||||
|
||||
interface SettingsSectionNavProps {
|
||||
current: SettingsSection;
|
||||
|
|
|
|||
|
|
@ -849,8 +849,14 @@ const [appriseConfig, setAppriseConfig] = createSignal<UIAppriseConfig>(
|
|||
serviceWarnGapPercent: serviceWarnGap,
|
||||
serviceCriticalGapPercent: serviceCriticalGap,
|
||||
});
|
||||
setDockerDisableConnectivity(Boolean(config.dockerDefaults.stateDisableConnectivity));
|
||||
setDockerPoweredOffSeverity(
|
||||
config.dockerDefaults.statePoweredOffSeverity === 'critical' ? 'critical' : 'warning',
|
||||
);
|
||||
} else {
|
||||
setDockerDefaults({ ...FACTORY_DOCKER_DEFAULTS });
|
||||
setDockerDisableConnectivity(FACTORY_DOCKER_STATE_DISABLE_CONNECTIVITY);
|
||||
setDockerPoweredOffSeverity(FACTORY_DOCKER_STATE_SEVERITY);
|
||||
}
|
||||
setDockerIgnoredPrefixes(config.dockerIgnoredContainerPrefixes ?? []);
|
||||
|
||||
|
|
@ -1206,6 +1212,8 @@ const [appriseConfig, setAppriseConfig] = createSignal<UIAppriseConfig>(
|
|||
serviceWarnGapPercent: 10,
|
||||
serviceCriticalGapPercent: 50,
|
||||
};
|
||||
const FACTORY_DOCKER_STATE_DISABLE_CONNECTIVITY = false;
|
||||
const FACTORY_DOCKER_STATE_SEVERITY: 'warning' | 'critical' = 'warning';
|
||||
|
||||
const FACTORY_STORAGE_DEFAULT = 85;
|
||||
const FACTORY_SNAPSHOT_DEFAULTS: SnapshotAlertConfig = {
|
||||
|
|
@ -1228,6 +1236,12 @@ const [appriseConfig, setAppriseConfig] = createSignal<UIAppriseConfig>(
|
|||
const [hostDefaults, setHostDefaults] = createSignal<Record<string, number | undefined>>({ ...FACTORY_HOST_DEFAULTS });
|
||||
|
||||
const [dockerDefaults, setDockerDefaults] = createSignal({ ...FACTORY_DOCKER_DEFAULTS });
|
||||
const [dockerDisableConnectivity, setDockerDisableConnectivity] = createSignal(
|
||||
FACTORY_DOCKER_STATE_DISABLE_CONNECTIVITY,
|
||||
);
|
||||
const [dockerPoweredOffSeverity, setDockerPoweredOffSeverity] = createSignal<'warning' | 'critical'>(
|
||||
FACTORY_DOCKER_STATE_SEVERITY,
|
||||
);
|
||||
const [dockerIgnoredPrefixes, setDockerIgnoredPrefixes] = createSignal<string[]>([]);
|
||||
|
||||
const [storageDefault, setStorageDefault] = createSignal(FACTORY_STORAGE_DEFAULT);
|
||||
|
|
@ -1253,6 +1267,8 @@ const [appriseConfig, setAppriseConfig] = createSignal<UIAppriseConfig>(
|
|||
|
||||
const resetDockerDefaults = () => {
|
||||
setDockerDefaults({ ...FACTORY_DOCKER_DEFAULTS });
|
||||
setDockerDisableConnectivity(FACTORY_DOCKER_STATE_DISABLE_CONNECTIVITY);
|
||||
setDockerPoweredOffSeverity(FACTORY_DOCKER_STATE_SEVERITY);
|
||||
setHasUnsavedChanges(true);
|
||||
};
|
||||
|
||||
|
|
@ -1516,6 +1532,8 @@ const [appriseConfig, setAppriseConfig] = createSignal<UIAppriseConfig>(
|
|||
memoryCriticalPct: dockerDefaultsValue.memoryCriticalPct,
|
||||
serviceWarnGapPercent: dockerDefaultsValue.serviceWarnGapPercent,
|
||||
serviceCriticalGapPercent: dockerDefaultsValue.serviceCriticalGapPercent,
|
||||
stateDisableConnectivity: dockerDisableConnectivity(),
|
||||
statePoweredOffSeverity: dockerPoweredOffSeverity(),
|
||||
},
|
||||
dockerIgnoredContainerPrefixes: dockerIgnoredPrefixes()
|
||||
.map((prefix) => prefix.trim())
|
||||
|
|
@ -1766,6 +1784,10 @@ const [appriseConfig, setAppriseConfig] = createSignal<UIAppriseConfig>(
|
|||
hostDefaults={hostDefaults}
|
||||
setHostDefaults={setHostDefaults}
|
||||
dockerDefaults={dockerDefaults}
|
||||
dockerDisableConnectivity={dockerDisableConnectivity}
|
||||
setDockerDisableConnectivity={setDockerDisableConnectivity}
|
||||
dockerPoweredOffSeverity={dockerPoweredOffSeverity}
|
||||
setDockerPoweredOffSeverity={setDockerPoweredOffSeverity}
|
||||
setDockerDefaults={setDockerDefaults}
|
||||
dockerIgnoredPrefixes={dockerIgnoredPrefixes}
|
||||
setDockerIgnoredPrefixes={setDockerIgnoredPrefixes}
|
||||
|
|
@ -2294,6 +2316,8 @@ interface ThresholdsTabProps {
|
|||
serviceWarnGapPercent: number;
|
||||
serviceCriticalGapPercent: number;
|
||||
};
|
||||
dockerDisableConnectivity: () => boolean;
|
||||
dockerPoweredOffSeverity: () => 'warning' | 'critical';
|
||||
dockerIgnoredPrefixes: () => string[];
|
||||
storageDefault: () => number;
|
||||
timeThresholds: () => { guest: number; node: number; storage: number; pbs: number };
|
||||
|
|
@ -2357,6 +2381,8 @@ interface ThresholdsTabProps {
|
|||
serviceCriticalGapPercent: number;
|
||||
}),
|
||||
) => void;
|
||||
setDockerDisableConnectivity: (value: boolean) => void;
|
||||
setDockerPoweredOffSeverity: (value: 'warning' | 'critical') => void;
|
||||
setDockerIgnoredPrefixes: (value: string[] | ((prev: string[]) => string[])) => void;
|
||||
setStorageDefault: (value: number) => void;
|
||||
setMetricTimeThresholds: (
|
||||
|
|
@ -2463,7 +2489,11 @@ function ThresholdsTab(props: ThresholdsTabProps) {
|
|||
setNodeDefaults={props.setNodeDefaults}
|
||||
setHostDefaults={props.setHostDefaults}
|
||||
dockerDefaults={props.dockerDefaults()}
|
||||
dockerDisableConnectivity={props.dockerDisableConnectivity}
|
||||
dockerPoweredOffSeverity={props.dockerPoweredOffSeverity}
|
||||
setDockerDefaults={props.setDockerDefaults}
|
||||
setDockerDisableConnectivity={props.setDockerDisableConnectivity}
|
||||
setDockerPoweredOffSeverity={props.setDockerPoweredOffSeverity}
|
||||
dockerIgnoredPrefixes={props.dockerIgnoredPrefixes}
|
||||
setDockerIgnoredPrefixes={props.setDockerIgnoredPrefixes}
|
||||
storageDefault={props.storageDefault}
|
||||
|
|
|
|||
|
|
@ -66,6 +66,8 @@ export interface DockerThresholdConfig {
|
|||
memoryCriticalPct?: number;
|
||||
serviceWarnGapPercent?: number;
|
||||
serviceCriticalGapPercent?: number;
|
||||
stateDisableConnectivity?: boolean;
|
||||
statePoweredOffSeverity?: 'warning' | 'critical';
|
||||
}
|
||||
|
||||
export interface PMGThresholdDefaults {
|
||||
|
|
|
|||
|
|
@ -264,14 +264,16 @@ type CustomAlertRule struct {
|
|||
|
||||
// DockerThresholdConfig represents Docker-specific alert thresholds
|
||||
type DockerThresholdConfig struct {
|
||||
CPU HysteresisThreshold `json:"cpu"` // CPU usage % threshold (default: 80%)
|
||||
Memory HysteresisThreshold `json:"memory"` // Memory usage % threshold (default: 85%)
|
||||
RestartCount int `json:"restartCount"` // Number of restarts to trigger alert (default: 3)
|
||||
RestartWindow int `json:"restartWindow"` // Time window in seconds for restart loop detection (default: 300 = 5min)
|
||||
MemoryWarnPct int `json:"memoryWarnPct"` // Memory limit % to trigger warning (default: 90)
|
||||
MemoryCriticalPct int `json:"memoryCriticalPct"` // Memory limit % to trigger critical (default: 95)
|
||||
ServiceWarnGapPct int `json:"serviceWarnGapPercent"` // % of desired tasks missing to trigger warning (default: 10)
|
||||
ServiceCritGapPct int `json:"serviceCriticalGapPercent"` // % of desired tasks missing to trigger critical (default: 50)
|
||||
CPU HysteresisThreshold `json:"cpu"` // CPU usage % threshold (default: 80%)
|
||||
Memory HysteresisThreshold `json:"memory"` // Memory usage % threshold (default: 85%)
|
||||
RestartCount int `json:"restartCount"` // Number of restarts to trigger alert (default: 3)
|
||||
RestartWindow int `json:"restartWindow"` // Time window in seconds for restart loop detection (default: 300 = 5min)
|
||||
MemoryWarnPct int `json:"memoryWarnPct"` // Memory limit % to trigger warning (default: 90)
|
||||
MemoryCriticalPct int `json:"memoryCriticalPct"` // Memory limit % to trigger critical (default: 95)
|
||||
ServiceWarnGapPct int `json:"serviceWarnGapPercent"` // % of desired tasks missing to trigger warning (default: 10)
|
||||
ServiceCritGapPct int `json:"serviceCriticalGapPercent"` // % of desired tasks missing to trigger critical (default: 50)
|
||||
StateDisableConnectivity bool `json:"stateDisableConnectivity,omitempty"` // Disable container offline/state alerts globally
|
||||
StatePoweredOffSeverity AlertLevel `json:"statePoweredOffSeverity,omitempty"` // Default severity for container state/offline alerts
|
||||
}
|
||||
|
||||
// PMGThresholdConfig represents Proxmox Mail Gateway-specific alert thresholds
|
||||
|
|
@ -500,12 +502,13 @@ func NewManager() *Manager {
|
|||
Disk: &HysteresisThreshold{Trigger: 90, Clear: 85},
|
||||
},
|
||||
DockerDefaults: DockerThresholdConfig{
|
||||
CPU: HysteresisThreshold{Trigger: 80, Clear: 75},
|
||||
Memory: HysteresisThreshold{Trigger: 85, Clear: 80},
|
||||
RestartCount: 3,
|
||||
RestartWindow: 300, // 5 minutes
|
||||
MemoryWarnPct: 90,
|
||||
MemoryCriticalPct: 95,
|
||||
CPU: HysteresisThreshold{Trigger: 80, Clear: 75},
|
||||
Memory: HysteresisThreshold{Trigger: 85, Clear: 80},
|
||||
RestartCount: 3,
|
||||
RestartWindow: 300, // 5 minutes
|
||||
MemoryWarnPct: 90,
|
||||
MemoryCriticalPct: 95,
|
||||
StatePoweredOffSeverity: AlertLevelWarning,
|
||||
},
|
||||
PMGDefaults: PMGThresholdConfig{
|
||||
QueueTotalWarning: 500, // Warning at 500 total queued messages
|
||||
|
|
@ -830,6 +833,10 @@ func (m *Manager) UpdateConfig(config AlertConfig) {
|
|||
Msg("Adjusting Docker service critical gap to match warning gap")
|
||||
config.DockerDefaults.ServiceCritGapPct = config.DockerDefaults.ServiceWarnGapPct
|
||||
}
|
||||
if config.DockerDefaults.StatePoweredOffSeverity == "" {
|
||||
config.DockerDefaults.StatePoweredOffSeverity = AlertLevelWarning
|
||||
}
|
||||
config.DockerDefaults.StatePoweredOffSeverity = normalizePoweredOffSeverity(config.DockerDefaults.StatePoweredOffSeverity)
|
||||
|
||||
// Initialize PMG defaults if missing/zero
|
||||
if config.PMGDefaults.QueueTotalWarning <= 0 {
|
||||
|
|
@ -3117,14 +3124,25 @@ func (m *Manager) checkDockerContainerState(host models.DockerHost, container mo
|
|||
stateKey := resourceID
|
||||
|
||||
m.mu.RLock()
|
||||
thresholds, exists := m.config.Overrides[resourceID]
|
||||
if !exists {
|
||||
thresholds = m.config.GuestDefaults
|
||||
}
|
||||
disableConnectivity := thresholds.DisableConnectivity
|
||||
severity := normalizePoweredOffSeverity(thresholds.PoweredOffSeverity)
|
||||
override, hasOverride := m.config.Overrides[resourceID]
|
||||
defaultDisable := m.config.DockerDefaults.StateDisableConnectivity
|
||||
defaultSeverity := normalizePoweredOffSeverity(m.config.DockerDefaults.StatePoweredOffSeverity)
|
||||
m.mu.RUnlock()
|
||||
|
||||
disableConnectivity := defaultDisable
|
||||
severity := defaultSeverity
|
||||
if hasOverride {
|
||||
if defaultDisable && !override.DisableConnectivity {
|
||||
disableConnectivity = false
|
||||
} else if override.DisableConnectivity {
|
||||
disableConnectivity = true
|
||||
}
|
||||
|
||||
if override.PoweredOffSeverity != "" {
|
||||
severity = normalizePoweredOffSeverity(override.PoweredOffSeverity)
|
||||
}
|
||||
}
|
||||
|
||||
if disableConnectivity {
|
||||
m.clearDockerContainerStateAlert(resourceID)
|
||||
return
|
||||
|
|
|
|||
|
|
@ -990,6 +990,162 @@ func TestDockerServiceReplicaAlerts(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestDockerServiceUpdateStateAlert(t *testing.T) {
|
||||
m := NewManager()
|
||||
cfg := m.GetConfig()
|
||||
cfg.Enabled = true
|
||||
m.UpdateConfig(cfg)
|
||||
|
||||
now := time.Now()
|
||||
host := models.DockerHost{
|
||||
ID: "host-update",
|
||||
DisplayName: "Swarm",
|
||||
Hostname: "swarm.local",
|
||||
Services: []models.DockerService{
|
||||
{
|
||||
ID: "svc-update",
|
||||
Name: "api",
|
||||
DesiredTasks: 1,
|
||||
RunningTasks: 1,
|
||||
UpdateStatus: &models.DockerServiceUpdate{
|
||||
State: "rollback_failed",
|
||||
Message: "Rollback failed",
|
||||
CompletedAt: &now,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
m.CheckDockerHost(host)
|
||||
|
||||
resourceID := dockerServiceResourceID(host.ID, "svc-update", "api")
|
||||
alertID := fmt.Sprintf("docker-service-health-%s", resourceID)
|
||||
alert, exists := m.activeAlerts[alertID]
|
||||
if !exists {
|
||||
t.Fatalf("expected docker service alert %s to be raised", alertID)
|
||||
}
|
||||
if alert.Level != AlertLevelCritical {
|
||||
t.Fatalf("expected critical severity for rollback failure, got %s", alert.Level)
|
||||
}
|
||||
if state, ok := alert.Metadata["updateState"].(string); !ok || state != "rollback_failed" {
|
||||
t.Fatalf("expected updateState metadata to be rollback_failed, got %v", alert.Metadata["updateState"])
|
||||
}
|
||||
}
|
||||
|
||||
func TestDockerContainerStateUsesDockerDefaults(t *testing.T) {
|
||||
m := NewManager()
|
||||
cfg := m.GetConfig()
|
||||
cfg.DockerDefaults.StatePoweredOffSeverity = AlertLevelCritical
|
||||
m.UpdateConfig(cfg)
|
||||
|
||||
container := models.DockerContainer{
|
||||
ID: "container-1",
|
||||
Name: "web",
|
||||
State: "exited",
|
||||
Status: "Exited (1) seconds ago",
|
||||
}
|
||||
host := models.DockerHost{
|
||||
ID: "host-1",
|
||||
DisplayName: "Docker Host",
|
||||
Hostname: "docker.local",
|
||||
Containers: []models.DockerContainer{container},
|
||||
}
|
||||
|
||||
m.CheckDockerHost(host)
|
||||
m.CheckDockerHost(host)
|
||||
|
||||
resourceID := dockerResourceID(host.ID, container.ID)
|
||||
alertID := fmt.Sprintf("docker-container-state-%s", resourceID)
|
||||
alert, exists := m.activeAlerts[alertID]
|
||||
if !exists {
|
||||
t.Fatalf("expected docker container state alert %s to be raised", alertID)
|
||||
}
|
||||
if alert.Level != AlertLevelCritical {
|
||||
t.Fatalf("expected critical severity from docker defaults, got %s", alert.Level)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDockerContainerStateRespectsDisableDefault(t *testing.T) {
|
||||
m := NewManager()
|
||||
cfg := m.GetConfig()
|
||||
cfg.DockerDefaults.StateDisableConnectivity = true
|
||||
m.UpdateConfig(cfg)
|
||||
|
||||
container := models.DockerContainer{
|
||||
ID: "container-2",
|
||||
Name: "batch",
|
||||
State: "exited",
|
||||
Status: "Exited (0) seconds ago",
|
||||
}
|
||||
host := models.DockerHost{
|
||||
ID: "host-2",
|
||||
DisplayName: "Docker Host",
|
||||
Hostname: "docker.example",
|
||||
Containers: []models.DockerContainer{container},
|
||||
}
|
||||
|
||||
m.CheckDockerHost(host)
|
||||
m.CheckDockerHost(host)
|
||||
|
||||
resourceID := dockerResourceID(host.ID, container.ID)
|
||||
alertID := fmt.Sprintf("docker-container-state-%s", resourceID)
|
||||
if _, exists := m.activeAlerts[alertID]; exists {
|
||||
t.Fatalf("did not expect docker container state alert when defaults disable connectivity")
|
||||
}
|
||||
}
|
||||
|
||||
func TestDockerContainerMemoryLimitHysteresis(t *testing.T) {
|
||||
m := NewManager()
|
||||
|
||||
hostID := "host-mem"
|
||||
containerID := "container-mem"
|
||||
hostHigh := models.DockerHost{
|
||||
ID: hostID,
|
||||
DisplayName: "Docker Host",
|
||||
Hostname: "docker.mem",
|
||||
Containers: []models.DockerContainer{
|
||||
{
|
||||
ID: containerID,
|
||||
Name: "memory-hog",
|
||||
State: "running",
|
||||
Status: "Up 10 minutes",
|
||||
MemoryUsage: 96 * 1024 * 1024,
|
||||
MemoryLimit: 100 * 1024 * 1024,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
m.CheckDockerHost(hostHigh)
|
||||
|
||||
resourceID := dockerResourceID(hostID, containerID)
|
||||
alertID := fmt.Sprintf("docker-container-memory-limit-%s", resourceID)
|
||||
if _, exists := m.activeAlerts[alertID]; !exists {
|
||||
t.Fatalf("expected memory limit alert to be raised")
|
||||
}
|
||||
|
||||
hostLow := models.DockerHost{
|
||||
ID: hostID,
|
||||
DisplayName: "Docker Host",
|
||||
Hostname: "docker.mem",
|
||||
Containers: []models.DockerContainer{
|
||||
{
|
||||
ID: containerID,
|
||||
Name: "memory-hog",
|
||||
State: "running",
|
||||
Status: "Up 12 minutes",
|
||||
MemoryUsage: 80 * 1024 * 1024,
|
||||
MemoryLimit: 100 * 1024 * 1024,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
m.CheckDockerHost(hostLow)
|
||||
|
||||
if _, exists := m.activeAlerts[alertID]; exists {
|
||||
t.Fatalf("expected memory limit alert to clear after usage dropped below hysteresis threshold")
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateConfigClampsDockerServiceCriticalGap(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
|
|
|||
|
|
@ -159,3 +159,4 @@ func (h *DockerMetadataHandler) HandleDeleteMetadata(w http.ResponseWriter, r *h
|
|||
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
}
|
||||
// test comment
|
||||
|
|
|
|||
|
|
@ -162,8 +162,9 @@ func (cw *ConfigWatcher) watchForChanges() {
|
|||
}
|
||||
|
||||
if cw.apiTokensPath != "" && (filepath.Base(event.Name) == filepath.Base(cw.apiTokensPath) || event.Name == cw.apiTokensPath) {
|
||||
// Debounce
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
// Debounce - wait longer for atomic file operations to complete
|
||||
// (write to .tmp, rename to final file)
|
||||
time.Sleep(250 * time.Millisecond)
|
||||
|
||||
if event.Op&(fsnotify.Write|fsnotify.Create) != 0 {
|
||||
log.Info().Str("event", event.Op.String()).Msg("Detected API token file change")
|
||||
|
|
@ -399,12 +400,44 @@ func (cw *ConfigWatcher) reloadAPITokens() {
|
|||
return
|
||||
}
|
||||
|
||||
tokens, err := globalPersistence.LoadAPITokens()
|
||||
// Preserve existing tokens in case reload fails
|
||||
existingTokens := cw.config.APITokens
|
||||
existingCount := len(existingTokens)
|
||||
|
||||
// Retry logic to handle temporary file system issues
|
||||
var tokens []APITokenRecord
|
||||
var err error
|
||||
maxRetries := 3
|
||||
retryDelay := 50 * time.Millisecond
|
||||
|
||||
for attempt := 1; attempt <= maxRetries; attempt++ {
|
||||
tokens, err = globalPersistence.LoadAPITokens()
|
||||
if err == nil {
|
||||
break
|
||||
}
|
||||
|
||||
if attempt < maxRetries {
|
||||
log.Warn().
|
||||
Err(err).
|
||||
Int("attempt", attempt).
|
||||
Int("maxRetries", maxRetries).
|
||||
Dur("retryDelay", retryDelay).
|
||||
Msg("Failed to reload API tokens, retrying...")
|
||||
time.Sleep(retryDelay)
|
||||
retryDelay *= 2 // Exponential backoff
|
||||
}
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
log.Error().Err(err).Msg("Failed to reload API tokens")
|
||||
log.Error().
|
||||
Err(err).
|
||||
Int("existingTokens", existingCount).
|
||||
Msg("Failed to reload API tokens after retries - preserving existing tokens")
|
||||
// CRITICAL: Keep existing tokens rather than clearing them
|
||||
return
|
||||
}
|
||||
|
||||
// Only update if we successfully loaded tokens
|
||||
cw.config.APITokens = tokens
|
||||
cw.config.SortAPITokens()
|
||||
cw.config.APITokenEnabled = len(tokens) > 0
|
||||
|
|
@ -415,7 +448,10 @@ func (cw *ConfigWatcher) reloadAPITokens() {
|
|||
}
|
||||
}
|
||||
|
||||
log.Info().Int("count", len(tokens)).Msg("Reloaded API tokens from disk")
|
||||
log.Info().
|
||||
Int("count", len(tokens)).
|
||||
Int("previousCount", existingCount).
|
||||
Msg("Reloaded API tokens from disk")
|
||||
}
|
||||
|
||||
// reloadMockConfig handles mock.env file changes
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue