mirror of https://github.com/rcourtman/Pulse.git, synced 2026-05-09 19:32:24 +00:00
Explain canonical monitored system status reasons
parent 3a6a376b33
commit 1e93ede11e

14 changed files with 625 additions and 47 deletions
@@ -234,6 +234,10 @@ That same ledger read now also carries backend-owned status explanation copy,
 and lifecycle-adjacent details must render it beside the counting rationale so
 operators can interpret warning, offline, and unknown states without inventing
 local status semantics.
+Those status details are now structured as well: lifecycle-adjacent consumers
+must preserve the canonical reason list from the ledger read so operators can
+see which grouped source or surface degraded and when it last reported,
+instead of only seeing a generic warning/offline paragraph.
 Lifecycle-adjacent workspace copy must also keep the same commercial framing:
 infrastructure operations may point operators to Pulse Pro for billing, but it
 must describe that boundary in monitored-system, plan-limit, and license-status
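
As a sketch of the structured reason shape that hunk describes, using the
MonitoredSystemLedgerStatusReason struct and fixture values that appear later
in this commit:

	// One canonical status reason as it crosses the ledger API; the field
	// values below are taken from this commit's own test fixtures.
	reason := MonitoredSystemLedgerStatusReason{
		Kind:     "source-stale",
		Name:     "Tower",
		Type:     "host",
		Source:   "agent",
		Status:   "stale",
		LastSeen: "2026-03-23T11:55:00Z",
		Summary:  "Agent data for Tower is stale (last reported 2026-03-23T11:55:00Z).",
	}

Each reason names the degraded grouped source or surface, its normalized
status, and its last-seen timestamp, which is exactly what the prose above
asks lifecycle-adjacent consumers to preserve.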
@@ -243,6 +243,11 @@ That same contract now also owns the backend-authored status explanation paired
 with that enum, and the monitored-system ledger details surface must render it
 alongside the counting explanation instead of inventing page-local wording for
 what online, warning, offline, or unknown means.
+That nested status explanation is now a structured contract, not summary-only
+copy: `/api/license/monitored-system-ledger` must preserve the canonical
+summary plus the ordered reason list from unified resources, including the
+degraded source or surface, its status, and its last-seen timestamp, so mixed
+fresh/stale grouped systems remain explainable through one governed API shape.
 That client contract must also fail closed when older or partial payloads omit
 the nested explanation object: the frontend may normalize missing explanation
 fields to empty reasons/surfaces plus a safe default summary, but it must not
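
A minimal sketch of that fail-closed rule, condensing the NormalizeCollections
and default-summary helpers added later in this commit (the normalizeEntry
name is illustrative, not part of the commit):

	func normalizeEntry(e MonitoredSystemLedgerEntry) MonitoredSystemLedgerEntry {
		// Absent reasons normalize to an empty slice; nothing is fabricated.
		if e.StatusExplanation.Reasons == nil {
			e.StatusExplanation.Reasons = []MonitoredSystemLedgerStatusReason{}
		}
		// An absent summary falls back to safe default copy, but a canonical
		// backend-authored summary always wins when present.
		if e.StatusExplanation.Summary == "" {
			e.StatusExplanation.Summary = "Pulse cannot determine a canonical runtime status for this monitored system yet."
		}
		return e
	}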
@@ -215,6 +215,10 @@ view may normalize a safe default when that field is absent during mixed-version
 rollouts, but it must render the canonical backend explanation when present
 instead of inventing page-local wording for what warning, offline, or unknown
 means on a counted monitored system.
+That same cloud-paid surface must now also render the canonical status reason
+list when present, so customers can see exactly which grouped source or
+top-level surface degraded and when it last reported rather than only reading
+generic status copy beside a fresh aggregate `Last Seen` value.
 Frontend billing/admin surfaces must not synthesize `plan_version` from
 subscription lifecycle state. When a hosted billing record lacks a plan label,
 the UI must preserve that absence instead of fabricating values like `active`
@@ -370,6 +370,12 @@ storage-adjacent API wiring may consume the canonical monitored-system ledger
 and monitored-system cap helpers, but it must not revive deleted agent-era
 helper names or imply that API-backed infrastructure sits outside the counted
 system model.
+That same shared `internal/api/` dependency now also assumes monitored-system
+ledger status details stay canonical and source-aware: storage- or recovery-
+adjacent consumers may read the ledger’s nested status explanation, but they
+must preserve the backend-provided reason list for stale or offline grouped
+sources instead of reducing those mixed fresh/stale system states back to a
+generic label.
 That same shared `internal/api/` dependency now also assumes self-hosted
 commercial counting is canonical at the top-level monitored-system boundary:
 shared setup, deploy, entitlement, and API-backed monitoring helpers may not
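
A sketch of what preserving that reason list means for such a consumer,
assuming the ledger entry types from this commit (the rendering loop itself is
hypothetical):

	// Keep every backend-provided reason; never collapse them to one label.
	for _, reason := range entry.StatusExplanation.Reasons {
		fmt.Printf("%s [%s via %s, last seen %s]: %s\n",
			reason.Name, reason.Status, reason.Source, reason.LastSeen, reason.Summary)
	}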
@@ -171,6 +171,14 @@ grouping reasons plus included top-level surfaces, and fall back to an
 explicit standalone explanation when no cross-source merge occurred. Support
 and billing surfaces must consume that shared explanation contract instead of
 reconstructing count reasons from API-local heuristics.
+That same monitored-system contract now also owns canonical runtime-status
+explanations. When a grouped monitored system resolves to warning, offline, or
+unknown, unified resources must expose the shared summary plus structured
+degraded-status reasons derived from the grouped top-level resources and their
+source freshness state, including which source or surface degraded and the
+corresponding last-seen timestamp. Billing and support surfaces must consume
+that shared reason list instead of trying to infer why a fresh overall
+`last_seen` can still coincide with warning status.

 The unified-resource runtime now also owns the durable change timeline for the
 canonical resource view. `internal/unifiedresources/monitor_adapter.go` feeds
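
A condensed sketch of that derivation, following the
monitoredSystemResourceStatusReasons helper added later in this commit
(sortedSourceKeys stands in for the commit's inline sort):

	// Each non-online grouped source contributes one structured reason, which
	// is why a fresh aggregate last_seen can coexist with a warning status.
	for _, source := range sortedSourceKeys(resource.SourceStatus) {
		ss := resource.SourceStatus[source]
		status := normalizeMonitoredSystemSourceStatus(ss.Status) // "stale", "offline", "unknown"
		if status == "online" {
			continue // healthy sources never generate a reason
		}
		reasons = append(reasons, MonitoredSystemStatusReason{
			Kind:     "source-" + status, // e.g. "source-stale"
			Name:     name,
			Type:     resourceType,
			Source:   string(source),
			Status:   status,
			LastSeen: ss.LastSeen,
			Summary:  monitoredSystemSourceStatusReasonSummary(name, source, status, ss.LastSeen),
		})
	}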
@@ -38,6 +38,7 @@ describe('MonitoredSystemLedgerAPI', () => {
          status: 'online',
          status_explanation: {
            summary: 'All included top-level collection paths currently report online status.',
+           reasons: [],
          },
          last_seen: '2026-01-01T00:00:00Z',
          source: 'agent',
@@ -63,6 +64,7 @@ describe('MonitoredSystemLedgerAPI', () => {

     expect(result.systems[0]?.explanation.summary).toContain('Counts as one monitored system');
     expect(result.systems[0]?.status_explanation?.summary).toContain('currently report online');
+    expect(result.systems[0]?.status_explanation?.reasons).toEqual([]);
     expect(result.systems[0]?.explanation.reasons).toHaveLength(1);
     expect(result.systems[0]?.explanation.surfaces).toHaveLength(1);
   });
@@ -86,6 +88,7 @@ describe('MonitoredSystemLedgerAPI', () => {

     expect(result.systems[0]?.explanation.summary).toContain('counts this top-level collection path');
     expect(result.systems[0]?.status_explanation?.summary).toContain('currently report online');
+    expect(result.systems[0]?.status_explanation?.reasons).toEqual([]);
     expect(result.systems[0]?.explanation.reasons).toEqual([]);
     expect(result.systems[0]?.explanation.surfaces).toEqual([]);
   });
@@ -110,6 +113,50 @@ describe('MonitoredSystemLedgerAPI', () => {
     expect(result.systems[0]?.status).toBe('warning');
   });

+  it('preserves canonical status explanation reasons from the API contract', async () => {
+    vi.mocked(apiFetchJSON).mockResolvedValueOnce({
+      systems: [
+        {
+          name: 'Tower',
+          type: 'host',
+          status: 'warning',
+          status_explanation: {
+            summary: 'At least one included source is stale, so Pulse marks this monitored system as warning.',
+            reasons: [
+              {
+                kind: 'source-stale',
+                name: 'Tower',
+                type: 'host',
+                source: 'agent',
+                status: 'stale',
+                last_seen: '2026-03-23T11:55:00Z',
+                summary: 'Agent data for Tower is stale (last reported 2026-03-23T11:55:00Z).',
+              },
+            ],
+          },
+          last_seen: '2026-03-23T11:59:50Z',
+          source: 'multiple',
+        },
+      ],
+      total: 1,
+      limit: 5,
+    });
+
+    const result = await MonitoredSystemLedgerAPI.getLedger();
+
+    expect(result.systems[0]?.status_explanation?.reasons).toEqual([
+      {
+        kind: 'source-stale',
+        name: 'Tower',
+        type: 'host',
+        source: 'agent',
+        status: 'stale',
+        last_seen: '2026-03-23T11:55:00Z',
+        summary: 'Agent data for Tower is stale (last reported 2026-03-23T11:55:00Z).',
+      },
+    ]);
+  });
+
   it('fails closed to unknown for unsupported status values', async () => {
     vi.mocked(apiFetchJSON).mockResolvedValueOnce({
       systems: [
@@ -22,6 +22,23 @@ export interface MonitoredSystemLedgerExplanation {

 export interface MonitoredSystemLedgerStatusExplanation {
   summary: string;
+  reasons: MonitoredSystemLedgerStatusReason[];
 }

+export type MonitoredSystemLedgerStatusReasonStatus =
+  | 'online'
+  | 'stale'
+  | 'offline'
+  | 'unknown';
+
+export interface MonitoredSystemLedgerStatusReason {
+  kind: string;
+  name: string;
+  type: string;
+  source: string;
+  status: MonitoredSystemLedgerStatusReasonStatus;
+  last_seen: string;
+  summary: string;
+}
+
 export interface MonitoredSystemLedgerEntry {
@@ -62,6 +79,7 @@ function normalizeMonitoredSystemLedgerEntry(
     status,
     status_explanation: {
       summary: entry.status_explanation?.summary ?? defaultMonitoredSystemStatusExplanation(status),
+      reasons: (entry.status_explanation?.reasons ?? []).map(normalizeMonitoredSystemLedgerStatusReason),
     },
     explanation: {
       summary:
@@ -92,10 +110,34 @@ function defaultMonitoredSystemStatusExplanation(status: MonitoredSystemLedgerSt
     case 'online':
       return 'All included top-level collection paths currently report online status.';
     case 'warning':
-      return 'At least one included top-level collection path is degraded or stale.';
+      return 'At least one included top-level collection path is degraded, so Pulse marks this monitored system as warning.';
     case 'offline':
-      return 'At least one included top-level collection path is offline or disconnected.';
+      return 'At least one included source is offline or disconnected, so Pulse marks this monitored system as offline.';
     default:
       return 'Pulse cannot determine a canonical runtime status for this monitored system yet.';
   }
 }
+
+function normalizeMonitoredSystemLedgerStatusReason(
+  reason: MonitoredSystemLedgerStatusReason,
+): MonitoredSystemLedgerStatusReason {
+  return {
+    ...reason,
+    status: normalizeMonitoredSystemLedgerStatusReasonStatus(reason.status),
+    last_seen: reason.last_seen ?? '',
+  };
+}
+
+function normalizeMonitoredSystemLedgerStatusReasonStatus(
+  status: MonitoredSystemLedgerStatusReasonStatus | string | null | undefined,
+): MonitoredSystemLedgerStatusReasonStatus {
+  switch ((status ?? '').trim().toLowerCase()) {
+    case 'online':
+    case 'stale':
+    case 'offline':
+    case 'unknown':
+      return (status ?? '').trim().toLowerCase() as MonitoredSystemLedgerStatusReasonStatus;
+    default:
+      return 'unknown';
+  }
+}
@@ -51,6 +51,7 @@ function systemStatusExplanation(system: MonitoredSystemLedgerEntry): MonitoredS
     summary:
       system.status_explanation?.summary ??
       'Pulse cannot determine a canonical runtime status for this monitored system yet.',
+    reasons: system.status_explanation?.reasons ?? [],
   };
 }

@@ -179,6 +180,13 @@ export function MonitoredSystemLedgerPanel(props: MonitoredSystemLedgerPanelProp
           <p class="whitespace-normal text-base-content">
             {statusExplanation.summary}
           </p>
+          <Show when={statusExplanation.reasons.length > 0}>
+            <ul class="space-y-1 whitespace-normal text-base-content">
+              <For each={statusExplanation.reasons}>
+                {(reason) => <li>{reason.summary}</li>}
+              </For>
+            </ul>
+          </Show>
         </div>
         <p class="whitespace-normal text-base-content">
           {explanation.summary}
@@ -77,6 +77,7 @@ describe('MonitoredSystemLedgerPanel', () => {
         status: 'online',
         status_explanation: {
           summary: 'All included top-level collection paths currently report online status.',
+          reasons: [],
         },
         last_seen: '2026-01-01T00:00:00Z',
         source: 'agent',
@@ -139,6 +140,7 @@ describe('MonitoredSystemLedgerPanel', () => {
         status: 'online',
         status_explanation: {
           summary: 'All included top-level collection paths currently report online status.',
+          reasons: [],
         },
         last_seen: '2026-01-01T00:00:00Z',
         source: 'agent',
@@ -161,7 +163,19 @@ describe('MonitoredSystemLedgerPanel', () => {
         status: 'offline',
         status_explanation: {
           summary:
-            'At least one included top-level collection path is offline or disconnected, so Pulse marks this monitored system as offline.',
+            'At least one included source is offline or disconnected, so Pulse marks this monitored system as offline.',
+          reasons: [
+            {
+              kind: 'source-offline',
+              name: 'server-b',
+              type: 'pbs-server',
+              source: 'pbs',
+              status: 'offline',
+              last_seen: '2026-01-01T23:55:00Z',
+              summary:
+                'PBS data for server-b is offline or disconnected (last reported 2026-01-01T23:55:00Z).',
+            },
+          ],
         },
         last_seen: '2026-01-02T00:00:00Z',
         source: 'pbs',
@@ -220,7 +234,12 @@ describe('MonitoredSystemLedgerPanel', () => {
     expect(screen.getByText('Current status')).toBeInTheDocument();
     expect(
       screen.getByText(
-        'At least one included top-level collection path is offline or disconnected, so Pulse marks this monitored system as offline.',
+        'At least one included source is offline or disconnected, so Pulse marks this monitored system as offline.',
       ),
     ).toBeInTheDocument();
+    expect(
+      screen.getByText(
+        'PBS data for server-b is offline or disconnected (last reported 2026-01-01T23:55:00Z).',
+      ),
+    ).toBeInTheDocument();
     expect(screen.getByText('Included collection paths')).toBeInTheDocument();
@@ -239,6 +258,7 @@ describe('MonitoredSystemLedgerPanel', () => {
         status: 'online',
         status_explanation: {
           summary: 'All included top-level collection paths currently report online status.',
+          reasons: [],
         },
         last_seen: '2026-01-01T00:00:00Z',
         source: 'agent',
@@ -581,7 +581,18 @@ func TestContract_MonitoredSystemLedgerJSONSnapshot(t *testing.T) {
 				Type:   "host",
 				Status: "warning",
 				StatusExplanation: MonitoredSystemLedgerStatusExplanation{
-					Summary: "At least one included top-level collection path is degraded or stale, so Pulse marks this monitored system as warning.",
+					Summary: "At least one included source is stale, so Pulse marks this monitored system as warning.",
+					Reasons: []MonitoredSystemLedgerStatusReason{
+						{
+							Kind:     "source-stale",
+							Name:     "Tower",
+							Type:     "host",
+							Source:   "agent",
+							Status:   "stale",
+							LastSeen: "2026-03-18T17:25:00Z",
+							Summary:  "Agent data for Tower is stale (last reported 2026-03-18T17:25:00Z).",
+						},
+					},
 				},
 				LastSeen: "2026-03-18T17:30:00Z",
 				Source:   "agent",
@@ -620,7 +631,18 @@ func TestContract_MonitoredSystemLedgerJSONSnapshot(t *testing.T) {
 			"type":"host",
 			"status":"warning",
 			"status_explanation":{
-				"summary":"At least one included top-level collection path is degraded or stale, so Pulse marks this monitored system as warning."
+				"summary":"At least one included source is stale, so Pulse marks this monitored system as warning.",
+				"reasons":[
+					{
+						"kind":"source-stale",
+						"name":"Tower",
+						"type":"host",
+						"source":"agent",
+						"status":"stale",
+						"last_seen":"2026-03-18T17:25:00Z",
+						"summary":"Agent data for Tower is stale (last reported 2026-03-18T17:25:00Z)."
+					}
+				]
 			},
 			"last_seen":"2026-03-18T17:30:00Z",
 			"source":"agent",
@@ -23,7 +23,18 @@ type MonitoredSystemLedgerEntry struct {
 }

 type MonitoredSystemLedgerStatusExplanation struct {
-	Summary string `json:"summary"`
+	Summary string                              `json:"summary"`
+	Reasons []MonitoredSystemLedgerStatusReason `json:"reasons"`
 }

+type MonitoredSystemLedgerStatusReason struct {
+	Kind     string `json:"kind"`
+	Name     string `json:"name"`
+	Type     string `json:"type"`
+	Source   string `json:"source"`
+	Status   string `json:"status"`
+	LastSeen string `json:"last_seen"`
+	Summary  string `json:"summary"`
+}
+
 type MonitoredSystemLedgerExplanation struct {
@@ -66,6 +77,9 @@ func (r MonitoredSystemLedgerResponse) NormalizeCollections() MonitoredSystemLed
 }

 func (e MonitoredSystemLedgerEntry) NormalizeCollections() MonitoredSystemLedgerEntry {
+	if e.StatusExplanation.Reasons == nil {
+		e.StatusExplanation.Reasons = []MonitoredSystemLedgerStatusReason{}
+	}
 	if e.Explanation.Reasons == nil {
 		e.Explanation.Reasons = []MonitoredSystemLedgerExplanationReason{}
 	}
@@ -107,7 +121,7 @@ func (r *Router) handleMonitoredSystemLedger(w http.ResponseWriter, req *http.Re
 			Name:              system.Name,
 			Type:              system.Type,
 			Status:            status,
-			StatusExplanation: monitoredSystemLedgerStatusExplanation(status),
+			StatusExplanation: monitoredSystemLedgerStatusExplanation(system.StatusExplanation, status),
 			LastSeen:          formatLastSeen(system.LastSeen),
 			Source:            system.Source,
 			Explanation:       monitoredSystemLedgerExplanation(system.Explanation),
@@ -138,24 +152,53 @@ func normalizeStatus(s string) string {
 	}
 }

-func monitoredSystemLedgerStatusExplanation(status string) MonitoredSystemLedgerStatusExplanation {
+func monitoredSystemLedgerStatusExplanation(
+	explanation unifiedresources.MonitoredSystemStatusExplanation,
+	status string,
+) MonitoredSystemLedgerStatusExplanation {
+	reasons := make([]MonitoredSystemLedgerStatusReason, 0, len(explanation.Reasons))
+	for _, reason := range explanation.Reasons {
+		reasons = append(reasons, MonitoredSystemLedgerStatusReason{
+			Kind:     reason.Kind,
+			Name:     reason.Name,
+			Type:     reason.Type,
+			Source:   reason.Source,
+			Status:   normalizeMonitoredSystemLedgerReasonStatus(reason.Status),
+			LastSeen: formatLastSeen(reason.LastSeen),
+			Summary:  reason.Summary,
+		})
+	}
+
+	summary := explanation.Summary
+	if summary == "" {
+		summary = defaultMonitoredSystemLedgerStatusSummary(status)
+	}
+
+	return MonitoredSystemLedgerStatusExplanation{
+		Summary: summary,
+		Reasons: reasons,
+	}
+}
+
+func defaultMonitoredSystemLedgerStatusSummary(status string) string {
 	switch status {
 	case "online":
-		return MonitoredSystemLedgerStatusExplanation{
-			Summary: "All included top-level collection paths currently report online status.",
-		}
+		return "All included top-level collection paths currently report online status."
 	case "warning":
-		return MonitoredSystemLedgerStatusExplanation{
-			Summary: "At least one included top-level collection path is degraded or stale, so Pulse marks this monitored system as warning.",
-		}
+		return "At least one included top-level collection path is degraded, so Pulse marks this monitored system as warning."
 	case "offline":
-		return MonitoredSystemLedgerStatusExplanation{
-			Summary: "At least one included top-level collection path is offline or disconnected, so Pulse marks this monitored system as offline.",
-		}
+		return "At least one included source is offline or disconnected, so Pulse marks this monitored system as offline."
 	default:
-		return MonitoredSystemLedgerStatusExplanation{
-			Summary: "Pulse cannot determine a canonical runtime status for this monitored system yet.",
-		}
+		return "Pulse cannot determine a canonical runtime status for this monitored system yet."
 	}
 }
+
+func normalizeMonitoredSystemLedgerReasonStatus(status string) string {
+	switch status {
+	case "online", "stale", "offline", "unknown":
+		return status
+	default:
+		return "unknown"
+	}
+}

@@ -6,6 +6,8 @@ import (
 	"net/http/httptest"
 	"testing"
+	"time"

+	"github.com/rcourtman/pulse-go-rewrite/internal/unifiedresources"
 )

 func TestMonitoredSystemLedgerEntryTypes(t *testing.T) {
@@ -15,6 +17,7 @@ func TestMonitoredSystemLedgerEntryTypes(t *testing.T) {
 		Status: "online",
 		StatusExplanation: MonitoredSystemLedgerStatusExplanation{
 			Summary: "All included top-level collection paths currently report online status.",
+			Reasons: []MonitoredSystemLedgerStatusReason{},
 		},
 		LastSeen: "2025-01-01T00:00:00Z",
 		Source:   "agent",
@@ -42,6 +45,9 @@ func TestMonitoredSystemLedgerEntryTypes(t *testing.T) {
 	if decoded.StatusExplanation.Summary == "" {
 		t.Errorf("status explanation mismatch: %+v", decoded.StatusExplanation)
 	}
+	if decoded.StatusExplanation.Reasons == nil {
+		t.Errorf("status explanation reasons mismatch: %+v", decoded.StatusExplanation)
+	}
 	if decoded.Source != "agent" {
 		t.Errorf("source mismatch: got %q", decoded.Source)
 	}
@@ -85,21 +91,31 @@ func TestFormatLastSeen(t *testing.T) {
 }

 func TestMonitoredSystemLedgerStatusExplanation(t *testing.T) {
-	tests := []struct {
-		status string
-		want   string
-	}{
-		{"online", "All included top-level collection paths currently report online status."},
-		{"warning", "At least one included top-level collection path is degraded or stale, so Pulse marks this monitored system as warning."},
-		{"offline", "At least one included top-level collection path is offline or disconnected, so Pulse marks this monitored system as offline."},
-		{"unknown", "Pulse cannot determine a canonical runtime status for this monitored system yet."},
+	got := monitoredSystemLedgerStatusExplanation(unifiedresources.MonitoredSystemStatusExplanation{
+		Summary: "At least one included source is stale, so Pulse marks this monitored system as warning.",
+		Reasons: []unifiedresources.MonitoredSystemStatusReason{
+			{
+				Kind:     "source-stale",
+				Name:     "Tower",
+				Type:     "host",
+				Source:   "agent",
+				Status:   "stale",
+				LastSeen: time.Date(2026, 3, 23, 11, 55, 0, 0, time.UTC),
+				Summary:  "Agent data for Tower is stale (last reported 2026-03-23T11:55:00Z).",
+			},
+		},
+	}, "warning")
+	if got.Summary != "At least one included source is stale, so Pulse marks this monitored system as warning." {
+		t.Fatalf("unexpected status summary: %+v", got)
 	}

-	for _, tt := range tests {
-		got := monitoredSystemLedgerStatusExplanation(tt.status)
-		if got.Summary != tt.want {
-			t.Errorf("monitoredSystemLedgerStatusExplanation(%q) = %q, want %q", tt.status, got.Summary, tt.want)
-		}
+	if len(got.Reasons) != 1 {
+		t.Fatalf("expected one status reason, got %+v", got)
+	}
+	if got.Reasons[0].Status != "stale" {
+		t.Fatalf("expected stale status reason, got %+v", got.Reasons[0])
+	}
+	if got.Reasons[0].LastSeen != "2026-03-23T11:55:00Z" {
+		t.Fatalf("expected formatted reason last_seen, got %+v", got.Reasons[0])
 	}
@@ -141,11 +157,17 @@ func TestMonitoredSystemLedgerNilSystemsBecomesEmptyArray(t *testing.T) {
 func TestMonitoredSystemLedgerEntryNormalizeCollections(t *testing.T) {
 	entry := MonitoredSystemLedgerEntry{
 		Name: "server-1",
+		StatusExplanation: MonitoredSystemLedgerStatusExplanation{
+			Summary: "Pulse cannot determine a canonical runtime status for this monitored system yet.",
+		},
 		Explanation: MonitoredSystemLedgerExplanation{
 			Summary: "Counts as one monitored system because Pulse sees one top-level host view from agent.",
 		},
 	}.NormalizeCollections()

+	if entry.StatusExplanation.Reasons == nil {
+		t.Fatal("expected status explanation reasons to normalize to an empty slice")
+	}
 	if entry.Explanation.Reasons == nil {
 		t.Fatal("expected explanation reasons to normalize to an empty slice")
 	}
@@ -166,6 +188,7 @@ func TestHandleMonitoredSystemLedgerHTTP(t *testing.T) {
 		Status: "online",
 		StatusExplanation: MonitoredSystemLedgerStatusExplanation{
 			Summary: "All included top-level collection paths currently report online status.",
+			Reasons: []MonitoredSystemLedgerStatusReason{},
 		},
 		LastSeen: "2025-01-01T00:00:00Z",
 		Source:   "agent",
@@ -206,6 +229,9 @@ func TestHandleMonitoredSystemLedgerHTTP(t *testing.T) {
 	if decoded.Systems[0].StatusExplanation.Summary == "" {
 		t.Errorf("expected status explanation summary, got %+v", decoded.Systems[0].StatusExplanation)
 	}
+	if decoded.Systems[0].StatusExplanation.Reasons == nil {
+		t.Errorf("expected status explanation reasons, got %+v", decoded.Systems[0].StatusExplanation)
+	}
 	if decoded.Systems[0].Explanation.Summary == "" {
 		t.Errorf("expected explanation summary, got %+v", decoded.Systems[0].Explanation)
 	}
@@ -42,15 +42,35 @@ type MonitoredSystemGroupingSurface struct {
 	Source string
 }

+// MonitoredSystemStatusExplanation explains why Pulse chose the canonical
+// monitored-system runtime status.
+type MonitoredSystemStatusExplanation struct {
+	Summary string
+	Reasons []MonitoredSystemStatusReason
+}
+
+// MonitoredSystemStatusReason captures one canonical degraded-status signal
+// that contributed to the monitored-system runtime status.
+type MonitoredSystemStatusReason struct {
+	Kind     string
+	Name     string
+	Type     string
+	Source   string
+	Status   string
+	LastSeen time.Time
+	Summary  string
+}
+
 // MonitoredSystemRecord describes a counted top-level monitored system after
 // canonical cross-view deduplication.
 type MonitoredSystemRecord struct {
-	Name        string
-	Type        string
-	Status      ResourceStatus
-	LastSeen    time.Time
-	Source      string
-	Explanation MonitoredSystemGroupingExplanation
+	Name              string
+	Type              string
+	Status            ResourceStatus
+	StatusExplanation MonitoredSystemStatusExplanation
+	LastSeen          time.Time
+	Source            string
+	Explanation       MonitoredSystemGroupingExplanation
 }

 // MonitoredSystemCount returns the number of top-level monitored systems after
@@ -152,13 +172,15 @@ func resolveMonitoredSystemTopLevelSystems(rs ReadState) TopLevelSystemResolver

 func monitoredSystemRecord(group monitoredSystemGroup) MonitoredSystemRecord {
 	resource := preferredMonitoredSystemResource(group.resources)
+	status := monitoredSystemStatus(group.resources)
 	record := MonitoredSystemRecord{
-		Name:        monitoredSystemDisplayName(group.resources, resource),
-		Type:        monitoredSystemType(resource),
-		Status:      monitoredSystemStatus(group.resources),
-		LastSeen:    monitoredSystemLastSeen(group.resources),
-		Source:      monitoredSystemSource(group.resources),
-		Explanation: normalizeMonitoredSystemGroupingExplanation(group.explanation),
+		Name:              monitoredSystemDisplayName(group.resources, resource),
+		Type:              monitoredSystemType(resource),
+		Status:            status,
+		StatusExplanation: monitoredSystemStatusExplanation(group.resources, status),
+		LastSeen:          monitoredSystemLastSeen(group.resources),
+		Source:            monitoredSystemSource(group.resources),
+		Explanation:       normalizeMonitoredSystemGroupingExplanation(group.explanation),
 	}
 	if record.Name == "" {
 		record.Name = "Unnamed system"
@@ -169,6 +191,10 @@ func monitoredSystemRecord(group monitoredSystemGroup) MonitoredSystemRecord {
 	if record.Status == "" {
 		record.Status = StatusUnknown
 	}
+	record.StatusExplanation = normalizeMonitoredSystemStatusExplanation(record.StatusExplanation)
+	if record.StatusExplanation.Summary == "" {
+		record.StatusExplanation.Summary = monitoredSystemStatusSummary(record.Status, record.StatusExplanation.Reasons)
+	}
 	if record.Source == "" {
 		record.Source = "unknown"
 	}
@@ -190,6 +216,15 @@ func normalizeMonitoredSystemGroupingExplanation(
 	return explanation
 }

+func normalizeMonitoredSystemStatusExplanation(
+	explanation MonitoredSystemStatusExplanation,
+) MonitoredSystemStatusExplanation {
+	if explanation.Reasons == nil {
+		explanation.Reasons = []MonitoredSystemStatusReason{}
+	}
+	return explanation
+}
+
 func monitoredSystemStandaloneExplanation(resources []*Resource) MonitoredSystemGroupingExplanation {
 	surfaces := monitoredSystemGroupingSurfaces(resources)
 	resource := preferredMonitoredSystemResource(resources)
@@ -412,6 +447,170 @@ func monitoredSystemStatus(resources []*Resource) ResourceStatus {
 	return best
 }

+func monitoredSystemStatusExplanation(
+	resources []*Resource,
+	status ResourceStatus,
+) MonitoredSystemStatusExplanation {
+	reasons := monitoredSystemStatusReasons(resources)
+	return normalizeMonitoredSystemStatusExplanation(MonitoredSystemStatusExplanation{
+		Summary: monitoredSystemStatusSummary(status, reasons),
+		Reasons: reasons,
+	})
+}
+
+func monitoredSystemStatusReasons(resources []*Resource) []MonitoredSystemStatusReason {
+	reasons := make([]MonitoredSystemStatusReason, 0)
+	for _, resource := range resources {
+		reasons = append(reasons, monitoredSystemResourceStatusReasons(resource)...)
+	}
+	sort.Slice(reasons, func(i, j int) bool {
+		if monitoredSystemStatusReasonPriority(reasons[i]) != monitoredSystemStatusReasonPriority(reasons[j]) {
+			return monitoredSystemStatusReasonPriority(reasons[i]) < monitoredSystemStatusReasonPriority(reasons[j])
+		}
+		if reasons[i].Name != reasons[j].Name {
+			return reasons[i].Name < reasons[j].Name
+		}
+		if reasons[i].Type != reasons[j].Type {
+			return reasons[i].Type < reasons[j].Type
+		}
+		if reasons[i].Source != reasons[j].Source {
+			return reasons[i].Source < reasons[j].Source
+		}
+		if !reasons[i].LastSeen.Equal(reasons[j].LastSeen) {
+			return reasons[i].LastSeen.Before(reasons[j].LastSeen)
+		}
+		return reasons[i].Summary < reasons[j].Summary
+	})
+	if reasons == nil {
+		return []MonitoredSystemStatusReason{}
+	}
+	return reasons
+}
+
+func monitoredSystemResourceStatusReasons(resource *Resource) []MonitoredSystemStatusReason {
+	if resource == nil {
+		return nil
+	}
+
+	name := monitoredSystemResourceDisplayName(resource)
+	if name == "" {
+		name = "Unnamed source"
+	}
+
+	resourceType := monitoredSystemType(resource)
+	if resourceType == "" {
+		resourceType = "system"
+	}
+
+	reasons := make([]MonitoredSystemStatusReason, 0)
+	if len(resource.SourceStatus) > 0 {
+		sourceKeys := make([]DataSource, 0, len(resource.SourceStatus))
+		for source := range resource.SourceStatus {
+			sourceKeys = append(sourceKeys, source)
+		}
+		sort.Slice(sourceKeys, func(i, j int) bool {
+			return sourceKeys[i] < sourceKeys[j]
+		})
+
+		for _, source := range sourceKeys {
+			sourceStatus := resource.SourceStatus[source]
+			normalizedStatus := normalizeMonitoredSystemSourceStatus(sourceStatus.Status)
+			if normalizedStatus == "online" {
+				continue
+			}
+			reasons = append(reasons, MonitoredSystemStatusReason{
+				Kind:     "source-" + normalizedStatus,
+				Name:     name,
+				Type:     resourceType,
+				Source:   string(source),
+				Status:   normalizedStatus,
+				LastSeen: sourceStatus.LastSeen,
+				Summary:  monitoredSystemSourceStatusReasonSummary(name, source, normalizedStatus, sourceStatus.LastSeen),
+			})
+		}
+	}
+
+	if len(reasons) > 0 {
+		return reasons
+	}
+
+	normalizedStatus := normalizeMonitoredSystemSourceStatus(string(resource.Status))
+	if normalizedStatus == "online" {
+		return nil
+	}
+
+	source := monitoredSystemPrimarySource(resource)
+	if source == "" {
+		source = "unknown"
+	}
+	return []MonitoredSystemStatusReason{
+		{
+			Kind:     "surface-" + normalizedStatus,
+			Name:     name,
+			Type:     resourceType,
+			Source:   source,
+			Status:   normalizedStatus,
+			LastSeen: resource.LastSeen,
+			Summary:  monitoredSystemSurfaceStatusReasonSummary(name, resourceType, source, normalizedStatus, resource.LastSeen),
+		},
+	}
+}
+
+func normalizeMonitoredSystemSourceStatus(status string) string {
+	switch strings.ToLower(strings.TrimSpace(status)) {
+	case "online":
+		return "online"
+	case "stale", "warning":
+		return "stale"
+	case "offline":
+		return "offline"
+	default:
+		return "unknown"
+	}
+}
+
+func monitoredSystemStatusSummary(status ResourceStatus, reasons []MonitoredSystemStatusReason) string {
+	switch status {
+	case StatusOnline:
+		return "All included top-level collection paths currently report online status."
+	case StatusWarning:
+		switch {
+		case monitoredSystemHasReasonStatus(reasons, "stale"):
+			return "At least one included source is stale, so Pulse marks this monitored system as warning."
+		case monitoredSystemHasReasonStatus(reasons, "offline"):
+			return "At least one included source is offline or disconnected, but the canonical grouped status currently resolves to warning."
+		default:
+			return "At least one included top-level collection path is degraded, so Pulse marks this monitored system as warning."
+		}
+	case StatusOffline:
+		return "At least one included source is offline or disconnected, so Pulse marks this monitored system as offline."
+	default:
+		return "Pulse cannot determine a canonical runtime status for this monitored system yet."
+	}
+}
+
+func monitoredSystemHasReasonStatus(reasons []MonitoredSystemStatusReason, status string) bool {
+	for _, reason := range reasons {
+		if reason.Status == status {
+			return true
+		}
+	}
+	return false
+}
+
+func monitoredSystemStatusReasonPriority(reason MonitoredSystemStatusReason) int {
+	switch reason.Status {
+	case "offline":
+		return 0
+	case "stale":
+		return 1
+	case "unknown":
+		return 2
+	default:
+		return 3
+	}
+}
+
 func monitoredSystemStatusPriority(status ResourceStatus) int {
 	switch status {
 	case StatusWarning:
@@ -427,6 +626,63 @@ func monitoredSystemStatusPriority(status ResourceStatus) int {
 	}
 }

+func monitoredSystemSourceStatusReasonSummary(
+	name string,
+	source DataSource,
+	status string,
+	lastSeen time.Time,
+) string {
+	subject := name
+	if strings.TrimSpace(subject) == "" {
+		subject = "this monitored system"
+	}
+
+	summary := monitoredSystemStatusSourceLabel(string(source)) + " data for " + subject
+	switch status {
+	case "stale":
+		summary += " is stale"
+	case "offline":
+		summary += " is offline or disconnected"
+	default:
+		summary += " does not report a canonical status yet"
+	}
+
+	if !lastSeen.IsZero() {
+		summary += " (last reported " + lastSeen.UTC().Format(time.RFC3339) + ")."
+		return summary
+	}
+	return summary + "."
+}
+
+func monitoredSystemSurfaceStatusReasonSummary(
+	name string,
+	resourceType string,
+	source string,
+	status string,
+	lastSeen time.Time,
+) string {
+	subject := name
+	if strings.TrimSpace(subject) == "" {
+		subject = "This monitored system"
+	}
+
+	summary := monitoredSystemGroupingTypeLabel(resourceType) + " view for " + subject + " currently reports "
+	switch status {
+	case "stale":
+		summary += "warning"
+	case "offline":
+		summary += "offline"
+	default:
+		summary += "unknown"
+	}
+	summary += " status from " + monitoredSystemStatusSourceLabel(source)
+	if !lastSeen.IsZero() {
+		summary += " (last reported " + lastSeen.UTC().Format(time.RFC3339) + ")."
+		return summary
+	}
+	return summary + "."
+}
+
 func monitoredSystemLastSeen(resources []*Resource) time.Time {
 	var lastSeen time.Time
 	for _, resource := range resources {
@@ -488,6 +744,29 @@ func monitoredSystemPrimarySource(resource *Resource) string {
 	return ""
 }

+func monitoredSystemStatusSourceLabel(value string) string {
+	switch strings.TrimSpace(value) {
+	case "agent":
+		return "Agent"
+	case "docker":
+		return "Docker"
+	case "kubernetes":
+		return "Kubernetes"
+	case "pbs":
+		return "PBS"
+	case "pmg":
+		return "PMG"
+	case "proxmox":
+		return "Proxmox"
+	case "truenas":
+		return "TrueNAS"
+	case "", "unknown":
+		return "Unknown source"
+	default:
+		return strings.TrimSpace(value)
+	}
+}
+
 func cloneStringSet(in map[string]struct{}) map[string]struct{} {
 	out := make(map[string]struct{}, len(in))
 	for key := range in {
@@ -261,6 +261,70 @@ func TestResourceRegistry_MonitoredSystemsSummarizeCanonicalTopLevelViews(t *tes
 	}
 }

+func TestMonitoredSystemsExplainsStaleGroupedSourceWhileLastSeenStaysFresh(t *testing.T) {
+	rr := NewRegistry(nil)
+	now := time.Date(2026, 3, 23, 12, 0, 0, 0, time.UTC)
+
+	agentResource := topLevelTestAgent("agent-host", "tower.local", "machine-1", "agent-1")
+	agentResource.LastSeen = now.Add(-5 * time.Minute)
+	dockerResource := topLevelTestDockerHost("docker-host", "tower.local", "docker-runtime-1", "agent-1")
+	dockerResource.LastSeen = now.Add(-10 * time.Second)
+
+	rr.IngestRecords(SourceAgent, []IngestRecord{
+		{
+			SourceID: "agent-host",
+			Resource: agentResource,
+		},
+	})
+	rr.IngestRecords(SourceDocker, []IngestRecord{
+		{
+			SourceID: "docker-host",
+			Resource: dockerResource,
+		},
+	})
+
+	rr.MarkStale(now, map[DataSource]time.Duration{
+		SourceAgent:  60 * time.Second,
+		SourceDocker: 60 * time.Second,
+	})
+
+	systems := MonitoredSystems(rr)
+	if len(systems) != 1 {
+		t.Fatalf("MonitoredSystems() returned %d systems, want 1", len(systems))
+	}
+
+	system := systems[0]
+	if system.Status != StatusWarning {
+		t.Fatalf("expected grouped monitored system status warning, got %+v", system)
+	}
+	if !system.LastSeen.Equal(dockerResource.LastSeen) {
+		t.Fatalf("expected grouped last_seen %s, got %s", dockerResource.LastSeen, system.LastSeen)
+	}
+	if system.StatusExplanation.Summary == "" {
+		t.Fatal("expected grouped monitored system status explanation summary")
+	}
+	if len(system.StatusExplanation.Reasons) != 1 {
+		t.Fatalf("expected one stale grouped-source reason, got %+v", system.StatusExplanation.Reasons)
+	}
+
+	reason := system.StatusExplanation.Reasons[0]
+	if reason.Kind != "source-stale" {
+		t.Fatalf("expected stale source reason kind, got %+v", reason)
+	}
+	if reason.Source != string(SourceAgent) {
+		t.Fatalf("expected agent source reason, got %+v", reason)
+	}
+	if reason.Status != "stale" {
+		t.Fatalf("expected stale reason status, got %+v", reason)
+	}
+	if !reason.LastSeen.Equal(agentResource.LastSeen) {
+		t.Fatalf("expected stale reason last_seen %s, got %s", agentResource.LastSeen, reason.LastSeen)
+	}
+	if reason.Summary == "" {
+		t.Fatalf("expected stale reason summary, got %+v", reason)
+	}
+}
+
 func TestResourceRegistry_IngestRecords_UnknownSource(t *testing.T) {
 	rr := NewRegistry(nil)
 	now := time.Date(2026, 2, 20, 12, 0, 0, 0, time.UTC)