diff --git a/docs/release-control/v6/internal/subsystems/agent-lifecycle.md b/docs/release-control/v6/internal/subsystems/agent-lifecycle.md
index c42b86013..f39d58d4c 100644
--- a/docs/release-control/v6/internal/subsystems/agent-lifecycle.md
+++ b/docs/release-control/v6/internal/subsystems/agent-lifecycle.md
@@ -234,6 +234,10 @@
 That same ledger read now also carries backend-owned status explanation copy,
 and lifecycle-adjacent details must render it beside the counting rationale so
 operators can interpret warning, offline, and unknown states without inventing
 local status semantics.
+Those status details are now structured as well: lifecycle-adjacent consumers
+must preserve the canonical reason list from the ledger read so operators can
+see which grouped source or surface degraded and when it last reported,
+instead of only seeing a generic warning/offline paragraph.
 Lifecycle-adjacent workspace copy must also keep the same commercial framing:
 infrastructure operations may point operators to Pulse Pro for billing, but it
 must describe that boundary in monitored-system, plan-limit, and license-status
diff --git a/docs/release-control/v6/internal/subsystems/api-contracts.md b/docs/release-control/v6/internal/subsystems/api-contracts.md
index 523dde46b..0b83bb09d 100644
--- a/docs/release-control/v6/internal/subsystems/api-contracts.md
+++ b/docs/release-control/v6/internal/subsystems/api-contracts.md
@@ -243,6 +243,11 @@
 That same contract now also owns the backend-authored status explanation
 paired with that enum, and the monitored-system ledger details surface must
 render it alongside the counting explanation instead of inventing page-local
 wording for what online, warning, offline, or unknown means.
+That nested status explanation is now a structured contract, not summary-only
+copy: `/api/license/monitored-system-ledger` must preserve the canonical
+summary plus the ordered reason list from unified resources, including the
+degraded source or surface, its status, and its last-seen timestamp, so mixed
+fresh/stale grouped systems remain explainable through one governed API shape.
 That client contract must also fail closed when older or partial payloads omit
 the nested explanation object: the frontend may normalize missing explanation
 fields to empty reasons/surfaces plus a safe default summary, but it must not
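For concreteness, a single warning entry under this contract would look roughly like the payload sketch below. The values mirror the contract-test fixtures later in this diff; the sibling counting `explanation` object and the `total`/`limit` fields are elided here.

```json
{
  "systems": [
    {
      "name": "Tower",
      "type": "host",
      "status": "warning",
      "status_explanation": {
        "summary": "At least one included source is stale, so Pulse marks this monitored system as warning.",
        "reasons": [
          {
            "kind": "source-stale",
            "name": "Tower",
            "type": "host",
            "source": "agent",
            "status": "stale",
            "last_seen": "2026-03-18T17:25:00Z",
            "summary": "Agent data for Tower is stale (last reported 2026-03-18T17:25:00Z)."
          }
        ]
      },
      "last_seen": "2026-03-18T17:30:00Z",
      "source": "agent"
    }
  ]
}
```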
diff --git a/docs/release-control/v6/internal/subsystems/cloud-paid.md b/docs/release-control/v6/internal/subsystems/cloud-paid.md
index 4b003ca35..1e6613835 100644
--- a/docs/release-control/v6/internal/subsystems/cloud-paid.md
+++ b/docs/release-control/v6/internal/subsystems/cloud-paid.md
@@ -215,6 +215,10 @@
 view may normalize a safe default when that field is absent during
 mixed-version rollouts, but it must render the canonical backend explanation
 when present instead of inventing page-local wording for what warning,
 offline, or unknown means on a counted monitored system.
+That same cloud-paid surface must now also render the canonical status reason
+list when present, so customers can see exactly which grouped source or
+top-level surface degraded and when it last reported rather than only reading
+generic status copy beside a fresh aggregate `Last Seen` value.
 Frontend billing/admin surfaces must not synthesize `plan_version` from
 subscription lifecycle state. When a hosted billing record lacks a plan label,
 the UI must preserve that absence instead of fabricating values like `active`
diff --git a/docs/release-control/v6/internal/subsystems/storage-recovery.md b/docs/release-control/v6/internal/subsystems/storage-recovery.md
index f114cfaaa..5ccf77df6 100644
--- a/docs/release-control/v6/internal/subsystems/storage-recovery.md
+++ b/docs/release-control/v6/internal/subsystems/storage-recovery.md
@@ -370,6 +370,12 @@
 storage-adjacent API wiring may consume the canonical monitored-system ledger
 and monitored-system cap helpers, but it must not revive deleted agent-era
 helper names or imply that API-backed infrastructure sits outside the counted
 system model.
+That same shared `internal/api/` dependency now also assumes monitored-system
+ledger status details stay canonical and source-aware: storage- or recovery-
+adjacent consumers may read the ledger's nested status explanation, but they
+must preserve the backend-provided reason list for stale or offline grouped
+sources instead of reducing those mixed fresh/stale system states back to a
+generic label.
 That same shared `internal/api/` dependency now also assumes self-hosted
 commercial counting is canonical at the top-level monitored-system boundary:
 shared setup, deploy, entitlement, and API-backed monitoring helpers may not
diff --git a/docs/release-control/v6/internal/subsystems/unified-resources.md b/docs/release-control/v6/internal/subsystems/unified-resources.md
index a24dd0336..a9489e649 100644
--- a/docs/release-control/v6/internal/subsystems/unified-resources.md
+++ b/docs/release-control/v6/internal/subsystems/unified-resources.md
@@ -171,6 +171,14 @@
 grouping reasons plus included top-level surfaces, and fall back to an
 explicit standalone explanation when no cross-source merge occurred. Support
 and billing surfaces must consume that shared explanation contract instead of
 reconstructing count reasons from API-local heuristics.
+That same monitored-system contract now also owns canonical runtime-status
+explanations. When a grouped monitored system resolves to warning, offline, or
+unknown, unified resources must expose the shared summary plus structured
+degraded-status reasons derived from the grouped top-level resources and their
+source freshness state, including which source or surface degraded and the
+corresponding last-seen timestamp. Billing and support surfaces must consume
+that shared reason list instead of trying to infer why a fresh overall
+`last_seen` can still coincide with warning status.
 The unified-resource runtime now also owns the durable change timeline for the
 canonical resource view. `internal/unifiedresources/monitor_adapter.go` feeds
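The consumer-side rules in these doc sections boil down to one invariant: pass the reason slice through unchanged. A minimal sketch of what that looks like, assuming in-module code (the `internal/api` package is not importable from outside the pulse-go-rewrite module) and a hypothetical `renderMonitoredSystemStatus` helper; only the `api.MonitoredSystemLedger*` types come from this diff.

```go
package main

import (
	"fmt"

	"github.com/rcourtman/pulse-go-rewrite/internal/api"
)

// renderMonitoredSystemStatus is illustrative, not shipped code. It keeps the
// canonical summary and every backend-provided reason intact instead of
// collapsing mixed fresh/stale grouped sources into one generic label.
func renderMonitoredSystemStatus(entry api.MonitoredSystemLedgerEntry) []string {
	lines := []string{entry.StatusExplanation.Summary}
	for _, reason := range entry.StatusExplanation.Reasons {
		lines = append(lines, fmt.Sprintf("- %s [%s/%s, last seen %s]",
			reason.Summary, reason.Source, reason.Status, reason.LastSeen))
	}
	return lines
}

func main() {
	// Fixture values copied from the tests later in this diff.
	entry := api.MonitoredSystemLedgerEntry{
		Name:   "Tower",
		Status: "warning",
		StatusExplanation: api.MonitoredSystemLedgerStatusExplanation{
			Summary: "At least one included source is stale, so Pulse marks this monitored system as warning.",
			Reasons: []api.MonitoredSystemLedgerStatusReason{{
				Kind:     "source-stale",
				Name:     "Tower",
				Type:     "host",
				Source:   "agent",
				Status:   "stale",
				LastSeen: "2026-03-23T11:55:00Z",
				Summary:  "Agent data for Tower is stale (last reported 2026-03-23T11:55:00Z).",
			}},
		},
	}
	for _, line := range renderMonitoredSystemStatus(entry) {
		fmt.Println(line)
	}
}
```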
diff --git a/frontend-modern/src/api/__tests__/monitoredSystemLedger.test.ts b/frontend-modern/src/api/__tests__/monitoredSystemLedger.test.ts
index 61a96f434..300d942ac 100644
--- a/frontend-modern/src/api/__tests__/monitoredSystemLedger.test.ts
+++ b/frontend-modern/src/api/__tests__/monitoredSystemLedger.test.ts
@@ -38,6 +38,7 @@ describe('MonitoredSystemLedgerAPI', () => {
           status: 'online',
           status_explanation: {
             summary: 'All included top-level collection paths currently report online status.',
+            reasons: [],
           },
           last_seen: '2026-01-01T00:00:00Z',
           source: 'agent',
@@ -63,6 +64,7 @@
     expect(result.systems[0]?.explanation.summary).toContain('Counts as one monitored system');
     expect(result.systems[0]?.status_explanation?.summary).toContain('currently report online');
+    expect(result.systems[0]?.status_explanation?.reasons).toEqual([]);
     expect(result.systems[0]?.explanation.reasons).toHaveLength(1);
     expect(result.systems[0]?.explanation.surfaces).toHaveLength(1);
   });
@@ -86,6 +88,7 @@
     expect(result.systems[0]?.explanation.summary).toContain('counts this top-level collection path');
     expect(result.systems[0]?.status_explanation?.summary).toContain('currently report online');
+    expect(result.systems[0]?.status_explanation?.reasons).toEqual([]);
     expect(result.systems[0]?.explanation.reasons).toEqual([]);
     expect(result.systems[0]?.explanation.surfaces).toEqual([]);
   });
@@ -110,6 +113,50 @@
     expect(result.systems[0]?.status).toBe('warning');
   });
 
+  it('preserves canonical status explanation reasons from the API contract', async () => {
+    vi.mocked(apiFetchJSON).mockResolvedValueOnce({
+      systems: [
+        {
+          name: 'Tower',
+          type: 'host',
+          status: 'warning',
+          status_explanation: {
+            summary: 'At least one included source is stale, so Pulse marks this monitored system as warning.',
+            reasons: [
+              {
+                kind: 'source-stale',
+                name: 'Tower',
+                type: 'host',
+                source: 'agent',
+                status: 'stale',
+                last_seen: '2026-03-23T11:55:00Z',
+                summary: 'Agent data for Tower is stale (last reported 2026-03-23T11:55:00Z).',
+              },
+            ],
+          },
+          last_seen: '2026-03-23T11:59:50Z',
+          source: 'multiple',
+        },
+      ],
+      total: 1,
+      limit: 5,
+    });
+
+    const result = await MonitoredSystemLedgerAPI.getLedger();
+
+    expect(result.systems[0]?.status_explanation?.reasons).toEqual([
+      {
+        kind: 'source-stale',
+        name: 'Tower',
+        type: 'host',
+        source: 'agent',
+        status: 'stale',
+        last_seen: '2026-03-23T11:55:00Z',
+        summary: 'Agent data for Tower is stale (last reported 2026-03-23T11:55:00Z).',
+      },
+    ]);
+  });
+
   it('fails closed to unknown for unsupported status values', async () => {
     vi.mocked(apiFetchJSON).mockResolvedValueOnce({
       systems: [
diff --git a/frontend-modern/src/api/monitoredSystemLedger.ts b/frontend-modern/src/api/monitoredSystemLedger.ts
index 634f1c271..3c2bbe1fb 100644
--- a/frontend-modern/src/api/monitoredSystemLedger.ts
+++ b/frontend-modern/src/api/monitoredSystemLedger.ts
@@ -22,6 +22,23 @@ export interface MonitoredSystemLedgerExplanation {
 
 export interface MonitoredSystemLedgerStatusExplanation {
   summary: string;
+  reasons: MonitoredSystemLedgerStatusReason[];
+}
+
+export type MonitoredSystemLedgerStatusReasonStatus =
+  | 'online'
+  | 'stale'
+  | 'offline'
+  | 'unknown';
+
+export interface MonitoredSystemLedgerStatusReason {
+  kind: string;
+  name: string;
+  type: string;
+  source: string;
+  status: MonitoredSystemLedgerStatusReasonStatus;
+  last_seen: string;
+  summary: string;
 }
 
 export interface MonitoredSystemLedgerEntry {
@@ -62,6 +79,7 @@ function normalizeMonitoredSystemLedgerEntry(
     status,
     status_explanation: {
       summary: entry.status_explanation?.summary ?? defaultMonitoredSystemStatusExplanation(status),
+      reasons: (entry.status_explanation?.reasons ?? []).map(normalizeMonitoredSystemLedgerStatusReason),
     },
     explanation: {
       summary:
@@ -92,10 +110,35 @@ function defaultMonitoredSystemStatusExplanation(status: MonitoredSystemLedgerSt
     case 'online':
       return 'All included top-level collection paths currently report online status.';
     case 'warning':
-      return 'At least one included top-level collection path is degraded or stale.';
+      return 'At least one included top-level collection path is degraded, so Pulse marks this monitored system as warning.';
     case 'offline':
-      return 'At least one included top-level collection path is offline or disconnected.';
+      return 'At least one included source is offline or disconnected, so Pulse marks this monitored system as offline.';
     default:
       return 'Pulse cannot determine a canonical runtime status for this monitored system yet.';
   }
 }
+
+function normalizeMonitoredSystemLedgerStatusReason(
+  reason: MonitoredSystemLedgerStatusReason,
+): MonitoredSystemLedgerStatusReason {
+  return {
+    ...reason,
+    status: normalizeMonitoredSystemLedgerStatusReasonStatus(reason.status),
+    last_seen: reason.last_seen ?? '',
+  };
+}
+
+function normalizeMonitoredSystemLedgerStatusReasonStatus(
+  status: MonitoredSystemLedgerStatusReasonStatus | string | null | undefined,
+): MonitoredSystemLedgerStatusReasonStatus {
+  const normalized = (status ?? '').trim().toLowerCase();
+  switch (normalized) {
+    case 'online':
+    case 'stale':
+    case 'offline':
+    case 'unknown':
+      return normalized;
+    default:
+      return 'unknown';
+  }
+}
diff --git a/frontend-modern/src/components/Settings/MonitoredSystemLedgerPanel.tsx b/frontend-modern/src/components/Settings/MonitoredSystemLedgerPanel.tsx
index aef85fa29..60591f20f 100644
--- a/frontend-modern/src/components/Settings/MonitoredSystemLedgerPanel.tsx
+++ b/frontend-modern/src/components/Settings/MonitoredSystemLedgerPanel.tsx
@@ -51,6 +51,7 @@ function systemStatusExplanation(system: MonitoredSystemLedgerEntry): MonitoredS
     summary:
       system.status_explanation?.summary ??
       'Pulse cannot determine a canonical runtime status for this monitored system yet.',
+    reasons: system.status_explanation?.reasons ?? [],
   };
 }
 
@@ -179,6 +180,13 @@ export function MonitoredSystemLedgerPanel(props: MonitoredSystemLedgerPanelProp
           {statusExplanation.summary}
+        <Show when={statusExplanation.reasons.length > 0}>
+          <ul>
+            <For each={statusExplanation.reasons}>
+              {(reason) => <li>{reason.summary}</li>}
+            </For>
+          </ul>
+        </Show>
           {explanation.summary}
diff --git a/frontend-modern/src/components/Settings/__tests__/MonitoredSystemLedgerPanel.test.tsx b/frontend-modern/src/components/Settings/__tests__/MonitoredSystemLedgerPanel.test.tsx
index 4818607a3..16f1613b9 100644
--- a/frontend-modern/src/components/Settings/__tests__/MonitoredSystemLedgerPanel.test.tsx
+++ b/frontend-modern/src/components/Settings/__tests__/MonitoredSystemLedgerPanel.test.tsx
@@ -77,6 +77,7 @@ describe('MonitoredSystemLedgerPanel', () => {
         status: 'online',
         status_explanation: {
           summary: 'All included top-level collection paths currently report online status.',
+          reasons: [],
         },
         last_seen: '2026-01-01T00:00:00Z',
         source: 'agent',
@@ -139,6 +140,7 @@
         status: 'online',
         status_explanation: {
           summary: 'All included top-level collection paths currently report online status.',
+          reasons: [],
         },
         last_seen: '2026-01-01T00:00:00Z',
         source: 'agent',
@@ -161,7 +163,19 @@
         status: 'offline',
         status_explanation: {
           summary:
-            'At least one included top-level collection path is offline or disconnected, so Pulse marks this monitored system as offline.',
+            'At least one included source is offline or disconnected, so Pulse marks this monitored system as offline.',
+          reasons: [
+            {
+              kind: 'source-offline',
+              name: 'server-b',
+              type: 'pbs-server',
+              source: 'pbs',
+              status: 'offline',
+              last_seen: '2026-01-01T23:55:00Z',
+              summary:
+                'PBS data for server-b is offline or disconnected (last reported 2026-01-01T23:55:00Z).',
+            },
+          ],
         },
         last_seen: '2026-01-02T00:00:00Z',
         source: 'pbs',
@@ -220,7 +234,12 @@
     expect(screen.getByText('Current status')).toBeInTheDocument();
     expect(
       screen.getByText(
-        'At least one included top-level collection path is offline or disconnected, so Pulse marks this monitored system as offline.',
+        'At least one included source is offline or disconnected, so Pulse marks this monitored system as offline.',
       ),
     ).toBeInTheDocument();
+    expect(
+      screen.getByText(
+        'PBS data for server-b is offline or disconnected (last reported 2026-01-01T23:55:00Z).',
+      ),
+    ).toBeInTheDocument();
     expect(screen.getByText('Included collection paths')).toBeInTheDocument();
@@ -239,6 +258,7 @@
         status: 'online',
         status_explanation: {
           summary: 'All included top-level collection paths currently report online status.',
+          reasons: [],
         },
         last_seen: '2026-01-01T00:00:00Z',
         source: 'agent',
diff --git a/internal/api/contract_test.go b/internal/api/contract_test.go
index a30d26e71..5125e1f92 100644
--- a/internal/api/contract_test.go
+++ b/internal/api/contract_test.go
@@ -581,7 +581,18 @@ func TestContract_MonitoredSystemLedgerJSONSnapshot(t *testing.T) {
				Type:   "host",
				Status: "warning",
				StatusExplanation: MonitoredSystemLedgerStatusExplanation{
-					Summary: "At least one included top-level collection path is degraded or stale, so Pulse marks this monitored system as warning.",
+					Summary: "At least one included source is stale, so Pulse marks this monitored system as warning.",
+					Reasons: []MonitoredSystemLedgerStatusReason{
+						{
+							Kind:     "source-stale",
+							Name:     "Tower",
+							Type:     "host",
+							Source:   "agent",
+							Status:   "stale",
+							LastSeen: "2026-03-18T17:25:00Z",
+							Summary:  "Agent data for Tower is stale (last reported 2026-03-18T17:25:00Z).",
+						},
+					},
				},
				LastSeen: "2026-03-18T17:30:00Z",
				Source:   "agent",
@@ -620,7 +631,18 @@ func TestContract_MonitoredSystemLedgerJSONSnapshot(t *testing.T) {
				"type":"host",
				"status":"warning",
				"status_explanation":{
-					"summary":"At least one included top-level collection path is degraded or stale, so Pulse marks this monitored system as warning."
+					"summary":"At least one included source is stale, so Pulse marks this monitored system as warning.",
+					"reasons":[
+						{
+							"kind":"source-stale",
+							"name":"Tower",
+							"type":"host",
+							"source":"agent",
+							"status":"stale",
+							"last_seen":"2026-03-18T17:25:00Z",
+							"summary":"Agent data for Tower is stale (last reported 2026-03-18T17:25:00Z)."
+						}
+					]
				},
				"last_seen":"2026-03-18T17:30:00Z",
				"source":"agent",
report online status." case "warning": - return MonitoredSystemLedgerStatusExplanation{ - Summary: "At least one included top-level collection path is degraded or stale, so Pulse marks this monitored system as warning.", - } + return "At least one included top-level collection path is degraded, so Pulse marks this monitored system as warning." case "offline": - return MonitoredSystemLedgerStatusExplanation{ - Summary: "At least one included top-level collection path is offline or disconnected, so Pulse marks this monitored system as offline.", - } + return "At least one included source is offline or disconnected, so Pulse marks this monitored system as offline." default: - return MonitoredSystemLedgerStatusExplanation{ - Summary: "Pulse cannot determine a canonical runtime status for this monitored system yet.", - } + return "Pulse cannot determine a canonical runtime status for this monitored system yet." + } +} + +func normalizeMonitoredSystemLedgerReasonStatus(status string) string { + switch status { + case "online", "stale", "offline", "unknown": + return status + default: + return "unknown" } } diff --git a/internal/api/monitored_system_ledger_test.go b/internal/api/monitored_system_ledger_test.go index e8d560dd6..5712e9dfb 100644 --- a/internal/api/monitored_system_ledger_test.go +++ b/internal/api/monitored_system_ledger_test.go @@ -6,6 +6,8 @@ import ( "net/http/httptest" "testing" "time" + + "github.com/rcourtman/pulse-go-rewrite/internal/unifiedresources" ) func TestMonitoredSystemLedgerEntryTypes(t *testing.T) { @@ -15,6 +17,7 @@ func TestMonitoredSystemLedgerEntryTypes(t *testing.T) { Status: "online", StatusExplanation: MonitoredSystemLedgerStatusExplanation{ Summary: "All included top-level collection paths currently report online status.", + Reasons: []MonitoredSystemLedgerStatusReason{}, }, LastSeen: "2025-01-01T00:00:00Z", Source: "agent", @@ -42,6 +45,9 @@ func TestMonitoredSystemLedgerEntryTypes(t *testing.T) { if decoded.StatusExplanation.Summary == "" { t.Errorf("status explanation mismatch: %+v", decoded.StatusExplanation) } + if decoded.StatusExplanation.Reasons == nil { + t.Errorf("status explanation reasons mismatch: %+v", decoded.StatusExplanation) + } if decoded.Source != "agent" { t.Errorf("source mismatch: got %q", decoded.Source) } @@ -85,21 +91,31 @@ func TestFormatLastSeen(t *testing.T) { } func TestMonitoredSystemLedgerStatusExplanation(t *testing.T) { - tests := []struct { - status string - want string - }{ - {"online", "All included top-level collection paths currently report online status."}, - {"warning", "At least one included top-level collection path is degraded or stale, so Pulse marks this monitored system as warning."}, - {"offline", "At least one included top-level collection path is offline or disconnected, so Pulse marks this monitored system as offline."}, - {"unknown", "Pulse cannot determine a canonical runtime status for this monitored system yet."}, + got := monitoredSystemLedgerStatusExplanation(unifiedresources.MonitoredSystemStatusExplanation{ + Summary: "At least one included source is stale, so Pulse marks this monitored system as warning.", + Reasons: []unifiedresources.MonitoredSystemStatusReason{ + { + Kind: "source-stale", + Name: "Tower", + Type: "host", + Source: "agent", + Status: "stale", + LastSeen: time.Date(2026, 3, 23, 11, 55, 0, 0, time.UTC), + Summary: "Agent data for Tower is stale (last reported 2026-03-23T11:55:00Z).", + }, + }, + }, "warning") + if got.Summary != "At least one included source is stale, so Pulse 
diff --git a/internal/api/monitored_system_ledger_test.go b/internal/api/monitored_system_ledger_test.go
index e8d560dd6..5712e9dfb 100644
--- a/internal/api/monitored_system_ledger_test.go
+++ b/internal/api/monitored_system_ledger_test.go
@@ -6,6 +6,8 @@ import (
	"net/http/httptest"
	"testing"
	"time"
+
+	"github.com/rcourtman/pulse-go-rewrite/internal/unifiedresources"
 )
 
 func TestMonitoredSystemLedgerEntryTypes(t *testing.T) {
@@ -15,6 +17,7 @@ func TestMonitoredSystemLedgerEntryTypes(t *testing.T) {
		Status: "online",
		StatusExplanation: MonitoredSystemLedgerStatusExplanation{
			Summary: "All included top-level collection paths currently report online status.",
+			Reasons: []MonitoredSystemLedgerStatusReason{},
		},
		LastSeen: "2025-01-01T00:00:00Z",
		Source:   "agent",
@@ -42,6 +45,9 @@ func TestMonitoredSystemLedgerEntryTypes(t *testing.T) {
	if decoded.StatusExplanation.Summary == "" {
		t.Errorf("status explanation mismatch: %+v", decoded.StatusExplanation)
	}
+	if decoded.StatusExplanation.Reasons == nil {
+		t.Errorf("status explanation reasons mismatch: %+v", decoded.StatusExplanation)
+	}
	if decoded.Source != "agent" {
		t.Errorf("source mismatch: got %q", decoded.Source)
	}
@@ -85,21 +91,31 @@ func TestFormatLastSeen(t *testing.T) {
 }
 
 func TestMonitoredSystemLedgerStatusExplanation(t *testing.T) {
-	tests := []struct {
-		status string
-		want   string
-	}{
-		{"online", "All included top-level collection paths currently report online status."},
-		{"warning", "At least one included top-level collection path is degraded or stale, so Pulse marks this monitored system as warning."},
-		{"offline", "At least one included top-level collection path is offline or disconnected, so Pulse marks this monitored system as offline."},
-		{"unknown", "Pulse cannot determine a canonical runtime status for this monitored system yet."},
+	got := monitoredSystemLedgerStatusExplanation(unifiedresources.MonitoredSystemStatusExplanation{
+		Summary: "At least one included source is stale, so Pulse marks this monitored system as warning.",
+		Reasons: []unifiedresources.MonitoredSystemStatusReason{
+			{
+				Kind:     "source-stale",
+				Name:     "Tower",
+				Type:     "host",
+				Source:   "agent",
+				Status:   "stale",
+				LastSeen: time.Date(2026, 3, 23, 11, 55, 0, 0, time.UTC),
+				Summary:  "Agent data for Tower is stale (last reported 2026-03-23T11:55:00Z).",
+			},
+		},
+	}, "warning")
+	if got.Summary != "At least one included source is stale, so Pulse marks this monitored system as warning." {
+		t.Fatalf("unexpected status summary: %+v", got)
	}
-
-	for _, tt := range tests {
-		got := monitoredSystemLedgerStatusExplanation(tt.status)
-		if got.Summary != tt.want {
-			t.Errorf("monitoredSystemLedgerStatusExplanation(%q) = %q, want %q", tt.status, got.Summary, tt.want)
-		}
+	if len(got.Reasons) != 1 {
+		t.Fatalf("expected one status reason, got %+v", got)
+	}
+	if got.Reasons[0].Status != "stale" {
+		t.Fatalf("expected stale status reason, got %+v", got.Reasons[0])
+	}
+	if got.Reasons[0].LastSeen != "2026-03-23T11:55:00Z" {
+		t.Fatalf("expected formatted reason last_seen, got %+v", got.Reasons[0])
	}
 }
 
@@ -141,11 +157,17 @@ func TestMonitoredSystemLedgerNilSystemsBecomesEmptyArray(t *testing.T) {
 func TestMonitoredSystemLedgerEntryNormalizeCollections(t *testing.T) {
	entry := MonitoredSystemLedgerEntry{
		Name: "server-1",
+		StatusExplanation: MonitoredSystemLedgerStatusExplanation{
+			Summary: "Pulse cannot determine a canonical runtime status for this monitored system yet.",
+		},
		Explanation: MonitoredSystemLedgerExplanation{
			Summary: "Counts as one monitored system because Pulse sees one top-level host view from agent.",
		},
	}.NormalizeCollections()
 
+	if entry.StatusExplanation.Reasons == nil {
+		t.Fatal("expected status explanation reasons to normalize to an empty slice")
+	}
	if entry.Explanation.Reasons == nil {
		t.Fatal("expected explanation reasons to normalize to an empty slice")
	}
@@ -166,6 +188,7 @@ func TestHandleMonitoredSystemLedgerHTTP(t *testing.T) {
		Status: "online",
		StatusExplanation: MonitoredSystemLedgerStatusExplanation{
			Summary: "All included top-level collection paths currently report online status.",
+			Reasons: []MonitoredSystemLedgerStatusReason{},
		},
		LastSeen: "2025-01-01T00:00:00Z",
		Source:   "agent",
@@ -206,6 +229,9 @@ func TestHandleMonitoredSystemLedgerHTTP(t *testing.T) {
	if decoded.Systems[0].StatusExplanation.Summary == "" {
		t.Errorf("expected status explanation summary, got %+v", decoded.Systems[0].StatusExplanation)
	}
+	if decoded.Systems[0].StatusExplanation.Reasons == nil {
+		t.Errorf("expected status explanation reasons, got %+v", decoded.Systems[0].StatusExplanation)
+	}
	if decoded.Systems[0].Explanation.Summary == "" {
		t.Errorf("expected explanation summary, got %+v", decoded.Systems[0].Explanation)
	}
diff --git a/internal/unifiedresources/monitored_systems.go b/internal/unifiedresources/monitored_systems.go
index ade72dc6f..75f2d971c 100644
--- a/internal/unifiedresources/monitored_systems.go
+++ b/internal/unifiedresources/monitored_systems.go
@@ -42,15 +42,35 @@ type MonitoredSystemGroupingSurface struct {
	Source string
 }
 
+// MonitoredSystemStatusExplanation explains why Pulse chose the canonical
+// monitored-system runtime status.
+type MonitoredSystemStatusExplanation struct {
+	Summary string
+	Reasons []MonitoredSystemStatusReason
+}
+
+// MonitoredSystemStatusReason captures one canonical degraded-status signal
+// that contributed to the monitored-system runtime status.
+type MonitoredSystemStatusReason struct {
+	Kind     string
+	Name     string
+	Type     string
+	Source   string
+	Status   string
+	LastSeen time.Time
+	Summary  string
+}
+
 // MonitoredSystemRecord describes a counted top-level monitored system after
 // canonical cross-view deduplication.
 type MonitoredSystemRecord struct {
-	Name        string
-	Type        string
-	Status      ResourceStatus
-	LastSeen    time.Time
-	Source      string
-	Explanation MonitoredSystemGroupingExplanation
+	Name              string
+	Type              string
+	Status            ResourceStatus
+	StatusExplanation MonitoredSystemStatusExplanation
+	LastSeen          time.Time
+	Source            string
+	Explanation       MonitoredSystemGroupingExplanation
 }
 
 // MonitoredSystemCount returns the number of top-level monitored systems after
@@ -152,13 +172,15 @@ func resolveMonitoredSystemTopLevelSystems(rs ReadState) TopLevelSystemResolver
 
 func monitoredSystemRecord(group monitoredSystemGroup) MonitoredSystemRecord {
	resource := preferredMonitoredSystemResource(group.resources)
+	status := monitoredSystemStatus(group.resources)
	record := MonitoredSystemRecord{
-		Name:        monitoredSystemDisplayName(group.resources, resource),
-		Type:        monitoredSystemType(resource),
-		Status:      monitoredSystemStatus(group.resources),
-		LastSeen:    monitoredSystemLastSeen(group.resources),
-		Source:      monitoredSystemSource(group.resources),
-		Explanation: normalizeMonitoredSystemGroupingExplanation(group.explanation),
+		Name:              monitoredSystemDisplayName(group.resources, resource),
+		Type:              monitoredSystemType(resource),
+		Status:            status,
+		StatusExplanation: monitoredSystemStatusExplanation(group.resources, status),
+		LastSeen:          monitoredSystemLastSeen(group.resources),
+		Source:            monitoredSystemSource(group.resources),
+		Explanation:       normalizeMonitoredSystemGroupingExplanation(group.explanation),
	}
	if record.Name == "" {
		record.Name = "Unnamed system"
@@ -169,6 +191,10 @@ func monitoredSystemRecord(group monitoredSystemGroup) MonitoredSystemRecord {
	if record.Status == "" {
		record.Status = StatusUnknown
	}
+	record.StatusExplanation = normalizeMonitoredSystemStatusExplanation(record.StatusExplanation)
+	if record.StatusExplanation.Summary == "" {
+		record.StatusExplanation.Summary = monitoredSystemStatusSummary(record.Status, record.StatusExplanation.Reasons)
+	}
	if record.Source == "" {
		record.Source = "unknown"
	}
@@ -190,6 +216,15 @@ func normalizeMonitoredSystemGroupingExplanation(
	return explanation
 }
 
+func normalizeMonitoredSystemStatusExplanation(
+	explanation MonitoredSystemStatusExplanation,
+) MonitoredSystemStatusExplanation {
+	if explanation.Reasons == nil {
+		explanation.Reasons = []MonitoredSystemStatusReason{}
+	}
+	return explanation
+}
+
 func monitoredSystemStandaloneExplanation(resources []*Resource) MonitoredSystemGroupingExplanation {
	surfaces := monitoredSystemGroupingSurfaces(resources)
	resource := preferredMonitoredSystemResource(resources)
@@ -412,6 +447,170 @@ func monitoredSystemStatus(resources []*Resource) ResourceStatus {
	return best
 }
 
+func monitoredSystemStatusExplanation(
+	resources []*Resource,
+	status ResourceStatus,
+) MonitoredSystemStatusExplanation {
+	reasons := monitoredSystemStatusReasons(resources)
+	return normalizeMonitoredSystemStatusExplanation(MonitoredSystemStatusExplanation{
+		Summary: monitoredSystemStatusSummary(status, reasons),
+		Reasons: reasons,
+	})
+}
+
+func monitoredSystemStatusReasons(resources []*Resource) []MonitoredSystemStatusReason {
+	reasons := make([]MonitoredSystemStatusReason, 0)
+	for _, resource := range resources {
+		reasons = append(reasons, monitoredSystemResourceStatusReasons(resource)...)
+	}
+	sort.Slice(reasons, func(i, j int) bool {
+		if monitoredSystemStatusReasonPriority(reasons[i]) != monitoredSystemStatusReasonPriority(reasons[j]) {
+			return monitoredSystemStatusReasonPriority(reasons[i]) < monitoredSystemStatusReasonPriority(reasons[j])
+		}
+		if reasons[i].Name != reasons[j].Name {
+			return reasons[i].Name < reasons[j].Name
+		}
+		if reasons[i].Type != reasons[j].Type {
+			return reasons[i].Type < reasons[j].Type
+		}
+		if reasons[i].Source != reasons[j].Source {
+			return reasons[i].Source < reasons[j].Source
+		}
+		if !reasons[i].LastSeen.Equal(reasons[j].LastSeen) {
+			return reasons[i].LastSeen.Before(reasons[j].LastSeen)
+		}
+		return reasons[i].Summary < reasons[j].Summary
+	})
+	if reasons == nil {
+		return []MonitoredSystemStatusReason{}
+	}
+	return reasons
+}
+
+func monitoredSystemResourceStatusReasons(resource *Resource) []MonitoredSystemStatusReason {
+	if resource == nil {
+		return nil
+	}
+
+	name := monitoredSystemResourceDisplayName(resource)
+	if name == "" {
+		name = "Unnamed source"
+	}
+
+	resourceType := monitoredSystemType(resource)
+	if resourceType == "" {
+		resourceType = "system"
+	}
+
+	reasons := make([]MonitoredSystemStatusReason, 0)
+	if len(resource.SourceStatus) > 0 {
+		sourceKeys := make([]DataSource, 0, len(resource.SourceStatus))
+		for source := range resource.SourceStatus {
+			sourceKeys = append(sourceKeys, source)
+		}
+		sort.Slice(sourceKeys, func(i, j int) bool {
+			return sourceKeys[i] < sourceKeys[j]
+		})
+
+		for _, source := range sourceKeys {
+			sourceStatus := resource.SourceStatus[source]
+			normalizedStatus := normalizeMonitoredSystemSourceStatus(sourceStatus.Status)
+			if normalizedStatus == "online" {
+				continue
+			}
+			reasons = append(reasons, MonitoredSystemStatusReason{
+				Kind:     "source-" + normalizedStatus,
+				Name:     name,
+				Type:     resourceType,
+				Source:   string(source),
+				Status:   normalizedStatus,
+				LastSeen: sourceStatus.LastSeen,
+				Summary:  monitoredSystemSourceStatusReasonSummary(name, source, normalizedStatus, sourceStatus.LastSeen),
+			})
+		}
+	}
+
+	if len(reasons) > 0 {
+		return reasons
+	}
+
+	normalizedStatus := normalizeMonitoredSystemSourceStatus(string(resource.Status))
+	if normalizedStatus == "online" {
+		return nil
+	}
+
+	source := monitoredSystemPrimarySource(resource)
+	if source == "" {
+		source = "unknown"
+	}
+	return []MonitoredSystemStatusReason{
+		{
+			Kind:     "surface-" + normalizedStatus,
+			Name:     name,
+			Type:     resourceType,
+			Source:   source,
+			Status:   normalizedStatus,
+			LastSeen: resource.LastSeen,
+			Summary:  monitoredSystemSurfaceStatusReasonSummary(name, resourceType, source, normalizedStatus, resource.LastSeen),
+		},
+	}
+}
+
+func normalizeMonitoredSystemSourceStatus(status string) string {
+	switch strings.ToLower(strings.TrimSpace(status)) {
+	case "online":
+		return "online"
+	case "stale", "warning":
+		return "stale"
+	case "offline":
+		return "offline"
+	default:
+		return "unknown"
+	}
+}
+
+func monitoredSystemStatusSummary(status ResourceStatus, reasons []MonitoredSystemStatusReason) string {
+	switch status {
+	case StatusOnline:
+		return "All included top-level collection paths currently report online status."
+	case StatusWarning:
+		switch {
+		case monitoredSystemHasReasonStatus(reasons, "stale"):
+			return "At least one included source is stale, so Pulse marks this monitored system as warning."
+		case monitoredSystemHasReasonStatus(reasons, "offline"):
+			return "At least one included source is offline or disconnected, but the canonical grouped status currently resolves to warning."
+		default:
+			return "At least one included top-level collection path is degraded, so Pulse marks this monitored system as warning."
+		}
+	case StatusOffline:
+		return "At least one included source is offline or disconnected, so Pulse marks this monitored system as offline."
+	default:
+		return "Pulse cannot determine a canonical runtime status for this monitored system yet."
+	}
+}
+
+func monitoredSystemHasReasonStatus(reasons []MonitoredSystemStatusReason, status string) bool {
+	for _, reason := range reasons {
+		if reason.Status == status {
+			return true
+		}
+	}
+	return false
+}
+
+func monitoredSystemStatusReasonPriority(reason MonitoredSystemStatusReason) int {
+	switch reason.Status {
+	case "offline":
+		return 0
+	case "stale":
+		return 1
+	case "unknown":
+		return 2
+	default:
+		return 3
+	}
+}
+
 func monitoredSystemStatusPriority(status ResourceStatus) int {
	switch status {
	case StatusWarning:
@@ -427,6 +626,63 @@ func monitoredSystemStatusPriority(status ResourceStatus) int {
	}
 }
 
+func monitoredSystemSourceStatusReasonSummary(
+	name string,
+	source DataSource,
+	status string,
+	lastSeen time.Time,
+) string {
+	subject := name
+	if strings.TrimSpace(subject) == "" {
+		subject = "this monitored system"
+	}
+
+	summary := monitoredSystemStatusSourceLabel(string(source)) + " data for " + subject
+	switch status {
+	case "stale":
+		summary += " is stale"
+	case "offline":
+		summary += " is offline or disconnected"
+	default:
+		summary += " does not report a canonical status yet"
+	}
+
+	if !lastSeen.IsZero() {
+		summary += " (last reported " + lastSeen.UTC().Format(time.RFC3339) + ")."
+		return summary
+	}
+	return summary + "."
+}
+
+func monitoredSystemSurfaceStatusReasonSummary(
+	name string,
+	resourceType string,
+	source string,
+	status string,
+	lastSeen time.Time,
+) string {
+	subject := name
+	if strings.TrimSpace(subject) == "" {
+		subject = "this monitored system"
+	}
+
+	summary := monitoredSystemGroupingTypeLabel(resourceType) + " view for " + subject + " currently reports "
+	switch status {
+	case "stale":
+		summary += "warning"
+	case "offline":
+		summary += "offline"
+	default:
+		summary += "unknown"
+	}
+	summary += " status from " + monitoredSystemStatusSourceLabel(source)
+	if !lastSeen.IsZero() {
+		summary += " (last reported " + lastSeen.UTC().Format(time.RFC3339) + ")."
+		return summary
+	}
+	return summary + "."
+}
+
 func monitoredSystemLastSeen(resources []*Resource) time.Time {
	var lastSeen time.Time
	for _, resource := range resources {
@@ -488,6 +744,29 @@ func monitoredSystemPrimarySource(resource *Resource) string {
	return ""
 }
 
+func monitoredSystemStatusSourceLabel(value string) string {
+	switch strings.TrimSpace(value) {
+	case "agent":
+		return "Agent"
+	case "docker":
+		return "Docker"
+	case "kubernetes":
+		return "Kubernetes"
+	case "pbs":
+		return "PBS"
+	case "pmg":
+		return "PMG"
+	case "proxmox":
+		return "Proxmox"
+	case "truenas":
+		return "TrueNAS"
+	case "", "unknown":
+		return "Unknown source"
+	default:
+		return strings.TrimSpace(value)
+	}
+}
+
 func cloneStringSet(in map[string]struct{}) map[string]struct{} {
	out := make(map[string]struct{}, len(in))
	for key := range in {
diff --git a/internal/unifiedresources/registry_test.go b/internal/unifiedresources/registry_test.go
index 1be319b0e..9825bf7c5 100644
--- a/internal/unifiedresources/registry_test.go
+++ b/internal/unifiedresources/registry_test.go
@@ -261,6 +261,70 @@ func TestResourceRegistry_MonitoredSystemsSummarizeCanonicalTopLevelViews(t *tes
	}
 }
 
+func TestMonitoredSystemsExplainsStaleGroupedSourceWhileLastSeenStaysFresh(t *testing.T) {
+	rr := NewRegistry(nil)
+	now := time.Date(2026, 3, 23, 12, 0, 0, 0, time.UTC)
+
+	agentResource := topLevelTestAgent("agent-host", "tower.local", "machine-1", "agent-1")
+	agentResource.LastSeen = now.Add(-5 * time.Minute)
+	dockerResource := topLevelTestDockerHost("docker-host", "tower.local", "docker-runtime-1", "agent-1")
+	dockerResource.LastSeen = now.Add(-10 * time.Second)
+
+	rr.IngestRecords(SourceAgent, []IngestRecord{
+		{
+			SourceID: "agent-host",
+			Resource: agentResource,
+		},
+	})
+	rr.IngestRecords(SourceDocker, []IngestRecord{
+		{
+			SourceID: "docker-host",
+			Resource: dockerResource,
+		},
+	})
+
+	rr.MarkStale(now, map[DataSource]time.Duration{
+		SourceAgent:  60 * time.Second,
+		SourceDocker: 60 * time.Second,
+	})
+
+	systems := MonitoredSystems(rr)
+	if len(systems) != 1 {
+		t.Fatalf("MonitoredSystems() returned %d systems, want 1", len(systems))
+	}
+
+	system := systems[0]
+	if system.Status != StatusWarning {
+		t.Fatalf("expected grouped monitored system status warning, got %+v", system)
+	}
+	if !system.LastSeen.Equal(dockerResource.LastSeen) {
+		t.Fatalf("expected grouped last_seen %s, got %s", dockerResource.LastSeen, system.LastSeen)
+	}
+	if system.StatusExplanation.Summary == "" {
+		t.Fatal("expected grouped monitored system status explanation summary")
+	}
+	if len(system.StatusExplanation.Reasons) != 1 {
+		t.Fatalf("expected one stale grouped-source reason, got %+v", system.StatusExplanation.Reasons)
+	}
+
+	reason := system.StatusExplanation.Reasons[0]
+	if reason.Kind != "source-stale" {
+		t.Fatalf("expected stale source reason kind, got %+v", reason)
+	}
+	if reason.Source != string(SourceAgent) {
+		t.Fatalf("expected agent source reason, got %+v", reason)
+	}
+	if reason.Status != "stale" {
+		t.Fatalf("expected stale reason status, got %+v", reason)
+	}
+	if !reason.LastSeen.Equal(agentResource.LastSeen) {
+		t.Fatalf("expected stale reason last_seen %s, got %s", agentResource.LastSeen, reason.LastSeen)
+	}
+	if reason.Summary == "" {
+		t.Fatalf("expected stale reason summary, got %+v", reason)
+	}
+}
+
 func TestResourceRegistry_IngestRecords_UnknownSource(t *testing.T) {
	rr := NewRegistry(nil)
	now := time.Date(2026, 2, 20, 12, 0, 0, 0, time.UTC)
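Finally, a small end-to-end sketch of how an operator-facing tool might read the governed shape this diff establishes. The endpoint path comes from the api-contracts section above; the base URL and port are assumptions for a local instance, auth handling is omitted, and the local structs mirror only the fields added in this diff.

```go
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
)

type statusReason struct {
	Kind     string `json:"kind"`
	Source   string `json:"source"`
	Status   string `json:"status"`
	LastSeen string `json:"last_seen"`
	Summary  string `json:"summary"`
}

type ledgerResponse struct {
	Systems []struct {
		Name              string `json:"name"`
		Status            string `json:"status"`
		StatusExplanation struct {
			Summary string         `json:"summary"`
			Reasons []statusReason `json:"reasons"`
		} `json:"status_explanation"`
	} `json:"systems"`
}

func main() {
	// Assumed local deployment; real setups also need auth headers.
	resp, err := http.Get("http://localhost:7655/api/license/monitored-system-ledger")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var ledger ledgerResponse
	if err := json.NewDecoder(resp.Body).Decode(&ledger); err != nil {
		panic(err)
	}
	// Print the canonical summary, then each degraded-source reason verbatim.
	for _, system := range ledger.Systems {
		fmt.Printf("%s [%s] %s\n", system.Name, system.Status, system.StatusExplanation.Summary)
		for _, reason := range system.StatusExplanation.Reasons {
			fmt.Printf("  - %s\n", reason.Summary)
		}
	}
}
```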