Correct fleet config drift truth

This commit is contained in:
rcourtman 2026-05-13 20:38:26 +01:00
parent 17253d27fd
commit ceb9b87cfb
7 changed files with 276 additions and 27 deletions

View file

@ -1927,8 +1927,10 @@ config `signature` backward-compatible with installed agents by signing the
legacy canonical payload shape only; newer clients validate `desiredConfig` by
recomputing it from the signed command decision and signed settings payload,
restricted to the agent-applied settings key schema. Broader applied-state
reporting and connections-ledger rollout presentation remain outside this
backend foundation.
reporting remains the next contract gap: until the runtime report carries a
comparable applied config fingerprint, `/api/connections` must surface desired
config metadata as pending or unknown and must not claim rollout convergence
from host report fields such as `commandsEnabled` or `diskExclude`.
That same canonical /api/auto-register path must also complete the live
post-registration contract after persistence: it must trigger discovery refresh
and emit the canonical `node_auto_registered` WebSocket payload instead of

View file

@ -868,6 +868,13 @@ the canonical monitored-system blocked payload.
kernel, architecture, and command capability, so settings surfaces can
render recognizable standalone-host identity without a second inventory
fetch or frontend-local host reconciliation rules.
Agent config drift on that same payload must source desired fingerprints
from `Monitor.GetHostAgentConfig(...).DesiredConfig` or the same
`remoteconfig.BuildDesiredConfigMetadata` path. The aggregator must not
manufacture convergence by assigning desired and applied to the same local
report-field fingerprint; when host state lacks a trustworthy applied
config fingerprint, `configDrift` stays pending or unknown and rollout
stays non-current.
Appliance-specific Pulse Agent compatibility is an additive host-profile
fact on that same identity payload. For Unraid and similar host profiles,
`agentIdentity.platform` remains the canonical runtime platform such as

View file

@ -583,7 +583,10 @@ bypass the API fail-closed execution gate.
but must not fork their own version-comparison semantics, desired/applied
config-drift classifier, rollout-state classifier, credential-health
classifier, command-policy vocabulary, or another agent lifecycle
vocabulary.
vocabulary. If `/api/connections` reports agent config drift as pending or
unknown because no trustworthy applied fingerprint exists, storage and
recovery must preserve that uncertainty instead of translating it into a
storage-local current/drifted verdict.
22. Keep backend-native platform actions on the adjacent AI/runtime and platform contracts. When `internal/api/` wires native TrueNAS app control for Assistant, storage and recovery may consume the refreshed recovery points afterward, but they must not grow a parallel recovery-local action transport or action-specific payload shape.
23. Keep backend-native platform diagnostics on the adjacent AI/runtime and platform contracts. When `internal/api/` wires native TrueNAS app log reads for Assistant, storage and recovery may use those diagnostics during investigation, but they must not grow a parallel recovery-local log transport or diagnostic payload shape.
24. Keep backend-native platform configuration reads on the adjacent AI/runtime and platform contracts. When `internal/api/` wires native TrueNAS app config for Assistant, storage and recovery may use that runtime shape during investigation, but they must not grow a parallel recovery-local config transport or provider-shaped configuration payload.

View file

@ -194,6 +194,60 @@ describe('ConnectionsAPI', () => {
});
});
// Verifies that ConnectionsAPI.list() hands back the fleet `configDrift` and
// `rollout` objects unchanged when the mocked payload carries a desired
// fingerprint but no applied one.
it('list() preserves pending agent config drift without an applied fingerprint', async () => {
  // Agent row whose drift entry has `desired` only; there is no `applied` key.
  const pendingAgent: Connection = {
    id: 'agent:mini-pc',
    type: 'agent',
    name: 'mini-pc',
    address: 'mini-pc',
    state: 'active',
    stateReason: '',
    enabled: true,
    surfaces: ['host'],
    scope: { host: true },
    lastSeen: '2026-04-22T20:00:00Z',
    lastError: null,
    source: 'agent',
    fleet: {
      enrollmentState: 'enrolled',
      livenessState: 'active',
      versionDrift: 'current',
      adapterHealth: 'healthy',
      configRollout: 'reported',
      credentialStatus: 'verified',
      updateStatus: 'current',
      remoteControl: 'enabled',
      configDrift: {
        status: 'pending',
        desired: { version: 'host-agent-config/v1', hash: 'sha256:desired' },
        reason: 'Pulse has not received a comparable applied agent configuration fingerprint yet',
      },
      rollout: {
        status: 'pending',
        stage: 'pending',
        reason: 'waiting for the agent to report an applied configuration fingerprint',
      },
    },
    capabilities: { supportsPause: false, supportsScope: false, supportsTest: false },
  };
  mockedApiFetchJSON.mockResolvedValueOnce({ connections: [pendingAgent] });

  const { connections: listed } = await ConnectionsAPI.list();
  const fleet = listed[0]?.fleet;

  // Expectations are spelled out as fresh literals so the comparison is
  // structural rather than by reference to the fixture.
  expect(fleet?.configDrift).toEqual({
    status: 'pending',
    desired: { version: 'host-agent-config/v1', hash: 'sha256:desired' },
    reason: 'Pulse has not received a comparable applied agent configuration fingerprint yet',
  });
  expect(fleet?.rollout).toEqual({
    status: 'pending',
    stage: 'pending',
    reason: 'waiting for the agent to report an applied configuration fingerprint',
  });
});
it('list() preserves agent identity metadata on agent-backed connections', async () => {
const connections: Connection[] = [
{

View file

@ -94,6 +94,7 @@ type aggregatorInputs struct {
availabilityTargets []config.AvailabilityTarget
availabilityStatuses map[string]monitoring.AvailabilityProbeStatus
hosts []models.Host
agentDesiredConfigs map[string]ConnectionFleetConfigFingerprint
instanceHealth map[string]monitoring.InstanceHealth
expectedAgentVersion string
now time.Time
@ -131,7 +132,8 @@ func buildConnections(in aggregatorInputs) []Connection {
out = append(out, buildAvailabilityConnection(target, in.availabilityStatuses[target.ID], now))
}
for _, host := range in.hosts {
out = append(out, buildAgentConnection(host, in.expectedAgentVersion, now))
desiredConfig := connectionAgentConfigFingerprintForHost(in.agentDesiredConfigs, host.ID)
out = append(out, buildAgentConnection(host, in.expectedAgentVersion, now, desiredConfig))
}
sort.Slice(out, func(i, j int) bool {
@ -341,7 +343,7 @@ func buildAvailabilityConnection(target config.AvailabilityTarget, status monito
// buildAgentConnection derives a connection row from an agent Host record.
// Agents have no pause toggle and no scope — reports are all-or-nothing —
// so capability flags are off.
func buildAgentConnection(host models.Host, expectedAgentVersion string, now time.Time) Connection {
func buildAgentConnection(host models.Host, expectedAgentVersion string, now time.Time, desiredConfig *ConnectionFleetConfigFingerprint) Connection {
name := host.DisplayName
if strings.TrimSpace(name) == "" {
name = host.Hostname
@ -399,7 +401,7 @@ func buildAgentConnection(host models.Host, expectedAgentVersion string, now tim
AgentUpdateAvailable: updateAvailable,
Capabilities: ConnectionCapabilities{SupportsPause: false, SupportsScope: false, SupportsTest: false},
}, now)
conn.Fleet.ConfigDrift = connectionFleetAgentConfigDrift(conn, host)
conn.Fleet.ConfigDrift = connectionFleetAgentConfigDrift(conn, desiredConfig)
conn.Fleet.CredentialHealth = connectionFleetAgentCredentialHealth(conn, host, now)
conn.Fleet.CommandPolicy = connectionFleetAgentCommandPolicy(conn, host)
conn.Fleet.Rollout = connectionFleetRollout(conn)
@ -548,7 +550,11 @@ func connectionFleetConfigDrift(conn Connection) *ConnectionFleetConfigDrift {
}
}
func connectionFleetAgentConfigDrift(conn Connection, host models.Host) *ConnectionFleetConfigDrift {
// connectionFleetAgentConfigDrift classifies agent config drift for conn from
// the desired fingerprint alone. It delegates to
// connectionFleetAgentConfigDriftForFingerprints with a nil applied fingerprint.
func connectionFleetAgentConfigDrift(conn Connection, desired *ConnectionFleetConfigFingerprint) *ConnectionFleetConfigDrift {
	// No applied fingerprint is supplied on this path.
	var applied *ConnectionFleetConfigFingerprint
	return connectionFleetAgentConfigDriftForFingerprints(conn, desired, applied)
}
func connectionFleetAgentConfigDriftForFingerprints(conn Connection, desired, applied *ConnectionFleetConfigFingerprint) *ConnectionFleetConfigDrift {
if !conn.Enabled || conn.State == ConnectionStatePaused {
return &ConnectionFleetConfigDrift{
Status: fleetStatePaused,
@ -556,30 +562,41 @@ func connectionFleetAgentConfigDrift(conn Connection, host models.Host) *Connect
}
}
if desired == nil {
return &ConnectionFleetConfigDrift{
Status: fleetStateUnknown,
Reason: "Pulse has not resolved canonical desired agent configuration metadata",
}
}
if conn.LastSeen == nil {
return &ConnectionFleetConfigDrift{
Status: fleetStateUnknown,
Reason: "Pulse has not received an applied agent configuration report yet",
Status: fleetStateUnknown,
Desired: desired,
Reason: "Pulse has not received an agent report to compare against desired configuration",
}
}
applied := connectionConfigFingerprint(connectionAgentConfigFingerprintVersion, map[string]any{
"commandsEnabled": host.CommandsEnabled,
"diskExclude": host.DiskExclude,
})
if applied == nil {
return &ConnectionFleetConfigDrift{
Status: fleetStateUnknown,
Reason: "applied agent configuration fingerprint could not be derived",
Status: fleetStatePending,
Desired: desired,
Reason: "Pulse has not received a comparable applied agent configuration fingerprint yet",
}
}
status := fleetConfigDriftCurrent
reason := "reported applied agent configuration matches the desired fleet policy"
if desired.Version != applied.Version || desired.Hash != applied.Hash {
status = fleetConfigDriftDrifted
reason = "desired agent configuration fingerprint differs from the reported applied fingerprint"
}
return &ConnectionFleetConfigDrift{
Status: fleetConfigDriftCurrent,
Desired: applied,
Status: status,
Desired: desired,
Applied: applied,
LastObservedAt: conn.LastSeen,
Reason: "reported agent configuration matches the active fleet policy snapshot",
Reason: reason,
}
}
@ -617,11 +634,26 @@ func connectionFleetRollout(conn Connection) *ConnectionFleetRolloutState {
Reason: "waiting for the agent to report applied configuration",
}
}
if conn.Fleet.ConfigDrift != nil && conn.Fleet.ConfigDrift.Status == fleetConfigDriftDrifted {
return &ConnectionFleetRolloutState{
Status: fleetStatePending,
Stage: fleetRolloutStagePending,
Reason: "desired configuration has not converged on the reported runtime",
if conn.Type == ConnectionTypeAgent && conn.Fleet.ConfigDrift != nil {
switch conn.Fleet.ConfigDrift.Status {
case fleetConfigDriftDrifted:
return &ConnectionFleetRolloutState{
Status: fleetStatePending,
Stage: fleetRolloutStagePending,
Reason: "desired configuration has not converged on the reported runtime",
}
case fleetStatePending:
return &ConnectionFleetRolloutState{
Status: fleetStatePending,
Stage: fleetRolloutStagePending,
Reason: "waiting for the agent to report an applied configuration fingerprint",
}
case fleetStateUnknown:
return &ConnectionFleetRolloutState{
Status: fleetStateUnknown,
Stage: fleetRolloutStagePending,
Reason: "rollout state cannot be confirmed without comparable desired and applied agent config fingerprints",
}
}
}
stage := fleetRolloutStageLocal
@ -755,6 +787,54 @@ func connectionConfigFingerprint(version string, payload any) *ConnectionFleetCo
}
}
// connectionAgentDesiredConfigFingerprints asks monitor for each host's agent
// config and collects the desired-config fingerprint keyed by the trimmed host
// ID. Hosts with a blank ID, no DesiredConfig, or a blank version/hash are
// left out. It returns nil when monitor is nil, hosts is empty, or nothing
// could be collected.
func connectionAgentDesiredConfigFingerprints(monitor *monitoring.Monitor, hosts []models.Host) map[string]ConnectionFleetConfigFingerprint {
	if monitor == nil || len(hosts) == 0 {
		return nil
	}

	resolved := make(map[string]ConnectionFleetConfigFingerprint, len(hosts))
	for i := range hosts {
		id := strings.TrimSpace(hosts[i].ID)
		if id == "" {
			continue
		}
		desired := monitor.GetHostAgentConfig(id).DesiredConfig
		if desired == nil {
			continue
		}
		fp := connectionConfigFingerprintFromMetadata(desired.Version, desired.Hash)
		if fp == nil {
			continue
		}
		resolved[id] = *fp
	}

	if len(resolved) == 0 {
		return nil
	}
	return resolved
}
// connectionAgentConfigFingerprintForHost looks up the fingerprint stored for
// hostID (whitespace-trimmed) and returns it as a freshly built pointer via
// connectionConfigFingerprintFromMetadata. It returns nil when the map has no
// entry for that ID.
func connectionAgentConfigFingerprintForHost(fingerprints map[string]ConnectionFleetConfigFingerprint, hostID string) *ConnectionFleetConfigFingerprint {
	// Indexing a nil or empty map simply misses, so no separate emptiness
	// guard is required.
	if entry, found := fingerprints[strings.TrimSpace(hostID)]; found {
		return connectionConfigFingerprintFromMetadata(entry.Version, entry.Hash)
	}
	return nil
}
// connectionConfigFingerprintFromMetadata builds a fingerprint from a version
// and hash after trimming surrounding whitespace from both. It returns nil if
// either value is empty once trimmed.
func connectionConfigFingerprintFromMetadata(version, hash string) *ConnectionFleetConfigFingerprint {
	fp := ConnectionFleetConfigFingerprint{
		Version: strings.TrimSpace(version),
		Hash:    strings.TrimSpace(hash),
	}
	if fp.Version == "" || fp.Hash == "" {
		return nil
	}
	return &fp
}
func connectionProxmoxCredentialKind(user, password, tokenName, tokenValue string) string {
if strings.TrimSpace(tokenName) != "" || strings.TrimSpace(tokenValue) != "" {
return fleetCredentialKindToken

View file

@ -8,10 +8,13 @@ import (
"github.com/rcourtman/pulse-go-rewrite/internal/config"
"github.com/rcourtman/pulse-go-rewrite/internal/models"
"github.com/rcourtman/pulse-go-rewrite/internal/monitoring"
"github.com/rcourtman/pulse-go-rewrite/internal/remoteconfig"
)
func ptrTime(t time.Time) *time.Time { return &t }
// ptrBool returns a pointer to a newly allocated bool holding v.
func ptrBool(v bool) *bool {
	out := new(bool)
	*out = v
	return out
}
func healthEntry(lastSuccess *time.Time, errMessage, errCategory string, breakerState string) monitoring.InstanceHealth {
ps := monitoring.InstancePollStatus{LastSuccess: lastSuccess}
if errMessage != "" {
@ -27,6 +30,18 @@ func healthEntry(lastSuccess *time.Time, errMessage, errCategory string, breaker
}
}
// desiredAgentConfigFingerprint is a test helper that calls
// remoteconfig.BuildDesiredConfigMetadata with the given inputs and copies the
// resulting Version and Hash into a ConnectionFleetConfigFingerprint. It fails
// the test immediately if the metadata cannot be built.
func desiredAgentConfigFingerprint(t *testing.T, commandsEnabled *bool, settings map[string]interface{}) ConnectionFleetConfigFingerprint {
	t.Helper()

	var fp ConnectionFleetConfigFingerprint
	metadata, err := remoteconfig.BuildDesiredConfigMetadata(commandsEnabled, settings)
	if err != nil {
		t.Fatalf("BuildDesiredConfigMetadata: %v", err)
	}
	fp.Version, fp.Hash = metadata.Version, metadata.Hash
	return fp
}
func TestDeriveConnectionState_Paused(t *testing.T) {
state, reason, _, _ := deriveConnectionState(false, monitoring.InstanceHealth{}, time.Now())
if state != ConnectionStatePaused {
@ -360,6 +375,7 @@ func TestBuildConnections_AgentVersionUpdateAvailability(t *testing.T) {
func TestBuildConnections_AgentFleetGovernance(t *testing.T) {
now := time.Now()
currentDesired := desiredAgentConfigFingerprint(t, ptrBool(true), nil)
in := aggregatorInputs{
hosts: []models.Host{
{
@ -380,6 +396,9 @@ func TestBuildConnections_AgentFleetGovernance(t *testing.T) {
Hostname: "pending",
},
},
agentDesiredConfigs: map[string]ConnectionFleetConfigFingerprint{
"current": currentDesired,
},
expectedAgentVersion: "6.0.2",
now: now,
}
@ -402,14 +421,14 @@ func TestBuildConnections_AgentFleetGovernance(t *testing.T) {
t.Fatalf("current agent fleet governance = %+v", current)
}
if current.ConfigDrift == nil ||
current.ConfigDrift.Status != fleetConfigDriftCurrent ||
current.ConfigDrift.Status != fleetStatePending ||
current.ConfigDrift.Desired == nil ||
current.ConfigDrift.Applied == nil ||
current.ConfigDrift.Applied != nil ||
current.ConfigDrift.Desired.Version != connectionAgentConfigFingerprintVersion ||
current.ConfigDrift.Desired.Hash != current.ConfigDrift.Applied.Hash {
current.ConfigDrift.Desired.Hash != currentDesired.Hash {
t.Fatalf("current agent config drift = %+v", current.ConfigDrift)
}
if current.Rollout == nil || current.Rollout.Status != fleetStateCurrent || current.Rollout.Stage != fleetRolloutStageApplied {
if current.Rollout == nil || current.Rollout.Status != fleetStatePending || current.Rollout.Stage != fleetRolloutStagePending {
t.Fatalf("current agent rollout = %+v", current.Rollout)
}
if current.CommandPolicy == nil ||
@ -449,6 +468,89 @@ func TestBuildConnections_AgentFleetGovernance(t *testing.T) {
}
}
// TestBuildConnections_AgentConfigDriftUsesCanonicalDesiredMetadataWithoutSelfComparing
// builds one agent connection whose desired fingerprint is supplied through
// aggregatorInputs.agentDesiredConfigs and checks that the drift payload is
// pending: the desired fingerprint is echoed back, applied stays nil, the
// desired hash differs from a fingerprint derived from the host's own report
// fields, and rollout is not reported as current.
func TestBuildConnections_AgentConfigDriftUsesCanonicalDesiredMetadataWithoutSelfComparing(t *testing.T) {
	now := time.Now()
	wantDesired := desiredAgentConfigFingerprint(t, ptrBool(true), map[string]interface{}{
		"interval": "10s",
	})

	agent := models.Host{
		ID:              "agent-1",
		Hostname:        "agent-1",
		LastSeen:        now,
		CommandsEnabled: false,
		DiskExclude:     []string{"/dev/loop*"},
	}
	conns := buildConnections(aggregatorInputs{
		hosts: []models.Host{agent},
		agentDesiredConfigs: map[string]ConnectionFleetConfigFingerprint{
			"agent-1": wantDesired,
		},
		now: now,
	})
	if n := len(conns); n != 1 {
		t.Fatalf("expected 1 connection, got %d", n)
	}

	// Each case aborts the test, so the checks run in order and stop at the
	// first mismatch.
	drift := conns[0].Fleet.ConfigDrift
	switch {
	case drift == nil:
		t.Fatal("expected agent config drift metadata")
	case drift.Status != fleetStatePending:
		t.Fatalf("config drift status = %q, want pending", drift.Status)
	case drift.Desired == nil || *drift.Desired != wantDesired:
		t.Fatalf("desired config drift fingerprint = %+v, want %+v", drift.Desired, wantDesired)
	case drift.Applied != nil:
		t.Fatalf("applied config fingerprint should be absent until agent reports a comparable fingerprint, got %+v", drift.Applied)
	}

	// Fingerprint computed from the same report-field values the host carries;
	// the desired hash must not coincide with it.
	reportFieldFingerprint := connectionConfigFingerprint(connectionAgentConfigFingerprintVersion, map[string]any{
		"commandsEnabled": false,
		"diskExclude":     []string{"/dev/loop*"},
	})
	if reportFieldFingerprint == nil {
		t.Fatal("expected local self-comparison fingerprint to be derivable")
	}
	if drift.Desired.Hash == reportFieldFingerprint.Hash {
		t.Fatalf("desired config hash reused report-field fingerprint %q", drift.Desired.Hash)
	}

	if rollout := conns[0].Fleet.Rollout; rollout == nil || rollout.Status == fleetStateCurrent {
		t.Fatalf("rollout should not claim current without an applied config comparison, got %+v", rollout)
	}
}
// TestConnectionFleetAgentConfigDriftComparesAppliedFingerprintsWhenAvailable
// calls connectionFleetAgentConfigDriftForFingerprints with an explicit applied
// fingerprint and checks both outcomes: a differing hash yields the drifted
// status and a matching version/hash yields the current status. In both cases
// the result must carry the exact desired and applied pointers passed in and a
// non-nil LastObservedAt.
func TestConnectionFleetAgentConfigDriftComparesAppliedFingerprintsWhenAvailable(t *testing.T) {
	now := time.Now()
	conn := Connection{
		Type:     ConnectionTypeAgent,
		State:    ConnectionStateActive,
		Enabled:  true,
		LastSeen: &now,
	}
	desired := &ConnectionFleetConfigFingerprint{Version: connectionAgentConfigFingerprintVersion, Hash: "sha256:desired"}

	// carriesInputs reports whether got echoes the supplied pointers (compared
	// by identity) and records an observation time.
	carriesInputs := func(got *ConnectionFleetConfigDrift, applied *ConnectionFleetConfigFingerprint) bool {
		return got.Desired == desired && got.Applied == applied && got.LastObservedAt != nil
	}

	differingApplied := &ConnectionFleetConfigFingerprint{Version: connectionAgentConfigFingerprintVersion, Hash: "sha256:applied"}
	drifted := connectionFleetAgentConfigDriftForFingerprints(conn, desired, differingApplied)
	if drifted.Status != fleetConfigDriftDrifted || !carriesInputs(drifted, differingApplied) {
		t.Fatalf("drifted config comparison = %+v", drifted)
	}

	matchingApplied := &ConnectionFleetConfigFingerprint{Version: desired.Version, Hash: desired.Hash}
	current := connectionFleetAgentConfigDriftForFingerprints(conn, desired, matchingApplied)
	if current.Status != fleetConfigDriftCurrent || !carriesInputs(current, matchingApplied) {
		t.Fatalf("current config comparison = %+v", current)
	}
}
func TestBuildConnections_PlatformFleetGovernance(t *testing.T) {
now := time.Now()
lastSuccess := now.Add(-30 * time.Second)

View file

@ -39,6 +39,7 @@ func buildAggregatorInputs(ctx context.Context, cfg *config.Config, persistence
if monitor != nil {
inputs.hosts = monitor.HostsSnapshot()
inputs.agentDesiredConfigs = connectionAgentDesiredConfigFingerprints(monitor, inputs.hosts)
inputs.instanceHealth = instanceHealthByKey(monitor.SchedulerHealth())
inputs.availabilityStatuses = monitor.AvailabilityStatusSnapshot()
} else {