From b44d5892f41ea9a586f788c1d822fcb7fe580702 Mon Sep 17 00:00:00 2001 From: rcourtman Date: Sun, 10 May 2026 19:27:36 +0100 Subject: [PATCH] Gate stale-finding auto-resolve on category whitelist MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit reconcileStaleFindings was auto-resolving any seeded finding that the LLM didn't re-report in a successful run. The function's own comment acknowledges the LLM doesn't reliably use patrol_resolve_finding, so this was built as a cleanup pass — but it cannot tell the difference between "LLM correctly recognized this is fixed" and "LLM forgot to re-mention it." For findings that represent discrete events or persistent states (a backup task that failed, a service that crashed, a security vulnerability that was found, a configuration error), absence in a Patrol report is not evidence that the issue has cleared. The result was bogus auto_resolved → re-detected → regressed cycles, observed in the wild as "Backup failed" regressing 4× over 6 hours and "Provider analysis error" regressing 271×. Those bogus auto-resolutions also inflated the trust strip with fictional auto-resolved credit. CategorySupportsStaleAutoResolve in findings.go gates the cleanup: only `performance` and `capacity` findings — continuous current-state metric thresholds — may be auto-resolved from absence. The other four categories (reliability, backup, security, general) stay active until explicitly resolved. Updates the ai-runtime subsystem contract Current State section with the whitelist and the adjacent lifecycle dedup rules already landed. Adds TestReconcileStaleFindings_SkipsNonCurrentStateCategories with table-driven subtests for all four event/persistent categories, and TestCategorySupportsStaleAutoResolve to lock in the whitelist. --- .../v6/internal/subsystems/ai-runtime.md | 22 ++++++ internal/ai/findings.go | 28 ++++++++ internal/ai/patrol_ai.go | 28 +++++++- internal/ai/patrol_reconcile_test.go | 71 +++++++++++++++++++ 4 files changed, 147 insertions(+), 2 deletions(-) diff --git a/docs/release-control/v6/internal/subsystems/ai-runtime.md b/docs/release-control/v6/internal/subsystems/ai-runtime.md index bed299645..6ed376419 100644 --- a/docs/release-control/v6/internal/subsystems/ai-runtime.md +++ b/docs/release-control/v6/internal/subsystems/ai-runtime.md @@ -580,6 +580,28 @@ runtime cost control, and shared AI transport surfaces. ## Current State +Stale-finding auto-resolve (`reconcileStaleFindings` in +`internal/ai/patrol_ai.go`) is gated on the category whitelist exposed by +`CategorySupportsStaleAutoResolve` in `internal/ai/findings.go`. Only +`performance` and `capacity` findings — continuous current-state metric +thresholds where the most recent successful scan's observation is +authoritative — may be auto-resolved from absence in the LLM's report. +`reliability`, `backup`, `security`, and `general` findings represent +discrete events or persistent states; the LLM not re-mentioning a failed +backup task, a service that crashed, or a vulnerability that was found +is not evidence that the underlying issue has cleared. Those categories +must stay active until explicitly resolved either by the LLM calling +`patrol_resolve_finding` with evidence or by operator action through the +governed findings store. Lifecycle events recorded by `findings.go` must +not introduce duplicate generic transition rows: the canonical +`syncLoopStateLocked` records `loop_transition_violation` only when a +transition is rejected, and otherwise leaves the semantic event +(`auto_resolved`, `regressed`, `dismissed`, `acknowledged`, `snoozed`, +`reminded`, `suppression_lifted`, etc.) as the single audit row for the +transition. Re-detections of an already-active finding update +`TimesRaised` and `LastSeenAt` only — they must not append a `detected` +lifecycle event, because a heartbeat is not a transition. + The findings store now consults a `ResourceOperatorStateProvider` during the new-finding path. The interface lives in `internal/ai` to avoid an import cycle with `internal/unifiedresources`; the API layer diff --git a/internal/ai/findings.go b/internal/ai/findings.go index e70515a05..3f68e51d7 100644 --- a/internal/ai/findings.go +++ b/internal/ai/findings.go @@ -45,6 +45,34 @@ const ( FindingCategoryGeneral FindingCategory = "general" ) +// CategorySupportsStaleAutoResolve reports whether a finding category +// represents a continuous current-state condition that Patrol can soundly +// auto-resolve from absence in a successful run. Only `performance` and +// `capacity` qualify: those are metric thresholds (CPU above X, disk above +// Y) where the most recent scan's observation is authoritative — if the +// threshold isn't tripped now, the condition has cleared. +// +// All other categories represent discrete events or persistent states: +// `reliability` (a service that crashed, a restart loop), `backup` (a +// backup task that failed), `security` (a vulnerability that was found), +// and `general` (catch-all). For these, absence in a Patrol report only +// means the LLM didn't re-mention the finding — it does not mean the +// underlying issue has been addressed. Auto-resolving them on absence +// produced false `auto_resolved` events that the next run re-detected as +// regressions, inflating the regression counter and polluting the trust +// strip with fictional "auto-resolved" credit. +// +// Such findings stay active until explicitly resolved — either by the LLM +// calling `patrol_resolve_finding` with evidence, or by operator action. +func CategorySupportsStaleAutoResolve(category FindingCategory) bool { + switch category { + case FindingCategoryPerformance, FindingCategoryCapacity: + return true + default: + return false + } +} + // DismissReasonWillFixLater marks a finding as "I will fix this later" — an // operational commitment with an implicit deadline. See Finding.RemindAt. const DismissReasonWillFixLater = "will_fix_later" diff --git a/internal/ai/patrol_ai.go b/internal/ai/patrol_ai.go index 3cfb04c2b..0be1f8409 100644 --- a/internal/ai/patrol_ai.go +++ b/internal/ai/patrol_ai.go @@ -3755,8 +3755,17 @@ func seedFormatTimeAgo(now, t time.Time) string { // in seed context but were neither re-reported nor explicitly resolved during the run. // This handles the case where the LLM doesn't reliably use patrol_resolve_finding. // -// Safety: only called after successful full patrols (not scoped), and only for findings -// that were in the seed context (the LLM had the opportunity to re-report them). +// Safety: +// - only called after successful full patrols (not scoped), and only for findings +// that were in the seed context (the LLM had the opportunity to re-report them); +// - only acts on finding categories where "absence of re-report = condition cleared" +// is a sound model. See CategorySupportsStaleAutoResolve: performance and capacity +// are continuous metric thresholds, so the most recent successful scan's absence +// of trip is authoritative. Reliability/backup/security/general represent discrete +// events or persistent states — the LLM not re-mentioning a failed backup task or +// a security vulnerability does not mean the issue has been addressed, and silent +// auto-resolve there produced bogus auto_resolved → re-detected → regressed +// cycles that inflated the trust strip and the regression counter. func (p *PatrolService) reconcileStaleFindings(reportedIDs, resolvedIDs, seededFindingIDs []string, runHadErrors bool) int { if runHadErrors { return 0 @@ -3783,6 +3792,20 @@ func (p *PatrolService) reconcileStaleFindings(reportedIDs, resolvedIDs, seededF if reported[id] || resolved[id] { continue } + // Category gate: only continuous current-state categories are safe to + // auto-resolve from absence. Event/persistent categories stay active + // until explicitly resolved. + finding := p.findings.Get(id) + if finding == nil { + continue + } + if !CategorySupportsStaleAutoResolve(finding.Category) { + log.Debug(). + Str("finding_id", id). + Str("category", string(finding.Category)). + Msg("AI Patrol: skipping stale auto-resolve — category not eligible (event/persistent finding)") + continue + } // Only resolve if still active if ok := p.findings.ResolveWithReason(id, "No longer detected by patrol"); ok { count++ @@ -3797,6 +3820,7 @@ func (p *PatrolService) reconcileStaleFindings(reportedIDs, resolvedIDs, seededF log.Info(). Str("finding_id", id). + Str("category", string(finding.Category)). Msg("AI Patrol: Auto-resolved stale finding (not re-reported by patrol)") } } diff --git a/internal/ai/patrol_reconcile_test.go b/internal/ai/patrol_reconcile_test.go index 4d5f8af3d..4fa46b009 100644 --- a/internal/ai/patrol_reconcile_test.go +++ b/internal/ai/patrol_reconcile_test.go @@ -213,3 +213,74 @@ func TestReconcileStaleFindings_NoSeededFindings(t *testing.T) { t.Fatalf("expected 0 auto-resolved findings when no seeded IDs, got %d", resolved) } } + +// Category gate: only performance and capacity findings (continuous current-state +// metrics) may be auto-resolved from absence. Reliability, backup, security, and +// general findings represent events or persistent states and must stay active +// until explicitly resolved — silent absence-based auto-resolve there caused +// bogus auto_resolved → re-detected → regressed cycles that polluted the trust +// strip and inflated the regression counter. +func TestReconcileStaleFindings_SkipsNonCurrentStateCategories(t *testing.T) { + nonEligible := []struct { + name string + category FindingCategory + }{ + {"reliability", FindingCategoryReliability}, + {"backup", FindingCategoryBackup}, + {"security", FindingCategorySecurity}, + {"general", FindingCategoryGeneral}, + } + for _, tc := range nonEligible { + t.Run(string(tc.category), func(t *testing.T) { + f := &Finding{ + ID: "find-" + string(tc.category), + Severity: FindingSeverityWarning, + Category: tc.category, + ResourceID: "vm-100", + ResourceName: "web", + Title: "Some persistent event", + DetectedAt: time.Now().Add(-1 * time.Hour), + } + p := newTestPatrolWithFindings([]*Finding{f}) + + resolved := p.reconcileStaleFindings( + nil, // not re-reported + nil, // not explicitly resolved + []string{f.ID}, // seeded + false, // run succeeded + ) + + if resolved != 0 { + t.Fatalf("expected 0 auto-resolved findings for category %s, got %d", tc.category, resolved) + } + stored := p.findings.Get(f.ID) + if stored == nil { + t.Fatal("finding should still exist in store") + } + if stored.ResolvedAt != nil { + t.Fatalf("category %s finding must NOT be auto-resolved from absence; resolved at %s", tc.category, stored.ResolvedAt) + } + }) + } +} + +func TestCategorySupportsStaleAutoResolve(t *testing.T) { + cases := []struct { + category FindingCategory + want bool + }{ + {FindingCategoryPerformance, true}, + {FindingCategoryCapacity, true}, + {FindingCategoryReliability, false}, + {FindingCategoryBackup, false}, + {FindingCategorySecurity, false}, + {FindingCategoryGeneral, false}, + } + for _, tc := range cases { + t.Run(string(tc.category), func(t *testing.T) { + if got := CategorySupportsStaleAutoResolve(tc.category); got != tc.want { + t.Fatalf("CategorySupportsStaleAutoResolve(%s) = %v, want %v", tc.category, got, tc.want) + } + }) + } +}