From b44d5892f41ea9a586f788c1d822fcb7fe580702 Mon Sep 17 00:00:00 2001
From: rcourtman <courtmanr@gmail.com>
Date: Sun, 10 May 2026 19:27:36 +0100
Subject: [PATCH] Gate stale-finding auto-resolve on category whitelist
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

reconcileStaleFindings was auto-resolving any seeded finding that the
LLM didn't re-report in a successful run. The function's own comment
acknowledges the LLM doesn't reliably use patrol_resolve_finding, so
this was built as a cleanup pass — but it cannot tell the difference
between "LLM correctly recognized this is fixed" and "LLM forgot to
re-mention it." For findings that represent discrete events or
persistent states (a backup task that failed, a service that
crashed, a security vulnerability that was found, a configuration
error), absence in a Patrol report is not evidence that the issue
has cleared. The result was bogus auto_resolved → re-detected →
regressed cycles, observed in the wild as "Backup failed" regressing
4× over 6 hours and "Provider analysis error" regressing 271×.
Those bogus auto-resolutions also inflated the trust strip with
fictional auto-resolved credit.

CategorySupportsStaleAutoResolve in findings.go gates the cleanup:
only `performance` and `capacity` findings — continuous current-state
metric thresholds — may be auto-resolved from absence. The other
four categories (reliability, backup, security, general) stay active
until explicitly resolved.

Updates the ai-runtime subsystem contract Current State section with
the whitelist and the adjacent lifecycle dedup rules already landed.

Adds TestReconcileStaleFindings_SkipsNonCurrentStateCategories with
table-driven subtests for all four event/persistent categories, and
TestCategorySupportsStaleAutoResolve to lock in the whitelist.
---
 .../v6/internal/subsystems/ai-runtime.md      | 22 ++++++
 internal/ai/findings.go                       | 28 ++++++++
 internal/ai/patrol_ai.go                      | 28 +++++++-
 internal/ai/patrol_reconcile_test.go          | 71 +++++++++++++++++++
 4 files changed, 147 insertions(+), 2 deletions(-)

diff --git a/docs/release-control/v6/internal/subsystems/ai-runtime.md b/docs/release-control/v6/internal/subsystems/ai-runtime.md
index bed299645..6ed376419 100644
--- a/docs/release-control/v6/internal/subsystems/ai-runtime.md
+++ b/docs/release-control/v6/internal/subsystems/ai-runtime.md
@@ -580,6 +580,28 @@ runtime cost control, and shared AI transport surfaces.
 
 ## Current State
 
+Stale-finding auto-resolve (`reconcileStaleFindings` in
+`internal/ai/patrol_ai.go`) is gated on the category whitelist exposed by
+`CategorySupportsStaleAutoResolve` in `internal/ai/findings.go`. Only
+`performance` and `capacity` findings — continuous current-state metric
+thresholds where the most recent successful scan's observation is
+authoritative — may be auto-resolved from absence in the LLM's report.
+`reliability`, `backup`, `security`, and `general` findings represent
+discrete events or persistent states; the LLM not re-mentioning a failed
+backup task, a service that crashed, or a vulnerability that was found
+is not evidence that the underlying issue has cleared. Those categories
+must stay active until explicitly resolved either by the LLM calling
+`patrol_resolve_finding` with evidence or by operator action through the
+governed findings store. Lifecycle events recorded by `findings.go` must
+not introduce duplicate generic transition rows: the canonical
+`syncLoopStateLocked` records `loop_transition_violation` only when a
+transition is rejected, and otherwise leaves the semantic event
+(`auto_resolved`, `regressed`, `dismissed`, `acknowledged`, `snoozed`,
+`reminded`, `suppression_lifted`, etc.) as the single audit row for the
+transition. Re-detections of an already-active finding update
+`TimesRaised` and `LastSeenAt` only — they must not append a `detected`
+lifecycle event, because a heartbeat is not a transition.
+
 The findings store now consults a `ResourceOperatorStateProvider`
 during the new-finding path. The interface lives in `internal/ai` to
 avoid an import cycle with `internal/unifiedresources`; the API layer
diff --git a/internal/ai/findings.go b/internal/ai/findings.go
index e70515a05..3f68e51d7 100644
--- a/internal/ai/findings.go
+++ b/internal/ai/findings.go
@@ -45,6 +45,34 @@ const (
 	FindingCategoryGeneral     FindingCategory = "general"
 )
 
+// CategorySupportsStaleAutoResolve reports whether a finding category
+// represents a continuous current-state condition that Patrol can soundly
+// auto-resolve from absence in a successful run. Only `performance` and
+// `capacity` qualify: those are metric thresholds (CPU above X, disk above
+// Y) where the most recent scan's observation is authoritative — if the
+// threshold isn't tripped now, the condition has cleared.
+//
+// All other categories represent discrete events or persistent states:
+// `reliability` (a service that crashed, a restart loop), `backup` (a
+// backup task that failed), `security` (a vulnerability that was found),
+// and `general` (catch-all). For these, absence in a Patrol report only
+// means the LLM didn't re-mention the finding — it does not mean the
+// underlying issue has been addressed. Auto-resolving them on absence
+// produced false `auto_resolved` events that the next run re-detected as
+// regressions, inflating the regression counter and polluting the trust
+// strip with fictional "auto-resolved" credit.
+//
+// Such findings stay active until explicitly resolved — either by the LLM
+// calling `patrol_resolve_finding` with evidence, or by operator action.
+func CategorySupportsStaleAutoResolve(category FindingCategory) bool {
+	switch category {
+	case FindingCategoryPerformance, FindingCategoryCapacity:
+		return true
+	default:
+		return false
+	}
+}
+
 // DismissReasonWillFixLater marks a finding as "I will fix this later" — an
 // operational commitment with an implicit deadline. See Finding.RemindAt.
 const DismissReasonWillFixLater = "will_fix_later"
diff --git a/internal/ai/patrol_ai.go b/internal/ai/patrol_ai.go
index 3cfb04c2b..0be1f8409 100644
--- a/internal/ai/patrol_ai.go
+++ b/internal/ai/patrol_ai.go
@@ -3755,8 +3755,17 @@ func seedFormatTimeAgo(now, t time.Time) string {
 // in seed context but were neither re-reported nor explicitly resolved during the run.
 // This handles the case where the LLM doesn't reliably use patrol_resolve_finding.
 //
-// Safety: only called after successful full patrols (not scoped), and only for findings
-// that were in the seed context (the LLM had the opportunity to re-report them).
+// Safety:
+//   - only called after successful full patrols (not scoped), and only for findings
+//     that were in the seed context (the LLM had the opportunity to re-report them);
+//   - only acts on finding categories where "absence of re-report = condition cleared"
+//     is a sound model. See CategorySupportsStaleAutoResolve: performance and capacity
+//     are continuous metric thresholds, so the most recent successful scan's absence
+//     of trip is authoritative. Reliability/backup/security/general represent discrete
+//     events or persistent states — the LLM not re-mentioning a failed backup task or
+//     a security vulnerability does not mean the issue has been addressed, and silent
+//     auto-resolve there produced bogus auto_resolved → re-detected → regressed
+//     cycles that inflated the trust strip and the regression counter.
 func (p *PatrolService) reconcileStaleFindings(reportedIDs, resolvedIDs, seededFindingIDs []string, runHadErrors bool) int {
 	if runHadErrors {
 		return 0
@@ -3783,6 +3792,20 @@ func (p *PatrolService) reconcileStaleFindings(reportedIDs, resolvedIDs, seededF
 		if reported[id] || resolved[id] {
 			continue
 		}
+		// Category gate: only continuous current-state categories are safe to
+		// auto-resolve from absence. Event/persistent categories stay active
+		// until explicitly resolved.
+		finding := p.findings.Get(id)
+		if finding == nil {
+			continue
+		}
+		if !CategorySupportsStaleAutoResolve(finding.Category) {
+			log.Debug().
+				Str("finding_id", id).
+				Str("category", string(finding.Category)).
+				Msg("AI Patrol: skipping stale auto-resolve — category not eligible (event/persistent finding)")
+			continue
+		}
 		// Only resolve if still active
 		if ok := p.findings.ResolveWithReason(id, "No longer detected by patrol"); ok {
 			count++
@@ -3797,6 +3820,7 @@ func (p *PatrolService) reconcileStaleFindings(reportedIDs, resolvedIDs, seededF
 
 			log.Info().
 				Str("finding_id", id).
+				Str("category", string(finding.Category)).
 				Msg("AI Patrol: Auto-resolved stale finding (not re-reported by patrol)")
 		}
 	}
diff --git a/internal/ai/patrol_reconcile_test.go b/internal/ai/patrol_reconcile_test.go
index 4d5f8af3d..4fa46b009 100644
--- a/internal/ai/patrol_reconcile_test.go
+++ b/internal/ai/patrol_reconcile_test.go
@@ -213,3 +213,74 @@ func TestReconcileStaleFindings_NoSeededFindings(t *testing.T) {
 		t.Fatalf("expected 0 auto-resolved findings when no seeded IDs, got %d", resolved)
 	}
 }
+
+// Category gate: only performance and capacity findings (continuous current-state
+// metrics) may be auto-resolved from absence. Reliability, backup, security, and
+// general findings represent events or persistent states and must stay active
+// until explicitly resolved — silent absence-based auto-resolve there caused
+// bogus auto_resolved → re-detected → regressed cycles that polluted the trust
+// strip and inflated the regression counter.
+func TestReconcileStaleFindings_SkipsNonCurrentStateCategories(t *testing.T) {
+	nonEligible := []struct {
+		name     string
+		category FindingCategory
+	}{
+		{"reliability", FindingCategoryReliability},
+		{"backup", FindingCategoryBackup},
+		{"security", FindingCategorySecurity},
+		{"general", FindingCategoryGeneral},
+	}
+	for _, tc := range nonEligible {
+		t.Run(string(tc.category), func(t *testing.T) {
+			f := &Finding{
+				ID:           "find-" + string(tc.category),
+				Severity:     FindingSeverityWarning,
+				Category:     tc.category,
+				ResourceID:   "vm-100",
+				ResourceName: "web",
+				Title:        "Some persistent event",
+				DetectedAt:   time.Now().Add(-1 * time.Hour),
+			}
+			p := newTestPatrolWithFindings([]*Finding{f})
+
+			resolved := p.reconcileStaleFindings(
+				nil,            // not re-reported
+				nil,            // not explicitly resolved
+				[]string{f.ID}, // seeded
+				false,          // run succeeded
+			)
+
+			if resolved != 0 {
+				t.Fatalf("expected 0 auto-resolved findings for category %s, got %d", tc.category, resolved)
+			}
+			stored := p.findings.Get(f.ID)
+			if stored == nil {
+				t.Fatal("finding should still exist in store")
+			}
+			if stored.ResolvedAt != nil {
+				t.Fatalf("category %s finding must NOT be auto-resolved from absence; resolved at %s", tc.category, stored.ResolvedAt)
+			}
+		})
+	}
+}
+
+func TestCategorySupportsStaleAutoResolve(t *testing.T) {
+	cases := []struct {
+		category FindingCategory
+		want     bool
+	}{
+		{FindingCategoryPerformance, true},
+		{FindingCategoryCapacity, true},
+		{FindingCategoryReliability, false},
+		{FindingCategoryBackup, false},
+		{FindingCategorySecurity, false},
+		{FindingCategoryGeneral, false},
+	}
+	for _, tc := range cases {
+		t.Run(string(tc.category), func(t *testing.T) {
+			if got := CategorySupportsStaleAutoResolve(tc.category); got != tc.want {
+				t.Fatalf("CategorySupportsStaleAutoResolve(%s) = %v, want %v", tc.category, got, tc.want)
+			}
+		})
+	}
+}