mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-05-20 01:01:20 +00:00
Gate stale-finding auto-resolve on category whitelist
reconcileStaleFindings was auto-resolving any seeded finding that the LLM didn't re-report in a successful run. The function's own comment acknowledges the LLM doesn't reliably use patrol_resolve_finding, so this was built as a cleanup pass — but it cannot tell the difference between "LLM correctly recognized this is fixed" and "LLM forgot to re-mention it." For findings that represent discrete events or persistent states (a backup task that failed, a service that crashed, a security vulnerability that was found, a configuration error), absence in a Patrol report is not evidence that the issue has cleared. The result was bogus auto_resolved → re-detected → regressed cycles, observed in the wild as "Backup failed" regressing 4× over 6 hours and "Provider analysis error" regressing 271×. Those bogus auto-resolutions also inflated the trust strip with fictional auto-resolved credit. CategorySupportsStaleAutoResolve in findings.go gates the cleanup: only `performance` and `capacity` findings — continuous current-state metric thresholds — may be auto-resolved from absence. The other four categories (reliability, backup, security, general) stay active until explicitly resolved. Updates the ai-runtime subsystem contract Current State section with the whitelist and the adjacent lifecycle dedup rules already landed. Adds TestReconcileStaleFindings_SkipsNonCurrentStateCategories with table-driven subtests for all four event/persistent categories, and TestCategorySupportsStaleAutoResolve to lock in the whitelist.
This commit is contained in:
parent
c6bbd9c1e4
commit
b44d5892f4
4 changed files with 147 additions and 2 deletions
|
|
@ -580,6 +580,28 @@ runtime cost control, and shared AI transport surfaces.
|
|||
|
||||
## Current State
|
||||
|
||||
Stale-finding auto-resolve (`reconcileStaleFindings` in
|
||||
`internal/ai/patrol_ai.go`) is gated on the category whitelist exposed by
|
||||
`CategorySupportsStaleAutoResolve` in `internal/ai/findings.go`. Only
|
||||
`performance` and `capacity` findings — continuous current-state metric
|
||||
thresholds where the most recent successful scan's observation is
|
||||
authoritative — may be auto-resolved from absence in the LLM's report.
|
||||
`reliability`, `backup`, `security`, and `general` findings represent
|
||||
discrete events or persistent states; the LLM not re-mentioning a failed
|
||||
backup task, a service that crashed, or a vulnerability that was found
|
||||
is not evidence that the underlying issue has cleared. Those categories
|
||||
must stay active until explicitly resolved either by the LLM calling
|
||||
`patrol_resolve_finding` with evidence or by operator action through the
|
||||
governed findings store. Lifecycle events recorded by `findings.go` must
|
||||
not introduce duplicate generic transition rows: the canonical
|
||||
`syncLoopStateLocked` records `loop_transition_violation` only when a
|
||||
transition is rejected, and otherwise leaves the semantic event
|
||||
(`auto_resolved`, `regressed`, `dismissed`, `acknowledged`, `snoozed`,
|
||||
`reminded`, `suppression_lifted`, etc.) as the single audit row for the
|
||||
transition. Re-detections of an already-active finding update
|
||||
`TimesRaised` and `LastSeenAt` only — they must not append a `detected`
|
||||
lifecycle event, because a heartbeat is not a transition.
|
||||
|
||||
The findings store now consults a `ResourceOperatorStateProvider`
|
||||
during the new-finding path. The interface lives in `internal/ai` to
|
||||
avoid an import cycle with `internal/unifiedresources`; the API layer
|
||||
|
|
|
|||
|
|
@ -45,6 +45,34 @@ const (
|
|||
FindingCategoryGeneral FindingCategory = "general"
|
||||
)
|
||||
|
||||
// CategorySupportsStaleAutoResolve reports whether a finding category
|
||||
// represents a continuous current-state condition that Patrol can soundly
|
||||
// auto-resolve from absence in a successful run. Only `performance` and
|
||||
// `capacity` qualify: those are metric thresholds (CPU above X, disk above
|
||||
// Y) where the most recent scan's observation is authoritative — if the
|
||||
// threshold isn't tripped now, the condition has cleared.
|
||||
//
|
||||
// All other categories represent discrete events or persistent states:
|
||||
// `reliability` (a service that crashed, a restart loop), `backup` (a
|
||||
// backup task that failed), `security` (a vulnerability that was found),
|
||||
// and `general` (catch-all). For these, absence in a Patrol report only
|
||||
// means the LLM didn't re-mention the finding — it does not mean the
|
||||
// underlying issue has been addressed. Auto-resolving them on absence
|
||||
// produced false `auto_resolved` events that the next run re-detected as
|
||||
// regressions, inflating the regression counter and polluting the trust
|
||||
// strip with fictional "auto-resolved" credit.
|
||||
//
|
||||
// Such findings stay active until explicitly resolved — either by the LLM
|
||||
// calling `patrol_resolve_finding` with evidence, or by operator action.
|
||||
func CategorySupportsStaleAutoResolve(category FindingCategory) bool {
|
||||
switch category {
|
||||
case FindingCategoryPerformance, FindingCategoryCapacity:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// DismissReasonWillFixLater marks a finding as "I will fix this later" — an
|
||||
// operational commitment with an implicit deadline. See Finding.RemindAt.
|
||||
const DismissReasonWillFixLater = "will_fix_later"
|
||||
|
|
|
|||
|
|
@ -3755,8 +3755,17 @@ func seedFormatTimeAgo(now, t time.Time) string {
|
|||
// in seed context but were neither re-reported nor explicitly resolved during the run.
|
||||
// This handles the case where the LLM doesn't reliably use patrol_resolve_finding.
|
||||
//
|
||||
// Safety: only called after successful full patrols (not scoped), and only for findings
|
||||
// that were in the seed context (the LLM had the opportunity to re-report them).
|
||||
// Safety:
|
||||
// - only called after successful full patrols (not scoped), and only for findings
|
||||
// that were in the seed context (the LLM had the opportunity to re-report them);
|
||||
// - only acts on finding categories where "absence of re-report = condition cleared"
|
||||
// is a sound model. See CategorySupportsStaleAutoResolve: performance and capacity
|
||||
// are continuous metric thresholds, so the most recent successful scan's absence
|
||||
// of trip is authoritative. Reliability/backup/security/general represent discrete
|
||||
// events or persistent states — the LLM not re-mentioning a failed backup task or
|
||||
// a security vulnerability does not mean the issue has been addressed, and silent
|
||||
// auto-resolve there produced bogus auto_resolved → re-detected → regressed
|
||||
// cycles that inflated the trust strip and the regression counter.
|
||||
func (p *PatrolService) reconcileStaleFindings(reportedIDs, resolvedIDs, seededFindingIDs []string, runHadErrors bool) int {
|
||||
if runHadErrors {
|
||||
return 0
|
||||
|
|
@ -3783,6 +3792,20 @@ func (p *PatrolService) reconcileStaleFindings(reportedIDs, resolvedIDs, seededF
|
|||
if reported[id] || resolved[id] {
|
||||
continue
|
||||
}
|
||||
// Category gate: only continuous current-state categories are safe to
|
||||
// auto-resolve from absence. Event/persistent categories stay active
|
||||
// until explicitly resolved.
|
||||
finding := p.findings.Get(id)
|
||||
if finding == nil {
|
||||
continue
|
||||
}
|
||||
if !CategorySupportsStaleAutoResolve(finding.Category) {
|
||||
log.Debug().
|
||||
Str("finding_id", id).
|
||||
Str("category", string(finding.Category)).
|
||||
Msg("AI Patrol: skipping stale auto-resolve — category not eligible (event/persistent finding)")
|
||||
continue
|
||||
}
|
||||
// Only resolve if still active
|
||||
if ok := p.findings.ResolveWithReason(id, "No longer detected by patrol"); ok {
|
||||
count++
|
||||
|
|
@ -3797,6 +3820,7 @@ func (p *PatrolService) reconcileStaleFindings(reportedIDs, resolvedIDs, seededF
|
|||
|
||||
log.Info().
|
||||
Str("finding_id", id).
|
||||
Str("category", string(finding.Category)).
|
||||
Msg("AI Patrol: Auto-resolved stale finding (not re-reported by patrol)")
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -213,3 +213,74 @@ func TestReconcileStaleFindings_NoSeededFindings(t *testing.T) {
|
|||
t.Fatalf("expected 0 auto-resolved findings when no seeded IDs, got %d", resolved)
|
||||
}
|
||||
}
|
||||
|
||||
// Category gate: only performance and capacity findings (continuous current-state
|
||||
// metrics) may be auto-resolved from absence. Reliability, backup, security, and
|
||||
// general findings represent events or persistent states and must stay active
|
||||
// until explicitly resolved — silent absence-based auto-resolve there caused
|
||||
// bogus auto_resolved → re-detected → regressed cycles that polluted the trust
|
||||
// strip and inflated the regression counter.
|
||||
func TestReconcileStaleFindings_SkipsNonCurrentStateCategories(t *testing.T) {
|
||||
nonEligible := []struct {
|
||||
name string
|
||||
category FindingCategory
|
||||
}{
|
||||
{"reliability", FindingCategoryReliability},
|
||||
{"backup", FindingCategoryBackup},
|
||||
{"security", FindingCategorySecurity},
|
||||
{"general", FindingCategoryGeneral},
|
||||
}
|
||||
for _, tc := range nonEligible {
|
||||
t.Run(string(tc.category), func(t *testing.T) {
|
||||
f := &Finding{
|
||||
ID: "find-" + string(tc.category),
|
||||
Severity: FindingSeverityWarning,
|
||||
Category: tc.category,
|
||||
ResourceID: "vm-100",
|
||||
ResourceName: "web",
|
||||
Title: "Some persistent event",
|
||||
DetectedAt: time.Now().Add(-1 * time.Hour),
|
||||
}
|
||||
p := newTestPatrolWithFindings([]*Finding{f})
|
||||
|
||||
resolved := p.reconcileStaleFindings(
|
||||
nil, // not re-reported
|
||||
nil, // not explicitly resolved
|
||||
[]string{f.ID}, // seeded
|
||||
false, // run succeeded
|
||||
)
|
||||
|
||||
if resolved != 0 {
|
||||
t.Fatalf("expected 0 auto-resolved findings for category %s, got %d", tc.category, resolved)
|
||||
}
|
||||
stored := p.findings.Get(f.ID)
|
||||
if stored == nil {
|
||||
t.Fatal("finding should still exist in store")
|
||||
}
|
||||
if stored.ResolvedAt != nil {
|
||||
t.Fatalf("category %s finding must NOT be auto-resolved from absence; resolved at %s", tc.category, stored.ResolvedAt)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCategorySupportsStaleAutoResolve(t *testing.T) {
|
||||
cases := []struct {
|
||||
category FindingCategory
|
||||
want bool
|
||||
}{
|
||||
{FindingCategoryPerformance, true},
|
||||
{FindingCategoryCapacity, true},
|
||||
{FindingCategoryReliability, false},
|
||||
{FindingCategoryBackup, false},
|
||||
{FindingCategorySecurity, false},
|
||||
{FindingCategoryGeneral, false},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(string(tc.category), func(t *testing.T) {
|
||||
if got := CategorySupportsStaleAutoResolve(tc.category); got != tc.want {
|
||||
t.Fatalf("CategorySupportsStaleAutoResolve(%s) = %v, want %v", tc.category, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue