mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-05-20 01:01:20 +00:00
Reset regression counters polluted by bogus auto_resolve cycles
The "Backup failed" finding on the live preview showed "regressed 6×"
when the actual number of genuine recurrences was at
most 1 or 2 — the rest were the system fighting itself, driven by
the absence-based auto_resolve paths that were gated (category
whitelist) or removed (alert-mirror rip) earlier in this branch.
The counter stayed sticky after those fixes landed, so the trust strip
and finding badges still surfaced the inflated number.
The FindingsStore.SetPersistence load pass now scans each active
finding's lifecycle for the two known bogus-signature auto_resolved
reasons ("No longer detected by patrol", "Resource no longer
exists in infrastructure"). If found, RegressionCount is reset to
0 and LastRegressionAt is cleared, and a regression_counter_reset
lifecycle event is appended so the migration is idempotent. A
finding that already has a regression_counter_reset event is left
alone; any regressed events that accrued after the reset are
genuine and stand.
findingHasBogusAutoResolveCycle returns true only when the
lifecycle contains a bogus auto_resolved and no prior reset event,
so the function is the single point of truth for the migration
decision and is straightforward to test. The test covers three cases:
a finding with a bogus signature gets reset, a finding with an empty-message
auto_resolved (LLM-driven, legitimate) keeps its counter, and a finding
that was already migrated is not reset again.
Updates ai-runtime Current State to document the second migration
on top of the alert-mirror retirement.
This commit is contained in:
parent
590671ffbb
commit
942f9ca0f5
3 changed files with 203 additions and 1 deletions
|
|
@ -599,7 +599,16 @@ alert-mirror findings already persisted from an earlier build,
|
|||
auto-resolves any active finding matching the legacy signature
|
||||
(title `"Active alert detected"`, source `ai-analysis`, category
|
||||
`general`) with a clear retirement reason; the pass is idempotent
|
||||
and self-cleaning.
|
||||
and self-cleaning. The same load pass also resets the
|
||||
`RegressionCount` and clears `LastRegressionAt` on any active
|
||||
finding whose lifecycle contains an `auto_resolved` event with one
|
||||
of the two known bogus-signature reasons ("No longer detected by
|
||||
patrol", "Resource no longer exists in infrastructure"), because
|
||||
the counter was inflated by the absence-based auto-resolve paths
|
||||
that have since been gated or removed. The reset appends a
|
||||
`regression_counter_reset` lifecycle event so the migration only
|
||||
fires once per finding; genuine recurrences from then on accrue
|
||||
cleanly.
|
||||
|
||||
The overall health score (`calculateOverallHealth` in
|
||||
`internal/ai/intelligence.go`) tiers the "recent Patrol errors" coverage
|
||||
|
|
|
|||
|
|
@ -886,6 +886,30 @@ func (s *FindingsStore) SetPersistence(p FindingsPersistence) error {
|
|||
})
|
||||
normalizedLoadedState = true
|
||||
}
|
||||
// Reset regression counters polluted by bogus auto_resolve
|
||||
// cycles. Before the category gate (commit b44d5892f) and the
|
||||
// resource-presence gate that still needs the same treatment,
|
||||
// findings of event/persistent categories were being
|
||||
// auto-resolved on absence and re-detected next run, each
|
||||
// cycle incrementing RegressionCount. The counter became
|
||||
// unreliable as a signal — the user sees "regressed 6×" but
|
||||
// most of those regressions weren't real recurrences, they
|
||||
// were the system fighting itself. Once a finding shows
|
||||
// evidence of the bogus pattern in its lifecycle, treat the
|
||||
// entire counter as suspect and reset it; genuine recurrences
|
||||
// will accrue cleanly from here. Idempotent: after reset,
|
||||
// the lifecycle is annotated with a regression_counter_reset
|
||||
// event so the migration only fires once per finding.
|
||||
if f.ResolvedAt == nil && findingHasBogusAutoResolveCycle(f) {
|
||||
f.RegressionCount = 0
|
||||
f.LastRegressionAt = nil
|
||||
f.Lifecycle = append(f.Lifecycle, FindingLifecycleEvent{
|
||||
At: time.Now(),
|
||||
Type: "regression_counter_reset",
|
||||
Message: "Regression counter reset on migration: prior auto_resolve cycles were driven by the now-removed absence-based auto-resolve paths.",
|
||||
})
|
||||
normalizedLoadedState = true
|
||||
}
|
||||
// Ensure derived fields are consistent after load.
|
||||
f.syncLoopState()
|
||||
s.findings[id] = f
|
||||
|
|
@ -902,6 +926,48 @@ func (s *FindingsStore) SetPersistence(p FindingsPersistence) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// findingHasBogusAutoResolveCycle reports whether the finding's lifecycle
|
||||
// contains evidence of the absence-based auto_resolve pattern that was
|
||||
// removed in the b44d5892f category gate and the alert-mirror rip. The
|
||||
// signature is an `auto_resolved` lifecycle event whose message matches
|
||||
// one of the two reasons the legacy absence paths stamped:
|
||||
//
|
||||
// - "No longer detected by patrol" — emitted by reconcileStaleFindings
|
||||
// when the LLM didn't re-mention a seeded finding in a successful run
|
||||
// - "Resource no longer exists in infrastructure" — emitted when a
|
||||
// finding's resource was missing from the current inventory snapshot
|
||||
//
|
||||
// Either reason on an active finding means at least one prior regression
|
||||
// was driven by the system fighting itself, not by a genuine recurrence,
|
||||
// so the cumulative counter is no longer trustworthy.
|
||||
func findingHasBogusAutoResolveCycle(f *Finding) bool {
|
||||
if f == nil {
|
||||
return false
|
||||
}
|
||||
if f.RegressionCount == 0 {
|
||||
return false
|
||||
}
|
||||
bogus := false
|
||||
for _, e := range f.Lifecycle {
|
||||
if e.Type == "regression_counter_reset" {
|
||||
// Already migrated — must not be re-applied even if other
|
||||
// auto_resolved events in the same lifecycle match the bogus
|
||||
// signature. Genuine regressions accrued after the reset
|
||||
// stand on their own and the counter is now trustworthy.
|
||||
return false
|
||||
}
|
||||
if e.Type != "auto_resolved" {
|
||||
continue
|
||||
}
|
||||
switch e.Message {
|
||||
case "No longer detected by patrol",
|
||||
"Resource no longer exists in infrastructure":
|
||||
bogus = true
|
||||
}
|
||||
}
|
||||
return bogus
|
||||
}
|
||||
|
||||
// isLegacyAlertMirrorFinding reports whether the finding looks like an
|
||||
// active "Active alert detected" finding produced by the now-removed
|
||||
// detectAlertSignals → SignalActiveAlert deterministic emitter. The title
|
||||
|
|
|
|||
|
|
@ -225,6 +225,133 @@ func TestFindingsStore_SetPersistence_RetiresLegacyAlertMirrorFindings(t *testin
|
|||
}
|
||||
}
|
||||
|
||||
func TestFindingsStore_SetPersistence_ResetsRegressionCounterPollutedByBogusCycles(t *testing.T) {
|
||||
store := NewFindingsStore()
|
||||
store.saveDebounce = 5 * time.Millisecond
|
||||
now := time.Now()
|
||||
lastRegress := now.Add(-30 * time.Minute)
|
||||
saved := make(chan map[string]*Finding, 1)
|
||||
|
||||
withBogusCycle := &Finding{
|
||||
ID: "with-bogus",
|
||||
Severity: FindingSeverityWarning,
|
||||
ResourceID: "vm-bogus",
|
||||
Title: "Backup failed",
|
||||
Source: "ai-analysis",
|
||||
Category: FindingCategoryBackup,
|
||||
LastSeenAt: now,
|
||||
RegressionCount: 6,
|
||||
LastRegressionAt: &lastRegress,
|
||||
Lifecycle: []FindingLifecycleEvent{
|
||||
{At: now.Add(-3 * time.Hour), Type: "detected"},
|
||||
{At: now.Add(-2 * time.Hour), Type: "auto_resolved", Message: "No longer detected by patrol"},
|
||||
{At: now.Add(-90 * time.Minute), Type: "regressed", Message: "Finding re-detected after resolution"},
|
||||
{At: now.Add(-60 * time.Minute), Type: "auto_resolved", Message: "Resource no longer exists in infrastructure"},
|
||||
{At: lastRegress, Type: "regressed", Message: "Finding re-detected after resolution"},
|
||||
},
|
||||
}
|
||||
|
||||
withRealCycle := &Finding{
|
||||
ID: "with-real",
|
||||
Severity: FindingSeverityWarning,
|
||||
ResourceID: "vm-real",
|
||||
Title: "High CPU usage",
|
||||
Source: "ai-analysis",
|
||||
Category: FindingCategoryPerformance,
|
||||
LastSeenAt: now,
|
||||
RegressionCount: 2,
|
||||
LastRegressionAt: &lastRegress,
|
||||
Lifecycle: []FindingLifecycleEvent{
|
||||
{At: now.Add(-2 * time.Hour), Type: "detected"},
|
||||
// An LLM-driven explicit resolve (empty message via Resolve(_,true)
|
||||
// is NOT one of the bogus-signature reasons) — the regression
|
||||
// counter that follows reflects a legitimate recurrence and
|
||||
// must be preserved.
|
||||
{At: now.Add(-90 * time.Minute), Type: "auto_resolved"},
|
||||
{At: lastRegress, Type: "regressed", Message: "Finding re-detected after resolution"},
|
||||
},
|
||||
}
|
||||
|
||||
alreadyMigrated := &Finding{
|
||||
ID: "already",
|
||||
Severity: FindingSeverityWarning,
|
||||
ResourceID: "vm-already",
|
||||
Title: "Backup failed",
|
||||
Source: "ai-analysis",
|
||||
Category: FindingCategoryBackup,
|
||||
LastSeenAt: now,
|
||||
RegressionCount: 4,
|
||||
LastRegressionAt: &lastRegress,
|
||||
Lifecycle: []FindingLifecycleEvent{
|
||||
{At: now.Add(-3 * time.Hour), Type: "auto_resolved", Message: "No longer detected by patrol"},
|
||||
{At: now.Add(-2 * time.Hour), Type: "regression_counter_reset"},
|
||||
// Genuine regressions accrued after the migration must be kept.
|
||||
{At: now.Add(-90 * time.Minute), Type: "regressed"},
|
||||
},
|
||||
}
|
||||
|
||||
p := &recordingPersistence{
|
||||
findings: map[string]*Finding{
|
||||
withBogusCycle.ID: withBogusCycle,
|
||||
withRealCycle.ID: withRealCycle,
|
||||
alreadyMigrated.ID: alreadyMigrated,
|
||||
},
|
||||
saved: saved,
|
||||
}
|
||||
|
||||
if err := store.SetPersistence(p); err != nil {
|
||||
t.Fatalf("SetPersistence failed: %v", err)
|
||||
}
|
||||
|
||||
bogus := store.Get("with-bogus")
|
||||
if bogus.RegressionCount != 0 {
|
||||
t.Fatalf("expected polluted regression counter reset to 0, got %d", bogus.RegressionCount)
|
||||
}
|
||||
if bogus.LastRegressionAt != nil {
|
||||
t.Fatal("expected LastRegressionAt cleared on reset")
|
||||
}
|
||||
foundResetEvent := false
|
||||
for _, e := range bogus.Lifecycle {
|
||||
if e.Type == "regression_counter_reset" {
|
||||
foundResetEvent = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !foundResetEvent {
|
||||
t.Fatal("expected regression_counter_reset lifecycle event on the migrated finding")
|
||||
}
|
||||
|
||||
real := store.Get("with-real")
|
||||
if real.RegressionCount != 2 {
|
||||
t.Fatalf("finding without bogus signature must keep its regression counter; got %d", real.RegressionCount)
|
||||
}
|
||||
for _, e := range real.Lifecycle {
|
||||
if e.Type == "regression_counter_reset" {
|
||||
t.Fatal("finding without bogus signature must not gain a reset event")
|
||||
}
|
||||
}
|
||||
|
||||
already := store.Get("already")
|
||||
if already.RegressionCount != 4 {
|
||||
t.Fatalf("already-migrated finding must not be reset again; got %d", already.RegressionCount)
|
||||
}
|
||||
resetCount := 0
|
||||
for _, e := range already.Lifecycle {
|
||||
if e.Type == "regression_counter_reset" {
|
||||
resetCount++
|
||||
}
|
||||
}
|
||||
if resetCount != 1 {
|
||||
t.Fatalf("already-migrated finding must keep exactly one reset event, got %d", resetCount)
|
||||
}
|
||||
|
||||
select {
|
||||
case <-saved:
|
||||
case <-time.After(500 * time.Millisecond):
|
||||
t.Fatal("timed out waiting for migration save")
|
||||
}
|
||||
}
|
||||
|
||||
func TestFindingsStore_scheduleSave_NoPersistence(t *testing.T) {
|
||||
store := NewFindingsStore()
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue