Stop mirroring alerts into Patrol findings

The deterministic signal pipeline ran the pulse_alerts tool output
through detectAlertSignals and produced a SignalActiveAlert for
every firing alert, which Patrol then materialized as an
"Active alert detected" finding (source: ai-analysis, category:
general). The system prompt at the top of patrol_ai.go explicitly
tells the LLM not to duplicate alerts — but the deterministic
emitter was duplicating them anyway, behind the LLM's back.

Symptoms observed in the wild:
- 9 active "Active alert detected" findings in Patrol, every one a
  duplicate of an existing alert already on the Alerts page.
- The LLM, doing what the prompt told it, resolved each mirrored
  finding via patrol_resolve_finding. On the next run the alert was
  still firing, so Patrol re-emitted the signal and the finding
  regressed. Lifecycle showed several auto_resolved → re-detected →
  regressed cycles per finding within hours.
- Health score dragged down by issues the operator already saw on
  the Alerts page, with no operator action possible from Patrol
  that wasn't already available from Alerts.

Rip out detectAlertSignals entirely, remove the pulse_alerts case
from the signal-extraction switch, and drop SignalActiveAlert plus
its key / title / recommendation entries. Convert the prior
TestDetectSignals_ActiveAlert into a regression guard that locks
in the no-mirror behavior.

Update the ai-runtime subsystem's Current State section to record
the decision: Patrol does not duplicate the Alerts surface; alerts
own their own lifecycle, surface, and acknowledgement model.
This commit is contained in:
rcourtman 2026-05-10 21:33:41 +01:00
parent 7bd596d378
commit 271d12ecab
5 changed files with 35 additions and 73 deletions

View file

@ -580,6 +580,21 @@ runtime cost control, and shared AI transport surfaces.
## Current State
Patrol deterministic signal extraction (`internal/ai/patrol_signals.go`)
does not mirror the Alerts surface. The `pulse_alerts` tool output is
intentionally absent from the signal switch in `DetectSignals` — alerts
already have their own canonical surface, lifecycle, and operator
acknowledgement model, and the `SignalActiveAlert` mirror path has
been removed. Mirroring previously double-counted alerts (every alert
was also a Patrol "Active alert detected" finding), dragged the health
score down for issues the operator already knew about, and produced
bogus `auto_resolved` → re-detected → regressed cycles when the LLM
explicitly resolved the mirrored finding while the underlying alert
kept firing. Patrol's job, per its own system prompt, is to surface
issues alerts cannot — trends, capacity risks, misconfigurations,
reliability gaps, cross-resource correlations. The Alerts page is
the canonical surface for currently-firing alerts.
The overall health score (`calculateOverallHealth` in
`internal/ai/intelligence.go`) tiers the "recent Patrol errors" coverage
factor by the ratio of errored runs to relevant runs in the scoring

View file

@ -996,8 +996,6 @@ func signalKey(s DetectedSignal) string {
return "backup-failed"
case SignalBackupStale:
return "backup-stale"
case SignalActiveAlert:
return "active-alert"
case SignalGuestUnreachable:
return "guest-unreachable"
default:
@ -1019,8 +1017,6 @@ func signalTitle(s DetectedSignal) string {
return "Backup failed"
case SignalBackupStale:
return "Backup is stale"
case SignalActiveAlert:
return "Active alert detected"
case SignalGuestUnreachable:
return fmt.Sprintf("Guest unreachable: %s", s.ResourceName)
default:
@ -1042,8 +1038,6 @@ func defaultRecommendationForSignal(s DetectedSignal) string {
return "Review backup logs and fix the underlying error, then rerun the backup."
case SignalBackupStale:
return "Ensure backups are scheduled and completing successfully; run a new backup."
case SignalActiveAlert:
return "Investigate the active alert and resolve the underlying issue."
case SignalGuestUnreachable:
return "Investigate why this guest is not responding to ping. Check network configuration, firewall rules, or whether the guest has crashed."
default:

View file

@ -73,7 +73,6 @@ func TestSignalHelpersAndFindingsFromSignals(t *testing.T) {
{signal: DetectedSignal{SignalType: SignalHighDisk}, wantKey: "disk-high", wantTitle: "Storage usage is high", recSubstring: "storage"},
{signal: DetectedSignal{SignalType: SignalBackupFailed}, wantKey: "backup-failed", wantTitle: "Backup failed", recSubstring: "backup"},
{signal: DetectedSignal{SignalType: SignalBackupStale}, wantKey: "backup-stale", wantTitle: "Backup is stale", recSubstring: "backup"},
{signal: DetectedSignal{SignalType: SignalActiveAlert}, wantKey: "active-alert", wantTitle: "Active alert detected", recSubstring: "alert"},
{signal: DetectedSignal{SignalType: SignalGuestUnreachable, ResourceName: "db-server"}, wantKey: "guest-unreachable", wantTitle: "Guest unreachable: db-server", recSubstring: "ping"},
{signal: DetectedSignal{SignalType: SignalType("unknown")}, wantKey: "deterministic-signal", wantTitle: "Infrastructure signal detected", recSubstring: "Investigate"},
}

View file

@ -78,7 +78,6 @@ const (
SignalHighDisk SignalType = "high_disk"
SignalBackupFailed SignalType = "backup_failed"
SignalBackupStale SignalType = "backup_stale"
SignalActiveAlert SignalType = "active_alert"
SignalGuestUnreachable SignalType = "guest_unreachable"
)
@ -128,8 +127,14 @@ func detectSignalsFromToolCall(tc *ToolCallRecord, thresholds SignalThresholds)
signals = append(signals, detectStorageSignals(tc, thresholds)...)
case "pulse_metrics":
signals = append(signals, detectMetricsSignals(tc, thresholds)...)
case "pulse_alerts":
signals = append(signals, detectAlertSignals(tc)...)
// pulse_alerts intentionally has no deterministic signal extraction.
// Patrol's job is to find issues alerts cannot — duplicating the
// alert list into Patrol findings was double-counting (alerts
// already have their own lifecycle, surface, and acknowledgement
// model) and produced regression cycles when the LLM resolved the
// mirrored finding while the underlying alert kept firing. The
// Alerts page is the canonical surface for currently-firing
// alerts; Patrol should stay quiet here.
}
return signals
@ -688,59 +693,6 @@ func detectMetricsSignals(tc *ToolCallRecord, thresholds SignalThresholds) []Det
return signals
}
// --- Alert signals ---
// alertList is the minimal struct for parsing alert list output.
// It decodes only the top-level "alerts" array and, for each entry, the
// id, resource_id, resource_name, type, severity, and message fields.
type alertList struct {
Alerts []struct {
ID string `json:"id"`
ResourceID string `json:"resource_id"`
ResourceName string `json:"resource_name"`
Type string `json:"type"`
Severity string `json:"severity"`
Message string `json:"message"`
} `json:"alerts"`
}
// detectAlertSignals turns the output of an alert-listing tool call into
// deterministic signals, one SignalActiveAlert per alert whose severity is
// "critical" or "warning" (compared case-insensitively).
//
// It returns nil when the tool input's "action" field is not "list", or when
// tc.Output cannot be decoded as an alertList either directly or through
// tryParseEmbeddedJSON. When no alert qualifies, the returned slice is nil.
func detectAlertSignals(tc *ToolCallRecord) []DetectedSignal {
// Only the "list" action is inspected; every other action is ignored.
inputAction := extractInputField(tc.Input, "action")
if inputAction != "list" {
return nil
}
var signals []DetectedSignal
var data alertList
// Try a plain JSON decode first, then fall back to tryParseEmbeddedJSON.
// A failure of both is logged at debug level and yields no signals.
if err := json.Unmarshal([]byte(tc.Output), &data); err != nil {
if !tryParseEmbeddedJSON(tc.Output, &data) {
log.Debug().Err(err).Str("tool", tc.ToolName).Msg("patrol_signals: failed to parse alerts output")
return nil
}
}
for _, alert := range data.Alerts {
// Skip anything that is not critical or warning (e.g. "info").
sevLower := strings.ToLower(alert.Severity)
if sevLower != "critical" && sevLower != "warning" {
continue
}
resourceType := inferFindingResourceType(alert.ResourceID, alert.ResourceName)
// The lowercased alert severity is passed through as the suggested
// severity, and Evidence is built from the whole tool output rather
// than from the individual alert.
signals = append(signals, DetectedSignal{
SignalType: SignalActiveAlert,
ResourceID: alert.ResourceID,
ResourceName: alert.ResourceName,
ResourceType: resourceType,
SuggestedSeverity: sevLower,
Category: string(FindingCategoryGeneral),
Summary: "Active " + sevLower + " alert: " + alert.Message,
Evidence: truncateEvidence(tc.Output),
ToolCallID: tc.ID,
})
}
return signals
}
// --- Deduplication and matching ---
// deduplicateSignals removes duplicate signals with the same SignalType:ResourceID.

View file

@ -332,11 +332,19 @@ func TestDetectSignals_BackupsStaleFromSummaries(t *testing.T) {
}
}
func TestDetectSignals_ActiveAlert(t *testing.T) {
// TestDetectSignals_DoesNotMirrorAlerts locks in the decision that Patrol
// does not duplicate the Alerts surface. pulse_alerts is intentionally not
// in the deterministic signal extraction switch — alerts have their own
// canonical surface, lifecycle, and acknowledgement model. Mirroring them
// as Patrol findings double-counted, drained the health score, and
// produced bogus auto_resolved → re-detected cycles when the LLM
// resolved the mirrored finding while the underlying alert kept firing.
func TestDetectSignals_DoesNotMirrorAlerts(t *testing.T) {
output, _ := json.Marshal(map[string]interface{}{
"alerts": []map[string]interface{}{
{"id": "a1", "resource_id": "node1", "resource_name": "pve1", "severity": "critical", "message": "Node offline"},
{"id": "a2", "resource_id": "vm100", "resource_name": "web", "severity": "info", "message": "Minor issue"},
{"id": "a2", "resource_id": "vm100", "resource_name": "web", "severity": "warning", "message": "High CPU"},
{"id": "a3", "resource_id": "vm200", "resource_name": "db", "severity": "info", "message": "Minor issue"},
},
})
@ -351,14 +359,8 @@ func TestDetectSignals_ActiveAlert(t *testing.T) {
}
signals := DetectSignals(toolCalls, DefaultSignalThresholds())
if len(signals) != 1 {
t.Fatalf("expected 1 signal (only critical/warning alerts), got %d", len(signals))
}
if signals[0].SignalType != SignalActiveAlert {
t.Errorf("expected SignalActiveAlert, got %s", signals[0].SignalType)
}
if signals[0].SuggestedSeverity != "critical" {
t.Errorf("expected critical severity pass-through, got %s", signals[0].SuggestedSeverity)
if len(signals) != 0 {
t.Fatalf("expected no signals from pulse_alerts output (alerts must not be mirrored as Patrol findings), got %d: %+v", len(signals), signals)
}
}