mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-05-19 16:27:37 +00:00
Stop mirroring alerts into Patrol findings
The deterministic signal pipeline ran the pulse_alerts tool output through detectAlertSignals and produced a SignalActiveAlert for every firing alert, which Patrol then materialized as an "Active alert detected" finding (source: ai-analysis, category: general). The system prompt at the top of patrol_ai.go explicitly tells the LLM not to duplicate alerts — but the deterministic emitter was duplicating them anyway, behind the LLM's back.

Symptoms observed in the wild:

- 9 active "Active alert detected" findings in Patrol, every one a duplicate of an existing alert already on the Alerts page.
- The LLM, doing what the prompt told it, resolved each mirrored finding via patrol_resolve_finding. On the next run the alert was still firing and Patrol re-emitted the signal → finding regressed. Lifecycle showed several auto_resolved → re-detected → regressed cycles per finding within hours.
- Health score dragged down by issues the operator already saw on the Alerts page, with no operator action possible from Patrol that wasn't already available from Alerts.

Rip out detectAlertSignals entirely, remove the pulse_alerts case from the signal-extraction switch, and drop SignalActiveAlert plus its key / title / recommendation entries. Convert the prior TestDetectSignals_ActiveAlert into a regression guard that locks in the no-mirror behavior.

Updates the ai-runtime subsystem Current State to record the decision: Patrol does not duplicate the Alerts surface; alerts own their own lifecycle, surface, and acknowledgement model.
This commit is contained in:
parent
7bd596d378
commit
271d12ecab
5 changed files with 35 additions and 73 deletions
|
|
@ -580,6 +580,21 @@ runtime cost control, and shared AI transport surfaces.
|
|||
|
||||
## Current State
|
||||
|
||||
Patrol deterministic signal extraction (`internal/ai/patrol_signals.go`)
|
||||
does not mirror the Alerts surface. The `pulse_alerts` tool output is
|
||||
intentionally absent from the signal switch in `DetectSignals` — alerts
|
||||
already have their own canonical surface, lifecycle, and operator
|
||||
acknowledgement model, and the `SignalActiveAlert` mirror path has
|
||||
been removed. Mirroring previously double-counted (every alert was
|
||||
also a Patrol "Active alert detected" finding), dragged the health
|
||||
score down for issues the operator already knew about, and produced
|
||||
bogus `auto_resolved` → re-detected → regressed cycles when the LLM
|
||||
explicitly resolved the mirrored finding while the underlying alert
|
||||
kept firing. Patrol's job, per its own system prompt, is to surface
|
||||
issues alerts cannot — trends, capacity risks, misconfigurations,
|
||||
reliability gaps, cross-resource correlations. The Alerts page is
|
||||
the canonical surface for currently-firing alerts.
|
||||
|
||||
The overall health score (`calculateOverallHealth` in
|
||||
`internal/ai/intelligence.go`) tiers the "recent Patrol errors" coverage
|
||||
factor by the ratio of errored runs to relevant runs in the scoring
|
||||
|
|
|
|||
|
|
@ -996,8 +996,6 @@ func signalKey(s DetectedSignal) string {
|
|||
return "backup-failed"
|
||||
case SignalBackupStale:
|
||||
return "backup-stale"
|
||||
case SignalActiveAlert:
|
||||
return "active-alert"
|
||||
case SignalGuestUnreachable:
|
||||
return "guest-unreachable"
|
||||
default:
|
||||
|
|
@ -1019,8 +1017,6 @@ func signalTitle(s DetectedSignal) string {
|
|||
return "Backup failed"
|
||||
case SignalBackupStale:
|
||||
return "Backup is stale"
|
||||
case SignalActiveAlert:
|
||||
return "Active alert detected"
|
||||
case SignalGuestUnreachable:
|
||||
return fmt.Sprintf("Guest unreachable: %s", s.ResourceName)
|
||||
default:
|
||||
|
|
@ -1042,8 +1038,6 @@ func defaultRecommendationForSignal(s DetectedSignal) string {
|
|||
return "Review backup logs and fix the underlying error, then rerun the backup."
|
||||
case SignalBackupStale:
|
||||
return "Ensure backups are scheduled and completing successfully; run a new backup."
|
||||
case SignalActiveAlert:
|
||||
return "Investigate the active alert and resolve the underlying issue."
|
||||
case SignalGuestUnreachable:
|
||||
return "Investigate why this guest is not responding to ping. Check network configuration, firewall rules, or whether the guest has crashed."
|
||||
default:
|
||||
|
|
|
|||
|
|
@ -73,7 +73,6 @@ func TestSignalHelpersAndFindingsFromSignals(t *testing.T) {
|
|||
{signal: DetectedSignal{SignalType: SignalHighDisk}, wantKey: "disk-high", wantTitle: "Storage usage is high", recSubstring: "storage"},
|
||||
{signal: DetectedSignal{SignalType: SignalBackupFailed}, wantKey: "backup-failed", wantTitle: "Backup failed", recSubstring: "backup"},
|
||||
{signal: DetectedSignal{SignalType: SignalBackupStale}, wantKey: "backup-stale", wantTitle: "Backup is stale", recSubstring: "backup"},
|
||||
{signal: DetectedSignal{SignalType: SignalActiveAlert}, wantKey: "active-alert", wantTitle: "Active alert detected", recSubstring: "alert"},
|
||||
{signal: DetectedSignal{SignalType: SignalGuestUnreachable, ResourceName: "db-server"}, wantKey: "guest-unreachable", wantTitle: "Guest unreachable: db-server", recSubstring: "ping"},
|
||||
{signal: DetectedSignal{SignalType: SignalType("unknown")}, wantKey: "deterministic-signal", wantTitle: "Infrastructure signal detected", recSubstring: "Investigate"},
|
||||
}
|
||||
|
|
|
|||
|
|
@ -78,7 +78,6 @@ const (
|
|||
SignalHighDisk SignalType = "high_disk"
|
||||
SignalBackupFailed SignalType = "backup_failed"
|
||||
SignalBackupStale SignalType = "backup_stale"
|
||||
SignalActiveAlert SignalType = "active_alert"
|
||||
SignalGuestUnreachable SignalType = "guest_unreachable"
|
||||
)
|
||||
|
||||
|
|
@ -128,8 +127,14 @@ func detectSignalsFromToolCall(tc *ToolCallRecord, thresholds SignalThresholds)
|
|||
signals = append(signals, detectStorageSignals(tc, thresholds)...)
|
||||
case "pulse_metrics":
|
||||
signals = append(signals, detectMetricsSignals(tc, thresholds)...)
|
||||
case "pulse_alerts":
|
||||
signals = append(signals, detectAlertSignals(tc)...)
|
||||
// pulse_alerts intentionally has no deterministic signal extraction.
|
||||
// Patrol's job is to find issues alerts cannot — duplicating the
|
||||
// alert list into Patrol findings was double-counting (alerts
|
||||
// already have their own lifecycle, surface, and acknowledgement
|
||||
// model) and produced regression cycles when the LLM resolved the
|
||||
// mirrored finding while the underlying alert kept firing. The
|
||||
// Alerts page is the canonical surface for currently-firing
|
||||
// alerts; Patrol should stay quiet here.
|
||||
}
|
||||
|
||||
return signals
|
||||
|
|
@ -688,59 +693,6 @@ func detectMetricsSignals(tc *ToolCallRecord, thresholds SignalThresholds) []Det
|
|||
return signals
|
||||
}
|
||||
|
||||
// --- Alert signals ---
|
||||
|
||||
// alertList is the minimal struct for parsing alert list output.
|
||||
type alertList struct {
|
||||
Alerts []struct {
|
||||
ID string `json:"id"`
|
||||
ResourceID string `json:"resource_id"`
|
||||
ResourceName string `json:"resource_name"`
|
||||
Type string `json:"type"`
|
||||
Severity string `json:"severity"`
|
||||
Message string `json:"message"`
|
||||
} `json:"alerts"`
|
||||
}
|
||||
|
||||
func detectAlertSignals(tc *ToolCallRecord) []DetectedSignal {
|
||||
inputAction := extractInputField(tc.Input, "action")
|
||||
if inputAction != "list" {
|
||||
return nil
|
||||
}
|
||||
|
||||
var signals []DetectedSignal
|
||||
var data alertList
|
||||
if err := json.Unmarshal([]byte(tc.Output), &data); err != nil {
|
||||
if !tryParseEmbeddedJSON(tc.Output, &data) {
|
||||
log.Debug().Err(err).Str("tool", tc.ToolName).Msg("patrol_signals: failed to parse alerts output")
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
for _, alert := range data.Alerts {
|
||||
sevLower := strings.ToLower(alert.Severity)
|
||||
if sevLower != "critical" && sevLower != "warning" {
|
||||
continue
|
||||
}
|
||||
|
||||
resourceType := inferFindingResourceType(alert.ResourceID, alert.ResourceName)
|
||||
|
||||
signals = append(signals, DetectedSignal{
|
||||
SignalType: SignalActiveAlert,
|
||||
ResourceID: alert.ResourceID,
|
||||
ResourceName: alert.ResourceName,
|
||||
ResourceType: resourceType,
|
||||
SuggestedSeverity: sevLower,
|
||||
Category: string(FindingCategoryGeneral),
|
||||
Summary: "Active " + sevLower + " alert: " + alert.Message,
|
||||
Evidence: truncateEvidence(tc.Output),
|
||||
ToolCallID: tc.ID,
|
||||
})
|
||||
}
|
||||
|
||||
return signals
|
||||
}
|
||||
|
||||
// --- Deduplication and matching ---
|
||||
|
||||
// deduplicateSignals removes duplicate signals with the same SignalType:ResourceID.
|
||||
|
|
|
|||
|
|
@ -332,11 +332,19 @@ func TestDetectSignals_BackupsStaleFromSummaries(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestDetectSignals_ActiveAlert(t *testing.T) {
|
||||
// TestDetectSignals_DoesNotMirrorAlerts locks in the decision that Patrol
|
||||
// does not duplicate the Alerts surface. pulse_alerts is intentionally not
|
||||
// in the deterministic signal extraction switch — alerts have their own
|
||||
// canonical surface, lifecycle, and acknowledgement model. Mirroring them
|
||||
// as Patrol findings double-counted, drained the health score, and
|
||||
// produced bogus auto_resolved → re-detected cycles when the LLM
|
||||
// resolved the mirrored finding while the underlying alert kept firing.
|
||||
func TestDetectSignals_DoesNotMirrorAlerts(t *testing.T) {
|
||||
output, _ := json.Marshal(map[string]interface{}{
|
||||
"alerts": []map[string]interface{}{
|
||||
{"id": "a1", "resource_id": "node1", "resource_name": "pve1", "severity": "critical", "message": "Node offline"},
|
||||
{"id": "a2", "resource_id": "vm100", "resource_name": "web", "severity": "info", "message": "Minor issue"},
|
||||
{"id": "a2", "resource_id": "vm100", "resource_name": "web", "severity": "warning", "message": "High CPU"},
|
||||
{"id": "a3", "resource_id": "vm200", "resource_name": "db", "severity": "info", "message": "Minor issue"},
|
||||
},
|
||||
})
|
||||
|
||||
|
|
@ -351,14 +359,8 @@ func TestDetectSignals_ActiveAlert(t *testing.T) {
|
|||
}
|
||||
|
||||
signals := DetectSignals(toolCalls, DefaultSignalThresholds())
|
||||
if len(signals) != 1 {
|
||||
t.Fatalf("expected 1 signal (only critical/warning alerts), got %d", len(signals))
|
||||
}
|
||||
if signals[0].SignalType != SignalActiveAlert {
|
||||
t.Errorf("expected SignalActiveAlert, got %s", signals[0].SignalType)
|
||||
}
|
||||
if signals[0].SuggestedSeverity != "critical" {
|
||||
t.Errorf("expected critical severity pass-through, got %s", signals[0].SuggestedSeverity)
|
||||
if len(signals) != 0 {
|
||||
t.Fatalf("expected no signals from pulse_alerts output (alerts must not be mirrored as Patrol findings), got %d: %+v", len(signals), signals)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue