diff --git a/docs/release-control/v6/internal/subsystems/ai-runtime.md b/docs/release-control/v6/internal/subsystems/ai-runtime.md index eab648b85..84db3ca95 100644 --- a/docs/release-control/v6/internal/subsystems/ai-runtime.md +++ b/docs/release-control/v6/internal/subsystems/ai-runtime.md @@ -580,6 +580,21 @@ runtime cost control, and shared AI transport surfaces. ## Current State +Patrol's deterministic signal extraction (`internal/ai/patrol_signals.go`) +does not mirror the Alerts surface. The `pulse_alerts` tool output is +intentionally absent from the per-tool-call signal switch (`detectSignalsFromToolCall`, reached via `DetectSignals`) — alerts +already have their own canonical surface, lifecycle, and operator +acknowledgement model, and the `SignalActiveAlert` mirror path has +been removed. Mirroring previously double-counted alerts (every alert was +also a Patrol "Active alert detected" finding), dragged the health +score down for issues the operator already knew about, and produced +bogus `auto_resolved` → re-detected → regressed cycles when the LLM +explicitly resolved the mirrored finding while the underlying alert +kept firing. Patrol's job, per its own system prompt, is to surface +issues alerts cannot — trends, capacity risks, misconfigurations, +reliability gaps, cross-resource correlations. The Alerts page is +the canonical surface for currently-firing alerts. 
+ The overall health score (`calculateOverallHealth` in `internal/ai/intelligence.go`) tiers the "recent Patrol errors" coverage factor by the ratio of errored runs to relevant runs in the scoring diff --git a/internal/ai/patrol_ai.go b/internal/ai/patrol_ai.go index 0be1f8409..8b2c93de1 100644 --- a/internal/ai/patrol_ai.go +++ b/internal/ai/patrol_ai.go @@ -996,8 +996,6 @@ func signalKey(s DetectedSignal) string { return "backup-failed" case SignalBackupStale: return "backup-stale" - case SignalActiveAlert: - return "active-alert" case SignalGuestUnreachable: return "guest-unreachable" default: @@ -1019,8 +1017,6 @@ func signalTitle(s DetectedSignal) string { return "Backup failed" case SignalBackupStale: return "Backup is stale" - case SignalActiveAlert: - return "Active alert detected" case SignalGuestUnreachable: return fmt.Sprintf("Guest unreachable: %s", s.ResourceName) default: @@ -1042,8 +1038,6 @@ func defaultRecommendationForSignal(s DetectedSignal) string { return "Review backup logs and fix the underlying error, then rerun the backup." case SignalBackupStale: return "Ensure backups are scheduled and completing successfully; run a new backup." - case SignalActiveAlert: - return "Investigate the active alert and resolve the underlying issue." case SignalGuestUnreachable: return "Investigate why this guest is not responding to ping. Check network configuration, firewall rules, or whether the guest has crashed." 
default: diff --git a/internal/ai/patrol_ai_eval_additional_test.go b/internal/ai/patrol_ai_eval_additional_test.go index 94a00ebbc..9a7a0e4d7 100644 --- a/internal/ai/patrol_ai_eval_additional_test.go +++ b/internal/ai/patrol_ai_eval_additional_test.go @@ -73,7 +73,6 @@ func TestSignalHelpersAndFindingsFromSignals(t *testing.T) { {signal: DetectedSignal{SignalType: SignalHighDisk}, wantKey: "disk-high", wantTitle: "Storage usage is high", recSubstring: "storage"}, {signal: DetectedSignal{SignalType: SignalBackupFailed}, wantKey: "backup-failed", wantTitle: "Backup failed", recSubstring: "backup"}, {signal: DetectedSignal{SignalType: SignalBackupStale}, wantKey: "backup-stale", wantTitle: "Backup is stale", recSubstring: "backup"}, - {signal: DetectedSignal{SignalType: SignalActiveAlert}, wantKey: "active-alert", wantTitle: "Active alert detected", recSubstring: "alert"}, {signal: DetectedSignal{SignalType: SignalGuestUnreachable, ResourceName: "db-server"}, wantKey: "guest-unreachable", wantTitle: "Guest unreachable: db-server", recSubstring: "ping"}, {signal: DetectedSignal{SignalType: SignalType("unknown")}, wantKey: "deterministic-signal", wantTitle: "Infrastructure signal detected", recSubstring: "Investigate"}, } diff --git a/internal/ai/patrol_signals.go b/internal/ai/patrol_signals.go index 1396e2471..44af8085e 100644 --- a/internal/ai/patrol_signals.go +++ b/internal/ai/patrol_signals.go @@ -78,7 +78,6 @@ const ( SignalHighDisk SignalType = "high_disk" SignalBackupFailed SignalType = "backup_failed" SignalBackupStale SignalType = "backup_stale" - SignalActiveAlert SignalType = "active_alert" SignalGuestUnreachable SignalType = "guest_unreachable" ) @@ -128,8 +127,14 @@ func detectSignalsFromToolCall(tc *ToolCallRecord, thresholds SignalThresholds) signals = append(signals, detectStorageSignals(tc, thresholds)...) case "pulse_metrics": signals = append(signals, detectMetricsSignals(tc, thresholds)...) 
- case "pulse_alerts": - signals = append(signals, detectAlertSignals(tc)...) + // pulse_alerts intentionally has no deterministic signal extraction. + // Patrol's job is to find issues alerts cannot — duplicating the + // alert list into Patrol findings was double-counting (alerts + // already have their own lifecycle, surface, and acknowledgement + // model) and produced regression cycles when the LLM resolved the + // mirrored finding while the underlying alert kept firing. The + // Alerts page is the canonical surface for currently-firing + // alerts; Patrol should stay quiet here. } return signals @@ -688,59 +693,6 @@ func detectMetricsSignals(tc *ToolCallRecord, thresholds SignalThresholds) []Det return signals } -// --- Alert signals --- - -// alertList is the minimal struct for parsing alert list output. -type alertList struct { - Alerts []struct { - ID string `json:"id"` - ResourceID string `json:"resource_id"` - ResourceName string `json:"resource_name"` - Type string `json:"type"` - Severity string `json:"severity"` - Message string `json:"message"` - } `json:"alerts"` -} - -func detectAlertSignals(tc *ToolCallRecord) []DetectedSignal { - inputAction := extractInputField(tc.Input, "action") - if inputAction != "list" { - return nil - } - - var signals []DetectedSignal - var data alertList - if err := json.Unmarshal([]byte(tc.Output), &data); err != nil { - if !tryParseEmbeddedJSON(tc.Output, &data) { - log.Debug().Err(err).Str("tool", tc.ToolName).Msg("patrol_signals: failed to parse alerts output") - return nil - } - } - - for _, alert := range data.Alerts { - sevLower := strings.ToLower(alert.Severity) - if sevLower != "critical" && sevLower != "warning" { - continue - } - - resourceType := inferFindingResourceType(alert.ResourceID, alert.ResourceName) - - signals = append(signals, DetectedSignal{ - SignalType: SignalActiveAlert, - ResourceID: alert.ResourceID, - ResourceName: alert.ResourceName, - ResourceType: resourceType, - SuggestedSeverity: 
sevLower, - Category: string(FindingCategoryGeneral), - Summary: "Active " + sevLower + " alert: " + alert.Message, - Evidence: truncateEvidence(tc.Output), - ToolCallID: tc.ID, - }) - } - - return signals -} - // --- Deduplication and matching --- // deduplicateSignals removes duplicate signals with the same SignalType:ResourceID. diff --git a/internal/ai/patrol_signals_test.go b/internal/ai/patrol_signals_test.go index abe8825a5..4a52979f7 100644 --- a/internal/ai/patrol_signals_test.go +++ b/internal/ai/patrol_signals_test.go @@ -332,11 +332,19 @@ func TestDetectSignals_BackupsStaleFromSummaries(t *testing.T) { } } -func TestDetectSignals_ActiveAlert(t *testing.T) { +// TestDetectSignals_DoesNotMirrorAlerts locks in the decision that Patrol +// does not duplicate the Alerts surface. pulse_alerts is intentionally not +// in the deterministic signal extraction switch — alerts have their own +// canonical surface, lifecycle, and acknowledgement model. Mirroring them +// as Patrol findings double-counted, drained the health score, and +// produced bogus auto_resolved → re-detected cycles when the LLM +// resolved the mirrored finding while the underlying alert kept firing. 
+func TestDetectSignals_DoesNotMirrorAlerts(t *testing.T) { output, _ := json.Marshal(map[string]interface{}{ "alerts": []map[string]interface{}{ {"id": "a1", "resource_id": "node1", "resource_name": "pve1", "severity": "critical", "message": "Node offline"}, - {"id": "a2", "resource_id": "vm100", "resource_name": "web", "severity": "info", "message": "Minor issue"}, + {"id": "a2", "resource_id": "vm100", "resource_name": "web", "severity": "warning", "message": "High CPU"}, + {"id": "a3", "resource_id": "vm200", "resource_name": "db", "severity": "info", "message": "Minor issue"}, }, }) @@ -351,14 +359,8 @@ func TestDetectSignals_ActiveAlert(t *testing.T) { } signals := DetectSignals(toolCalls, DefaultSignalThresholds()) - if len(signals) != 1 { - t.Fatalf("expected 1 signal (only critical/warning alerts), got %d", len(signals)) - } - if signals[0].SignalType != SignalActiveAlert { - t.Errorf("expected SignalActiveAlert, got %s", signals[0].SignalType) - } - if signals[0].SuggestedSeverity != "critical" { - t.Errorf("expected critical severity pass-through, got %s", signals[0].SuggestedSeverity) + if len(signals) != 0 { + t.Fatalf("expected no signals from pulse_alerts output (alerts must not be mirrored as Patrol findings), got %d: %+v", len(signals), signals) } }