From 2ec625091ca8eaff87038fca0e7bf7b0ec8b8eea Mon Sep 17 00:00:00 2001 From: rcourtman Date: Wed, 25 Mar 2026 10:13:08 +0000 Subject: [PATCH] Degrade Patrol health on incomplete coverage --- .../v6/internal/subsystems/ai-runtime.md | 10 ++ internal/ai/intelligence.go | 115 +++++++++++++++++- internal/ai/intelligence_coverage_test.go | 50 +++++++- internal/ai/patrol_init.go | 1 + internal/ai/patrol_run.go | 36 ++++-- internal/ai/patrol_run_test.go | 34 ++++++ 6 files changed, 229 insertions(+), 17 deletions(-) diff --git a/docs/release-control/v6/internal/subsystems/ai-runtime.md b/docs/release-control/v6/internal/subsystems/ai-runtime.md index a4c3ffc0b..89a5ae443 100644 --- a/docs/release-control/v6/internal/subsystems/ai-runtime.md +++ b/docs/release-control/v6/internal/subsystems/ai-runtime.md @@ -463,6 +463,16 @@ That runtime-state contract must be derived from live Patrol runtime inputs, not only from the last failed run attempt: exhausted quickstart credits are a blocked Patrol runtime immediately, and the backend must also clear any stale quickstart block once credits or BYOK configuration return. +The same runtime contract now also governs when the system-wide Patrol health +summary is allowed to read as healthy. `internal/ai/intelligence.go` must not +derive `Health A` or `100/100` from "no active findings" alone when recent +Patrol evidence is limited to alert-scoped runs or includes recent Patrol run +errors; the summary must degrade and explain that overall infrastructure health +is not fully verified until a recent successful full Patrol run exists. +The Patrol startup scheduler must preserve that coverage guarantee as well: +`internal/ai/patrol_run.go` may skip the startup full patrol only when recent +run history already includes a successful full Patrol run, not merely because +some recent scoped alert-triggered run exists. AI chat tool-name labels, pending-tool headers, and assistant status copy now also route through the shared frontend identifier-label helper, so the chat surfaces do not keep their own underscore-stripping behavior separate from diff --git a/internal/ai/intelligence.go b/internal/ai/intelligence.go index 31b6364cf..75333eb8c 100644 --- a/internal/ai/intelligence.go +++ b/internal/ai/intelligence.go @@ -153,6 +153,7 @@ type Intelligence struct { knowledge *knowledge.Store changes *memory.ChangeDetector remediations *memory.RemediationLog + runHistoryStore *PatrolRunHistoryStore resourceTimelineStore unifiedresources.ResourceStore resourceTimelineStoreOrgID string unifiedResourceProvider UnifiedResourceProvider @@ -167,6 +168,11 @@ type Intelligence struct { dataDir string } +const ( + intelligencePatrolCoverageWindow = 24 * time.Hour + intelligenceRecentRunLimit = 10 +) + // IntelligenceConfig configures the unified intelligence layer type IntelligenceConfig struct { DataDir string @@ -212,6 +218,14 @@ func (i *Intelligence) SetResourceTimelineStore(store unifiedresources.ResourceS i.resourceTimelineStoreOrgID = strings.TrimSpace(orgID) } +// SetRunHistoryStore wires Patrol run history so intelligence summaries can +// distinguish broad successful coverage from recent scoped or incomplete runs. +func (i *Intelligence) SetRunHistoryStore(store *PatrolRunHistoryStore) { + i.mu.Lock() + defer i.mu.Unlock() + i.runHistoryStore = store +} + // SetUnifiedResourceProvider wires the canonical unified resource provider used // for infrastructure-wide posture summaries. 
func (i *Intelligence) SetUnifiedResourceProvider(urp UnifiedResourceProvider) { @@ -237,6 +251,7 @@ func (i *Intelligence) GetSummary() *IntelligenceSummary { findings := i.findings patternsDetector := i.patterns remediations := i.remediations + runHistoryStore := i.runHistoryStore unifiedResourceProvider := i.unifiedResourceProvider i.mu.RUnlock() @@ -275,7 +290,7 @@ func (i *Intelligence) GetSummary() *IntelligenceSummary { summary.Learning = i.getLearningStats() // Calculate overall health - summary.OverallHealth = i.calculateOverallHealth(summary) + summary.OverallHealth = i.calculateOverallHealth(summary, runHistoryStore) // Resources at risk summary.ResourcesAtRisk = i.getResourcesAtRisk(5) @@ -798,7 +813,7 @@ func (i *Intelligence) FormatCorrelationsContext(resourceID string) string { return detector.FormatForContext(resourceID) } -func (i *Intelligence) calculateOverallHealth(summary *IntelligenceSummary) HealthScore { +func (i *Intelligence) calculateOverallHealth(summary *IntelligenceSummary, runHistoryStore *PatrolRunHistoryStore) HealthScore { health := HealthScore{ Score: 100, Grade: HealthGradeA, @@ -849,6 +864,16 @@ func (i *Intelligence) calculateOverallHealth(summary *IntelligenceSummary) Heal } } + if factor, ok := summarizeRecentPatrolCoverage(runHistoryStore, time.Now()); ok { + health.Score -= factor.impact + health.Factors = append(health.Factors, HealthFactor{ + Name: factor.name, + Impact: -factor.impact / 100, + Description: factor.description, + Category: "coverage", + }) + } + // Bonus for learning progress if summary.Learning.ResourcesWithKnowledge > 5 { bonus := 5.0 @@ -988,6 +1013,12 @@ func scoreToGrade(score float64) HealthGrade { } func (i *Intelligence) generateHealthPrediction(health HealthScore, summary *IntelligenceSummary) string { + for _, factor := range health.Factors { + if factor.Category == "coverage" { + return factor.Description + } + } + if health.Grade == HealthGradeA { return "Infrastructure is healthy with no significant issues detected." } @@ -1009,6 +1040,86 @@ func (i *Intelligence) generateHealthPrediction(health HealthScore, summary *Int return "Infrastructure is stable with minor issues to monitor." 
 }
+type patrolCoverageFactor struct {
+	name        string
+	description string
+	impact      float64
+}
+
+func summarizeRecentPatrolCoverage(
+	runHistoryStore *PatrolRunHistoryStore,
+	now time.Time,
+) (patrolCoverageFactor, bool) {
+	if runHistoryStore == nil {
+		return patrolCoverageFactor{}, false
+	}
+
+	recentRuns := runHistoryStore.GetRecent(intelligenceRecentRunLimit)
+	if len(recentRuns) == 0 {
+		return patrolCoverageFactor{}, false
+	}
+
+	cutoff := now.Add(-intelligencePatrolCoverageWindow)
+	relevant := make([]PatrolRunRecord, 0, len(recentRuns))
+	for _, run := range recentRuns {
+		if run.CompletedAt.IsZero() || run.CompletedAt.Before(cutoff) {
+			continue
+		}
+		relevant = append(relevant, run)
+	}
+	if len(relevant) == 0 {
+		return patrolCoverageFactor{}, false
+	}
+
+	var recentErrors int
+	var hasSuccessfulFullRun bool
+	var scopedRuns int
+	for _, run := range relevant {
+		if run.ErrorCount > 0 || strings.EqualFold(strings.TrimSpace(run.Status), "error") {
+			recentErrors++
+		}
+		if isSuccessfulFullPatrolRun(run) {
+			hasSuccessfulFullRun = true
+		}
+		if isScopedPatrolRun(run) {
+			scopedRuns++
+		}
+	}
+
+	switch {
+	case !hasSuccessfulFullRun && recentErrors > 0:
+		return patrolCoverageFactor{
+			name:        "Patrol coverage incomplete",
+			description: "Patrol coverage is incomplete: no recent full Patrol run has succeeded and recent runs included errors, so overall infrastructure health is not fully verified.",
+			impact:      35,
+		}, true
+	case !hasSuccessfulFullRun && scopedRuns == len(relevant):
+		return patrolCoverageFactor{
+			name:        "Patrol coverage incomplete",
+			description: "Patrol coverage is incomplete: recent activity was limited to scoped runs, so overall infrastructure health is not fully verified.",
+			impact:      20,
+		}, true
+	case recentErrors > 0:
+		return patrolCoverageFactor{
+			name:        "Recent Patrol errors",
+			description: "Recent Patrol runs encountered errors, so the current health summary may be incomplete.",
+			impact:      10,
+		}, true
+	default:
+		return patrolCoverageFactor{}, false
+	}
+}
+
+func isScopedPatrolRun(run PatrolRunRecord) bool {
+	return strings.EqualFold(strings.TrimSpace(run.Type), "scoped")
+}
+
+func isSuccessfulFullPatrolRun(run PatrolRunRecord) bool {
+	return !isScopedPatrolRun(run) &&
+		run.ErrorCount == 0 &&
+		!strings.EqualFold(strings.TrimSpace(run.Status), "error")
+}
+
 func (i *Intelligence) getResourcesAtRisk(limit int) []ResourceRiskSummary {
 	if i.findings == nil {
 		return nil
diff --git a/internal/ai/intelligence_coverage_test.go b/internal/ai/intelligence_coverage_test.go
index 576769237..6a5eb6ee1 100644
--- a/internal/ai/intelligence_coverage_test.go
+++ b/internal/ai/intelligence_coverage_test.go
@@ -153,6 +153,52 @@ func TestIntelligence_getTopFindings_Empty(t *testing.T) {
 	}
 }
 
+func TestIntelligence_GetSummary_DegradesWhenRecentPatrolCoverageIsScopedAndErroring(t *testing.T) {
+	intel := NewIntelligence(IntelligenceConfig{})
+	runHistory := NewPatrolRunHistoryStore(10)
+	now := time.Now()
+	runHistory.Add(PatrolRunRecord{
+		ID:               "scoped-error-1",
+		Type:             "scoped",
+		TriggerReason:    "alert_fired",
+		CompletedAt:      now.Add(-5 * time.Minute),
+		ErrorCount:       1,
+		Status:           "error",
+		ResourcesChecked: 1,
+	})
+	runHistory.Add(PatrolRunRecord{
+		ID:               "scoped-error-2",
+		Type:             "scoped",
+		TriggerReason:    "alert_fired",
+		CompletedAt:      now.Add(-15 * time.Minute),
+		ErrorCount:       1,
+		Status:           "error",
+		ResourcesChecked: 1,
+	})
+	intel.SetRunHistoryStore(runHistory)
+
+	summary := intel.GetSummary()
+	if summary.OverallHealth.Score >= 100 {
+		t.Fatalf("expected reduced health score, 
got %f", summary.OverallHealth.Score) + } + if summary.OverallHealth.Grade == HealthGradeA { + t.Fatalf("expected non-A grade, got %s", summary.OverallHealth.Grade) + } + if !strings.Contains(summary.OverallHealth.Prediction, "not fully verified") { + t.Fatalf("expected coverage warning prediction, got %q", summary.OverallHealth.Prediction) + } + foundCoverageFactor := false + for _, factor := range summary.OverallHealth.Factors { + if factor.Category == "coverage" { + foundCoverageFactor = true + break + } + } + if !foundCoverageFactor { + t.Fatal("expected coverage factor in overall health") + } +} + func TestIntelligence_getLearningStats(t *testing.T) { intel := NewIntelligence(IntelligenceConfig{}) knowledgeStore, err := knowledge.NewStore(t.TempDir()) @@ -388,14 +434,14 @@ func TestIntelligence_calculateOverallHealth_Clamps(t *testing.T) { negative := intel.calculateOverallHealth(&IntelligenceSummary{ FindingsCount: FindingsCounts{Critical: 10, Warning: 10}, UpcomingRisks: predictions, - }) + }, nil) if negative.Score != 0 { t.Errorf("expected score clamped to 0, got %f", negative.Score) } positive := intel.calculateOverallHealth(&IntelligenceSummary{ Learning: LearningStats{ResourcesWithKnowledge: 10}, - }) + }, nil) if positive.Score != 100 { t.Errorf("expected score clamped to 100, got %f", positive.Score) } diff --git a/internal/ai/patrol_init.go b/internal/ai/patrol_init.go index e49f387c5..8ebb85537 100644 --- a/internal/ai/patrol_init.go +++ b/internal/ai/patrol_init.go @@ -782,6 +782,7 @@ func (p *PatrolService) GetIntelligence() *Intelligence { p.changeDetector, p.remediationLog, ) + p.intelligence.SetRunHistoryStore(p.runHistoryStore) if p.aiService != nil { p.aiService.mu.RLock() store := p.aiService.resourceExportStore diff --git a/internal/ai/patrol_run.go b/internal/ai/patrol_run.go index b70987ea9..01db35c6f 100644 --- a/internal/ai/patrol_run.go +++ b/internal/ai/patrol_run.go @@ -117,19 +117,8 @@ func (p *PatrolService) patrolLoop(ctx context.Context) { select { case <-initialTimer.C: // Check if a patrol ran recently (within last hour) to avoid wasting tokens on restarts - runHistory := p.GetRunHistory(1) - - skipInitial := false - if len(runHistory) > 0 { - lastRun := runHistory[0] - timeSinceLastRun := time.Since(lastRun.CompletedAt) - if timeSinceLastRun < 1*time.Hour { - log.Info(). - Dur("time_since_last", timeSinceLastRun). - Msg("AI Patrol: Skipping initial patrol - recent run exists") - skipInitial = true - } - } + runHistory := p.GetRunHistory(10) + skipInitial := shouldSkipInitialFullPatrol(runHistory, time.Now()) if !skipInitial { p.runPatrolWithTrigger(ctx, TriggerReasonStartup, nil) @@ -206,6 +195,27 @@ func (p *PatrolService) patrolLoop(ctx context.Context) { } } +func shouldSkipInitialFullPatrol(runHistory []PatrolRunRecord, now time.Time) bool { + for _, run := range runHistory { + if run.CompletedAt.IsZero() { + continue + } + timeSinceLastRun := now.Sub(run.CompletedAt) + if timeSinceLastRun >= 1*time.Hour { + continue + } + if isSuccessfulFullPatrolRun(run) { + log.Info(). + Dur("time_since_last", timeSinceLastRun). + Str("run_type", run.Type). + Str("trigger_reason", run.TriggerReason). 
+ Msg("AI Patrol: Skipping initial patrol - recent successful full run exists") + return true + } + } + return false +} + // runPatrol executes a scheduled patrol run func (p *PatrolService) runPatrol(ctx context.Context) { p.runPatrolWithTrigger(ctx, TriggerReasonScheduled, nil) diff --git a/internal/ai/patrol_run_test.go b/internal/ai/patrol_run_test.go index 9119153ee..5906f730e 100644 --- a/internal/ai/patrol_run_test.go +++ b/internal/ai/patrol_run_test.go @@ -1514,6 +1514,40 @@ func TestGetStatus_ClearsStaleQuickstartBlockedStateWhenCreditsReturn(t *testing } } +func TestShouldSkipInitialFullPatrol_IgnoresRecentScopedErrorRun(t *testing.T) { + now := time.Now() + skip := shouldSkipInitialFullPatrol([]PatrolRunRecord{ + { + ID: "scoped-error", + Type: "scoped", + TriggerReason: "alert_fired", + CompletedAt: now.Add(-5 * time.Minute), + ErrorCount: 1, + Status: "error", + }, + }, now) + if skip { + t.Fatal("expected recent scoped error run not to suppress initial full patrol") + } +} + +func TestShouldSkipInitialFullPatrol_SkipsAfterRecentSuccessfulFullRun(t *testing.T) { + now := time.Now() + skip := shouldSkipInitialFullPatrol([]PatrolRunRecord{ + { + ID: "full-success", + Type: "patrol", + TriggerReason: "scheduled", + CompletedAt: now.Add(-10 * time.Minute), + ErrorCount: 0, + Status: "healthy", + }, + }, now) + if !skip { + t.Fatal("expected recent successful full patrol to suppress initial full patrol") + } +} + // --- appendStreamContent --- func TestAppendStreamContent(t *testing.T) {
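Usage sketch (reviewer aid, not part of the patch): the new coverage path can be exercised roughly as below from inside package ai, since Intelligence, PatrolRunHistoryStore, and PatrolRunRecord are internal types. Constructor and field names are taken from the hunks above; the function name sketchScopedOnlyCoverage and the single clean scoped-run scenario are hypothetical, chosen to trigger the 20-point coverage factor rather than the 35-point error case the new test already covers.

// Sketch only: assumes it compiles inside package ai next to the patched
// files, with "fmt" and "time" imported.
func sketchScopedOnlyCoverage() {
	intel := NewIntelligence(IntelligenceConfig{})
	history := NewPatrolRunHistoryStore(10)

	// One recent scoped run with no errors: summarizeRecentPatrolCoverage
	// finds no successful full run and scopedRuns == len(relevant), so it
	// emits the 20-point "Patrol coverage incomplete" factor.
	history.Add(PatrolRunRecord{
		ID:            "scoped-clean",
		Type:          "scoped",
		TriggerReason: "alert_fired",
		CompletedAt:   time.Now().Add(-10 * time.Minute),
		Status:        "healthy",
	})
	intel.SetRunHistoryStore(history)

	summary := intel.GetSummary()
	// With an otherwise empty Intelligence the score should land around 80,
	// and generateHealthPrediction echoes the coverage factor's description.
	fmt.Printf("score=%.0f grade=%s\n", summary.OverallHealth.Score, summary.OverallHealth.Grade)
	fmt.Println(summary.OverallHealth.Prediction)
}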