Degrade Patrol health on incomplete coverage

This commit is contained in:
rcourtman 2026-03-25 10:13:08 +00:00
parent 2afd685877
commit 2ec625091c
6 changed files with 229 additions and 17 deletions

View file

@ -463,6 +463,16 @@ That runtime-state contract must be derived from live Patrol runtime inputs,
not only from the last failed run attempt: exhausted quickstart credits are a
blocked Patrol runtime immediately, and the backend must also clear any stale
quickstart block once credits or BYOK configuration return.
The same runtime contract now also governs when the system-wide Patrol health
summary is allowed to read as healthy. `internal/ai/intelligence.go` must not
derive `Health A` or `100/100` from "no active findings" alone when recent
Patrol evidence is limited to alert-scoped runs or includes recent Patrol run
errors; the summary must degrade and explain that overall infrastructure health
is not fully verified until a recent successful full Patrol run exists.
The Patrol startup scheduler must preserve that coverage guarantee as well:
`internal/ai/patrol_run.go` may skip the startup full patrol only when recent
run history already includes a successful full Patrol run, not merely because
some recent scoped alert-triggered run exists.
AI chat tool-name labels, pending-tool headers, and assistant status copy now
also route through the shared frontend identifier-label helper, so the chat
surfaces do not keep their own underscore-stripping behavior separate from

View file

@ -153,6 +153,7 @@ type Intelligence struct {
knowledge *knowledge.Store
changes *memory.ChangeDetector
remediations *memory.RemediationLog
runHistoryStore *PatrolRunHistoryStore
resourceTimelineStore unifiedresources.ResourceStore
resourceTimelineStoreOrgID string
unifiedResourceProvider UnifiedResourceProvider
@ -167,6 +168,11 @@ type Intelligence struct {
dataDir string
}
const (
intelligencePatrolCoverageWindow = 24 * time.Hour
intelligenceRecentRunLimit = 10
)
// IntelligenceConfig configures the unified intelligence layer
type IntelligenceConfig struct {
DataDir string
@ -212,6 +218,14 @@ func (i *Intelligence) SetResourceTimelineStore(store unifiedresources.ResourceS
i.resourceTimelineStoreOrgID = strings.TrimSpace(orgID)
}
// SetRunHistoryStore wires Patrol run history into the intelligence layer so
// that summaries can tell broad, successful coverage apart from recent runs
// that were scoped or incomplete.
func (i *Intelligence) SetRunHistoryStore(store *PatrolRunHistoryStore) {
	i.mu.Lock()
	i.runHistoryStore = store
	i.mu.Unlock()
}
// SetUnifiedResourceProvider wires the canonical unified resource provider used
// for infrastructure-wide posture summaries.
func (i *Intelligence) SetUnifiedResourceProvider(urp UnifiedResourceProvider) {
@ -237,6 +251,7 @@ func (i *Intelligence) GetSummary() *IntelligenceSummary {
findings := i.findings
patternsDetector := i.patterns
remediations := i.remediations
runHistoryStore := i.runHistoryStore
unifiedResourceProvider := i.unifiedResourceProvider
i.mu.RUnlock()
@ -275,7 +290,7 @@ func (i *Intelligence) GetSummary() *IntelligenceSummary {
summary.Learning = i.getLearningStats()
// Calculate overall health
summary.OverallHealth = i.calculateOverallHealth(summary)
summary.OverallHealth = i.calculateOverallHealth(summary, runHistoryStore)
// Resources at risk
summary.ResourcesAtRisk = i.getResourcesAtRisk(5)
@ -798,7 +813,7 @@ func (i *Intelligence) FormatCorrelationsContext(resourceID string) string {
return detector.FormatForContext(resourceID)
}
func (i *Intelligence) calculateOverallHealth(summary *IntelligenceSummary) HealthScore {
func (i *Intelligence) calculateOverallHealth(summary *IntelligenceSummary, runHistoryStore *PatrolRunHistoryStore) HealthScore {
health := HealthScore{
Score: 100,
Grade: HealthGradeA,
@ -849,6 +864,16 @@ func (i *Intelligence) calculateOverallHealth(summary *IntelligenceSummary) Heal
}
}
if factor, ok := summarizeRecentPatrolCoverage(runHistoryStore, time.Now()); ok {
health.Score -= factor.impact
health.Factors = append(health.Factors, HealthFactor{
Name: factor.name,
Impact: -factor.impact / 100,
Description: factor.description,
Category: "coverage",
})
}
// Bonus for learning progress
if summary.Learning.ResourcesWithKnowledge > 5 {
bonus := 5.0
@ -988,6 +1013,12 @@ func scoreToGrade(score float64) HealthGrade {
}
func (i *Intelligence) generateHealthPrediction(health HealthScore, summary *IntelligenceSummary) string {
for _, factor := range health.Factors {
if factor.Category == "coverage" {
return factor.Description
}
}
if health.Grade == HealthGradeA {
return "Infrastructure is healthy with no significant issues detected."
}
@ -1009,6 +1040,86 @@ func (i *Intelligence) generateHealthPrediction(health HealthScore, summary *Int
return "Infrastructure is stable with minor issues to monitor."
}
// patrolCoverageFactor describes a health penalty derived from recent Patrol
// run coverage; it is converted into a HealthFactor on the overall health
// summary by calculateOverallHealth.
type patrolCoverageFactor struct {
	name        string  // short label used as the HealthFactor name
	description string  // human-readable explanation of the coverage gap
	impact      float64 // points subtracted from the overall health score
}
// summarizeRecentPatrolCoverage inspects recent Patrol run history inside the
// coverage window and reports a health penalty when that evidence is limited
// to scoped runs, tainted by run errors, or both. The boolean result is false
// when no run history is wired, when no completed runs fall inside the
// window, or when a successful full run exists with no recent errors.
func summarizeRecentPatrolCoverage(
	runHistoryStore *PatrolRunHistoryStore,
	now time.Time,
) (patrolCoverageFactor, bool) {
	// No wired history means no coverage evidence to judge either way.
	if runHistoryStore == nil {
		return patrolCoverageFactor{}, false
	}

	history := runHistoryStore.GetRecent(intelligenceRecentRunLimit)
	if len(history) == 0 {
		return patrolCoverageFactor{}, false
	}

	// Only completed runs inside the coverage window count as evidence.
	windowStart := now.Add(-intelligencePatrolCoverageWindow)
	var windowed []PatrolRunRecord
	for _, record := range history {
		if record.CompletedAt.IsZero() {
			continue
		}
		if record.CompletedAt.Before(windowStart) {
			continue
		}
		windowed = append(windowed, record)
	}
	if len(windowed) == 0 {
		return patrolCoverageFactor{}, false
	}

	erroredRuns := 0
	scopedCount := 0
	fullRunSucceeded := false
	for _, record := range windowed {
		if record.ErrorCount > 0 || strings.EqualFold(strings.TrimSpace(record.Status), "error") {
			erroredRuns++
		}
		if isSuccessfulFullPatrolRun(record) {
			fullRunSucceeded = true
		}
		if isScopedPatrolRun(record) {
			scopedCount++
		}
	}

	// Worst case first: no verified full run and recent errors on top of it.
	if !fullRunSucceeded && erroredRuns > 0 {
		return patrolCoverageFactor{
			name:        "Patrol coverage incomplete",
			description: "Patrol coverage is incomplete: recent activity was limited to scoped runs and ended with errors, so overall health is not fully verified.",
			impact:      35,
		}, true
	}
	// No full run succeeded, but every windowed run was scoped (and clean).
	if !fullRunSucceeded && scopedCount == len(windowed) {
		return patrolCoverageFactor{
			name:        "Patrol coverage incomplete",
			description: "Patrol coverage is incomplete: recent activity was limited to scoped runs, so overall infrastructure health is not fully verified.",
			impact:      20,
		}, true
	}
	// A full run succeeded, but errors still cloud the picture slightly.
	if erroredRuns > 0 {
		return patrolCoverageFactor{
			name:        "Recent Patrol errors",
			description: "Recent Patrol runs encountered errors, so the current health summary may be incomplete.",
			impact:      10,
		}, true
	}
	return patrolCoverageFactor{}, false
}
// isScopedPatrolRun reports whether the record describes a scoped run (for
// example an alert-triggered check) rather than a full patrol. The run type
// is matched case-insensitively after trimming surrounding whitespace.
func isScopedPatrolRun(run PatrolRunRecord) bool {
	runType := strings.TrimSpace(run.Type)
	return strings.EqualFold(runType, "scoped")
}
// isSuccessfulFullPatrolRun reports whether the record describes a full
// (non-scoped) patrol that completed with zero errors and a non-error status.
func isSuccessfulFullPatrolRun(run PatrolRunRecord) bool {
	if isScopedPatrolRun(run) {
		return false
	}
	if run.ErrorCount != 0 {
		return false
	}
	return !strings.EqualFold(strings.TrimSpace(run.Status), "error")
}
func (i *Intelligence) getResourcesAtRisk(limit int) []ResourceRiskSummary {
if i.findings == nil {
return nil

View file

@ -153,6 +153,52 @@ func TestIntelligence_getTopFindings_Empty(t *testing.T) {
}
}
// TestIntelligence_GetSummary_DegradesWhenRecentPatrolCoverageIsScopedAndErroring
// verifies that when recent Patrol evidence is only scoped, erroring runs, the
// overall health score drops below perfect, the grade is no longer A, the
// prediction carries the coverage warning, and a coverage factor is attached.
func TestIntelligence_GetSummary_DegradesWhenRecentPatrolCoverageIsScopedAndErroring(t *testing.T) {
	svc := NewIntelligence(IntelligenceConfig{})
	history := NewPatrolRunHistoryStore(10)
	base := time.Now()

	scopedErrorRuns := []PatrolRunRecord{
		{
			ID:               "scoped-error-1",
			Type:             "scoped",
			TriggerReason:    "alert_fired",
			CompletedAt:      base.Add(-5 * time.Minute),
			ErrorCount:       1,
			Status:           "error",
			ResourcesChecked: 1,
		},
		{
			ID:               "scoped-error-2",
			Type:             "scoped",
			TriggerReason:    "alert_fired",
			CompletedAt:      base.Add(-15 * time.Minute),
			ErrorCount:       1,
			Status:           "error",
			ResourcesChecked: 1,
		},
	}
	for _, record := range scopedErrorRuns {
		history.Add(record)
	}
	svc.SetRunHistoryStore(history)

	health := svc.GetSummary().OverallHealth
	if health.Score >= 100 {
		t.Fatalf("expected reduced health score, got %f", health.Score)
	}
	if health.Grade == HealthGradeA {
		t.Fatalf("expected non-A grade, got %s", health.Grade)
	}
	if !strings.Contains(health.Prediction, "not fully verified") {
		t.Fatalf("expected coverage warning prediction, got %q", health.Prediction)
	}

	hasCoverageFactor := false
	for _, factor := range health.Factors {
		if factor.Category == "coverage" {
			hasCoverageFactor = true
			break
		}
	}
	if !hasCoverageFactor {
		t.Fatal("expected coverage factor in overall health")
	}
}
func TestIntelligence_getLearningStats(t *testing.T) {
intel := NewIntelligence(IntelligenceConfig{})
knowledgeStore, err := knowledge.NewStore(t.TempDir())
@ -388,14 +434,14 @@ func TestIntelligence_calculateOverallHealth_Clamps(t *testing.T) {
negative := intel.calculateOverallHealth(&IntelligenceSummary{
FindingsCount: FindingsCounts{Critical: 10, Warning: 10},
UpcomingRisks: predictions,
})
}, nil)
if negative.Score != 0 {
t.Errorf("expected score clamped to 0, got %f", negative.Score)
}
positive := intel.calculateOverallHealth(&IntelligenceSummary{
Learning: LearningStats{ResourcesWithKnowledge: 10},
})
}, nil)
if positive.Score != 100 {
t.Errorf("expected score clamped to 100, got %f", positive.Score)
}

View file

@ -782,6 +782,7 @@ func (p *PatrolService) GetIntelligence() *Intelligence {
p.changeDetector,
p.remediationLog,
)
p.intelligence.SetRunHistoryStore(p.runHistoryStore)
if p.aiService != nil {
p.aiService.mu.RLock()
store := p.aiService.resourceExportStore

View file

@ -117,19 +117,8 @@ func (p *PatrolService) patrolLoop(ctx context.Context) {
select {
case <-initialTimer.C:
// Check if a patrol ran recently (within last hour) to avoid wasting tokens on restarts
runHistory := p.GetRunHistory(1)
skipInitial := false
if len(runHistory) > 0 {
lastRun := runHistory[0]
timeSinceLastRun := time.Since(lastRun.CompletedAt)
if timeSinceLastRun < 1*time.Hour {
log.Info().
Dur("time_since_last", timeSinceLastRun).
Msg("AI Patrol: Skipping initial patrol - recent run exists")
skipInitial = true
}
}
runHistory := p.GetRunHistory(10)
skipInitial := shouldSkipInitialFullPatrol(runHistory, time.Now())
if !skipInitial {
p.runPatrolWithTrigger(ctx, TriggerReasonStartup, nil)
@ -206,6 +195,27 @@ func (p *PatrolService) patrolLoop(ctx context.Context) {
}
}
// shouldSkipInitialFullPatrol reports whether the startup full patrol may be
// skipped because run history already holds a successful full Patrol run that
// completed less than an hour before now. Scoped or failed runs never count,
// so incomplete coverage always triggers the startup patrol.
func shouldSkipInitialFullPatrol(runHistory []PatrolRunRecord, now time.Time) bool {
	const recentWindow = time.Hour
	for _, record := range runHistory {
		if record.CompletedAt.IsZero() {
			continue
		}
		age := now.Sub(record.CompletedAt)
		if age >= recentWindow {
			continue
		}
		if !isSuccessfulFullPatrolRun(record) {
			continue
		}
		log.Info().
			Dur("time_since_last", age).
			Str("run_type", record.Type).
			Str("trigger_reason", record.TriggerReason).
			Msg("AI Patrol: Skipping initial patrol - recent successful full run exists")
		return true
	}
	return false
}
// runPatrol executes a scheduled patrol run
func (p *PatrolService) runPatrol(ctx context.Context) {
p.runPatrolWithTrigger(ctx, TriggerReasonScheduled, nil)

View file

@ -1514,6 +1514,40 @@ func TestGetStatus_ClearsStaleQuickstartBlockedStateWhenCreditsReturn(t *testing
}
}
func TestShouldSkipInitialFullPatrol_IgnoresRecentScopedErrorRun(t *testing.T) {
now := time.Now()
skip := shouldSkipInitialFullPatrol([]PatrolRunRecord{
{
ID: "scoped-error",
Type: "scoped",
TriggerReason: "alert_fired",
CompletedAt: now.Add(-5 * time.Minute),
ErrorCount: 1,
Status: "error",
},
}, now)
if skip {
t.Fatal("expected recent scoped error run not to suppress initial full patrol")
}
}
func TestShouldSkipInitialFullPatrol_SkipsAfterRecentSuccessfulFullRun(t *testing.T) {
now := time.Now()
skip := shouldSkipInitialFullPatrol([]PatrolRunRecord{
{
ID: "full-success",
Type: "patrol",
TriggerReason: "scheduled",
CompletedAt: now.Add(-10 * time.Minute),
ErrorCount: 0,
Status: "healthy",
},
}, now)
if !skip {
t.Fatal("expected recent successful full patrol to suppress initial full patrol")
}
}
// --- appendStreamContent ---
func TestAppendStreamContent(t *testing.T) {