Degrade Patrol health on incomplete coverage

This commit is contained in:
rcourtman 2026-03-25 10:13:08 +00:00
parent 2afd685877
commit 2ec625091c
6 changed files with 229 additions and 17 deletions

View file

@ -463,6 +463,16 @@ That runtime-state contract must be derived from live Patrol runtime inputs,
not only from the last failed run attempt: exhausted quickstart credits are a
blocked Patrol runtime immediately, and the backend must also clear any stale
quickstart block once credits or BYOK configuration return.
The same runtime contract now also governs when the system-wide Patrol health
summary is allowed to read as healthy. `internal/ai/intelligence.go` must not
derive `Health A` or `100/100` from "no active findings" alone when recent
Patrol evidence is limited to alert-scoped runs or includes recent Patrol run
errors; the summary must degrade and explain that overall infrastructure health
is not fully verified until a recent successful full Patrol run exists.
The Patrol startup scheduler must preserve that coverage guarantee as well:
`internal/ai/patrol_run.go` may skip the startup full patrol only when recent
run history already includes a successful full Patrol run, not merely because
some recent scoped alert-triggered run exists.
AI chat tool-name labels, pending-tool headers, and assistant status copy now
also route through the shared frontend identifier-label helper, so the chat
surfaces do not keep their own underscore-stripping behavior separate from

View file

@ -153,6 +153,7 @@ type Intelligence struct {
knowledge *knowledge.Store
changes *memory.ChangeDetector
remediations *memory.RemediationLog
runHistoryStore *PatrolRunHistoryStore
resourceTimelineStore unifiedresources.ResourceStore
resourceTimelineStoreOrgID string
unifiedResourceProvider UnifiedResourceProvider
@ -167,6 +168,11 @@ type Intelligence struct {
dataDir string
}
const (
intelligencePatrolCoverageWindow = 24 * time.Hour
intelligenceRecentRunLimit = 10
)
// IntelligenceConfig configures the unified intelligence layer
type IntelligenceConfig struct {
DataDir string
@ -212,6 +218,14 @@ func (i *Intelligence) SetResourceTimelineStore(store unifiedresources.ResourceS
i.resourceTimelineStoreOrgID = strings.TrimSpace(orgID)
}
// SetRunHistoryStore wires Patrol run history into the intelligence layer so
// that summaries can tell broad, successful coverage apart from recent runs
// that were scoped or incomplete.
func (i *Intelligence) SetRunHistoryStore(store *PatrolRunHistoryStore) {
	i.mu.Lock()
	i.runHistoryStore = store
	i.mu.Unlock()
}
// SetUnifiedResourceProvider wires the canonical unified resource provider used
// for infrastructure-wide posture summaries.
func (i *Intelligence) SetUnifiedResourceProvider(urp UnifiedResourceProvider) {
@ -237,6 +251,7 @@ func (i *Intelligence) GetSummary() *IntelligenceSummary {
findings := i.findings
patternsDetector := i.patterns
remediations := i.remediations
runHistoryStore := i.runHistoryStore
unifiedResourceProvider := i.unifiedResourceProvider
i.mu.RUnlock()
@ -275,7 +290,7 @@ func (i *Intelligence) GetSummary() *IntelligenceSummary {
summary.Learning = i.getLearningStats()
// Calculate overall health
summary.OverallHealth = i.calculateOverallHealth(summary)
summary.OverallHealth = i.calculateOverallHealth(summary, runHistoryStore)
// Resources at risk
summary.ResourcesAtRisk = i.getResourcesAtRisk(5)
@ -798,7 +813,7 @@ func (i *Intelligence) FormatCorrelationsContext(resourceID string) string {
return detector.FormatForContext(resourceID)
}
func (i *Intelligence) calculateOverallHealth(summary *IntelligenceSummary) HealthScore {
func (i *Intelligence) calculateOverallHealth(summary *IntelligenceSummary, runHistoryStore *PatrolRunHistoryStore) HealthScore {
health := HealthScore{
Score: 100,
Grade: HealthGradeA,
@ -849,6 +864,16 @@ func (i *Intelligence) calculateOverallHealth(summary *IntelligenceSummary) Heal
}
}
if factor, ok := summarizeRecentPatrolCoverage(runHistoryStore, time.Now()); ok {
health.Score -= factor.impact
health.Factors = append(health.Factors, HealthFactor{
Name: factor.name,
Impact: -factor.impact / 100,
Description: factor.description,
Category: "coverage",
})
}
// Bonus for learning progress
if summary.Learning.ResourcesWithKnowledge > 5 {
bonus := 5.0
@ -988,6 +1013,12 @@ func scoreToGrade(score float64) HealthGrade {
}
func (i *Intelligence) generateHealthPrediction(health HealthScore, summary *IntelligenceSummary) string {
for _, factor := range health.Factors {
if factor.Category == "coverage" {
return factor.Description
}
}
if health.Grade == HealthGradeA {
return "Infrastructure is healthy with no significant issues detected."
}
@ -1009,6 +1040,86 @@ func (i *Intelligence) generateHealthPrediction(health HealthScore, summary *Int
return "Infrastructure is stable with minor issues to monitor."
}
// patrolCoverageFactor describes a health penalty derived from recent Patrol
// run coverage; it is converted into a HealthFactor on the overall health
// summary by calculateOverallHealth.
type patrolCoverageFactor struct {
	name        string  // short label used as the HealthFactor name
	description string  // human-readable explanation of the coverage gap
	impact      float64 // points subtracted from the overall health score
}
// summarizeRecentPatrolCoverage inspects recent Patrol run history inside the
// coverage window and reports a health penalty when that evidence is limited
// to scoped runs, tainted by run errors, or both. The boolean result is false
// when no run history is wired, when no completed runs fall inside the
// window, or when a successful full run exists with no recent errors.
func summarizeRecentPatrolCoverage(
	runHistoryStore *PatrolRunHistoryStore,
	now time.Time,
) (patrolCoverageFactor, bool) {
	// No wired history means no coverage evidence to judge either way.
	if runHistoryStore == nil {
		return patrolCoverageFactor{}, false
	}

	history := runHistoryStore.GetRecent(intelligenceRecentRunLimit)
	if len(history) == 0 {
		return patrolCoverageFactor{}, false
	}

	// Only completed runs inside the coverage window count as evidence.
	windowStart := now.Add(-intelligencePatrolCoverageWindow)
	var windowed []PatrolRunRecord
	for _, record := range history {
		if record.CompletedAt.IsZero() {
			continue
		}
		if record.CompletedAt.Before(windowStart) {
			continue
		}
		windowed = append(windowed, record)
	}
	if len(windowed) == 0 {
		return patrolCoverageFactor{}, false
	}

	erroredRuns := 0
	scopedCount := 0
	fullRunSucceeded := false
	for _, record := range windowed {
		if record.ErrorCount > 0 || strings.EqualFold(strings.TrimSpace(record.Status), "error") {
			erroredRuns++
		}
		if isSuccessfulFullPatrolRun(record) {
			fullRunSucceeded = true
		}
		if isScopedPatrolRun(record) {
			scopedCount++
		}
	}

	// Worst case first: no verified full run and recent errors on top of it.
	if !fullRunSucceeded && erroredRuns > 0 {
		return patrolCoverageFactor{
			name:        "Patrol coverage incomplete",
			description: "Patrol coverage is incomplete: recent activity was limited to scoped runs and ended with errors, so overall health is not fully verified.",
			impact:      35,
		}, true
	}
	// No full run succeeded, but every windowed run was scoped (and clean).
	if !fullRunSucceeded && scopedCount == len(windowed) {
		return patrolCoverageFactor{
			name:        "Patrol coverage incomplete",
			description: "Patrol coverage is incomplete: recent activity was limited to scoped runs, so overall infrastructure health is not fully verified.",
			impact:      20,
		}, true
	}
	// A full run succeeded, but errors still cloud the picture slightly.
	if erroredRuns > 0 {
		return patrolCoverageFactor{
			name:        "Recent Patrol errors",
			description: "Recent Patrol runs encountered errors, so the current health summary may be incomplete.",
			impact:      10,
		}, true
	}
	return patrolCoverageFactor{}, false
}
// isScopedPatrolRun reports whether the record describes a scoped run (for
// example an alert-triggered check) rather than a full patrol. The run type
// is matched case-insensitively after trimming surrounding whitespace.
func isScopedPatrolRun(run PatrolRunRecord) bool {
	runType := strings.TrimSpace(run.Type)
	return strings.EqualFold(runType, "scoped")
}
// isSuccessfulFullPatrolRun reports whether the record describes a full
// (non-scoped) patrol that completed with zero errors and a non-error status.
func isSuccessfulFullPatrolRun(run PatrolRunRecord) bool {
	if isScopedPatrolRun(run) {
		return false
	}
	if run.ErrorCount != 0 {
		return false
	}
	return !strings.EqualFold(strings.TrimSpace(run.Status), "error")
}
func (i *Intelligence) getResourcesAtRisk(limit int) []ResourceRiskSummary {
if i.findings == nil {
return nil

View file

@ -153,6 +153,52 @@ func TestIntelligence_getTopFindings_Empty(t *testing.T) {
}
}
// TestIntelligence_GetSummary_DegradesWhenRecentPatrolCoverageIsScopedAndErroring
// verifies that when recent Patrol evidence is only scoped, erroring runs, the
// overall health score drops below perfect, the grade is no longer A, the
// prediction carries the coverage warning, and a coverage factor is attached.
func TestIntelligence_GetSummary_DegradesWhenRecentPatrolCoverageIsScopedAndErroring(t *testing.T) {
	svc := NewIntelligence(IntelligenceConfig{})
	history := NewPatrolRunHistoryStore(10)
	base := time.Now()

	scopedErrorRuns := []PatrolRunRecord{
		{
			ID:               "scoped-error-1",
			Type:             "scoped",
			TriggerReason:    "alert_fired",
			CompletedAt:      base.Add(-5 * time.Minute),
			ErrorCount:       1,
			Status:           "error",
			ResourcesChecked: 1,
		},
		{
			ID:               "scoped-error-2",
			Type:             "scoped",
			TriggerReason:    "alert_fired",
			CompletedAt:      base.Add(-15 * time.Minute),
			ErrorCount:       1,
			Status:           "error",
			ResourcesChecked: 1,
		},
	}
	for _, record := range scopedErrorRuns {
		history.Add(record)
	}
	svc.SetRunHistoryStore(history)

	health := svc.GetSummary().OverallHealth
	if health.Score >= 100 {
		t.Fatalf("expected reduced health score, got %f", health.Score)
	}
	if health.Grade == HealthGradeA {
		t.Fatalf("expected non-A grade, got %s", health.Grade)
	}
	if !strings.Contains(health.Prediction, "not fully verified") {
		t.Fatalf("expected coverage warning prediction, got %q", health.Prediction)
	}

	hasCoverageFactor := false
	for _, factor := range health.Factors {
		if factor.Category == "coverage" {
			hasCoverageFactor = true
			break
		}
	}
	if !hasCoverageFactor {
		t.Fatal("expected coverage factor in overall health")
	}
}
func TestIntelligence_getLearningStats(t *testing.T) {
intel := NewIntelligence(IntelligenceConfig{})
knowledgeStore, err := knowledge.NewStore(t.TempDir())
@ -388,14 +434,14 @@ func TestIntelligence_calculateOverallHealth_Clamps(t *testing.T) {
negative := intel.calculateOverallHealth(&IntelligenceSummary{
FindingsCount: FindingsCounts{Critical: 10, Warning: 10},
UpcomingRisks: predictions,
})
}, nil)
if negative.Score != 0 {
t.Errorf("expected score clamped to 0, got %f", negative.Score)
}
positive := intel.calculateOverallHealth(&IntelligenceSummary{
Learning: LearningStats{ResourcesWithKnowledge: 10},
})
}, nil)
if positive.Score != 100 {
t.Errorf("expected score clamped to 100, got %f", positive.Score)
}

View file

@ -782,6 +782,7 @@ func (p *PatrolService) GetIntelligence() *Intelligence {
p.changeDetector,
p.remediationLog,
)
p.intelligence.SetRunHistoryStore(p.runHistoryStore)
if p.aiService != nil {
p.aiService.mu.RLock()
store := p.aiService.resourceExportStore

View file

@ -117,19 +117,8 @@ func (p *PatrolService) patrolLoop(ctx context.Context) {
select {
case <-initialTimer.C:
// Check if a patrol ran recently (within last hour) to avoid wasting tokens on restarts
runHistory := p.GetRunHistory(1)
skipInitial := false
if len(runHistory) > 0 {
lastRun := runHistory[0]
timeSinceLastRun := time.Since(lastRun.CompletedAt)
if timeSinceLastRun < 1*time.Hour {
log.Info().
Dur("time_since_last", timeSinceLastRun).
Msg("AI Patrol: Skipping initial patrol - recent run exists")
skipInitial = true
}
}
runHistory := p.GetRunHistory(10)
skipInitial := shouldSkipInitialFullPatrol(runHistory, time.Now())
if !skipInitial {
p.runPatrolWithTrigger(ctx, TriggerReasonStartup, nil)
@ -206,6 +195,27 @@ func (p *PatrolService) patrolLoop(ctx context.Context) {
}
}
// shouldSkipInitialFullPatrol reports whether the startup full patrol may be
// skipped because run history already holds a successful full Patrol run that
// completed less than an hour before now. Scoped or failed runs never count,
// so incomplete coverage always triggers the startup patrol.
func shouldSkipInitialFullPatrol(runHistory []PatrolRunRecord, now time.Time) bool {
	const recentWindow = time.Hour
	for _, record := range runHistory {
		if record.CompletedAt.IsZero() {
			continue
		}
		age := now.Sub(record.CompletedAt)
		if age >= recentWindow {
			continue
		}
		if !isSuccessfulFullPatrolRun(record) {
			continue
		}
		log.Info().
			Dur("time_since_last", age).
			Str("run_type", record.Type).
			Str("trigger_reason", record.TriggerReason).
			Msg("AI Patrol: Skipping initial patrol - recent successful full run exists")
		return true
	}
	return false
}
// runPatrol executes a scheduled patrol run
func (p *PatrolService) runPatrol(ctx context.Context) {
p.runPatrolWithTrigger(ctx, TriggerReasonScheduled, nil)

View file

@ -1514,6 +1514,40 @@ func TestGetStatus_ClearsStaleQuickstartBlockedStateWhenCreditsReturn(t *testing
}
}
func TestShouldSkipInitialFullPatrol_IgnoresRecentScopedErrorRun(t *testing.T) {
now := time.Now()
skip := shouldSkipInitialFullPatrol([]PatrolRunRecord{
{
ID: "scoped-error",
Type: "scoped",
TriggerReason: "alert_fired",
CompletedAt: now.Add(-5 * time.Minute),
ErrorCount: 1,
Status: "error",
},
}, now)
if skip {
t.Fatal("expected recent scoped error run not to suppress initial full patrol")
}
}
func TestShouldSkipInitialFullPatrol_SkipsAfterRecentSuccessfulFullRun(t *testing.T) {
now := time.Now()
skip := shouldSkipInitialFullPatrol([]PatrolRunRecord{
{
ID: "full-success",
Type: "patrol",
TriggerReason: "scheduled",
CompletedAt: now.Add(-10 * time.Minute),
ErrorCount: 0,
Status: "healthy",
},
}, now)
if !skip {
t.Fatal("expected recent successful full patrol to suppress initial full patrol")
}
}
// --- appendStreamContent ---
func TestAppendStreamContent(t *testing.T) {