mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-05-10 03:51:54 +00:00
Degrade Patrol health on incomplete coverage
This commit is contained in:
parent
2afd685877
commit
2ec625091c
6 changed files with 229 additions and 17 deletions
|
|
@ -463,6 +463,16 @@ That runtime-state contract must be derived from live Patrol runtime inputs,
|
|||
not only from the last failed run attempt: exhausted quickstart credits are a
|
||||
blocked Patrol runtime immediately, and the backend must also clear any stale
|
||||
quickstart block once credits or BYOK configuration return.
|
||||
The same runtime contract now also governs when the system-wide Patrol health
|
||||
summary is allowed to read as healthy. `internal/ai/intelligence.go` must not
|
||||
derive `Health A` or `100/100` from "no active findings" alone when recent
|
||||
Patrol evidence is limited to alert-scoped runs or includes recent Patrol run
|
||||
errors; the summary must degrade and explain that overall infrastructure health
|
||||
is not fully verified until a recent successful full Patrol run exists.
|
||||
The Patrol startup scheduler must preserve that coverage guarantee as well:
|
||||
`internal/ai/patrol_run.go` may skip the startup full patrol only when recent
|
||||
run history already includes a successful full Patrol run, not merely because
|
||||
some recent scoped alert-triggered run exists.
|
||||
AI chat tool-name labels, pending-tool headers, and assistant status copy now
|
||||
also route through the shared frontend identifier-label helper, so the chat
|
||||
surfaces do not keep their own underscore-stripping behavior separate from
|
||||
|
|
|
|||
|
|
@ -153,6 +153,7 @@ type Intelligence struct {
|
|||
knowledge *knowledge.Store
|
||||
changes *memory.ChangeDetector
|
||||
remediations *memory.RemediationLog
|
||||
runHistoryStore *PatrolRunHistoryStore
|
||||
resourceTimelineStore unifiedresources.ResourceStore
|
||||
resourceTimelineStoreOrgID string
|
||||
unifiedResourceProvider UnifiedResourceProvider
|
||||
|
|
@ -167,6 +168,11 @@ type Intelligence struct {
|
|||
dataDir string
|
||||
}
|
||||
|
||||
// Limits applied when recent Patrol run history is folded into the overall
// health score.
const (
	// intelligenceRecentRunLimit is the number of runs requested from the run
	// history store when assessing Patrol coverage.
	intelligenceRecentRunLimit = 10

	// intelligencePatrolCoverageWindow is the look-back window: runs that
	// completed before now minus this duration are ignored.
	intelligencePatrolCoverageWindow = 24 * time.Hour
)
|
||||
|
||||
// IntelligenceConfig configures the unified intelligence layer
|
||||
type IntelligenceConfig struct {
|
||||
DataDir string
|
||||
|
|
@ -212,6 +218,14 @@ func (i *Intelligence) SetResourceTimelineStore(store unifiedresources.ResourceS
|
|||
i.resourceTimelineStoreOrgID = strings.TrimSpace(orgID)
|
||||
}
|
||||
|
||||
// SetRunHistoryStore wires Patrol run history so intelligence summaries can
|
||||
// distinguish broad successful coverage from recent scoped or incomplete runs.
|
||||
func (i *Intelligence) SetRunHistoryStore(store *PatrolRunHistoryStore) {
|
||||
i.mu.Lock()
|
||||
defer i.mu.Unlock()
|
||||
i.runHistoryStore = store
|
||||
}
|
||||
|
||||
// SetUnifiedResourceProvider wires the canonical unified resource provider used
|
||||
// for infrastructure-wide posture summaries.
|
||||
func (i *Intelligence) SetUnifiedResourceProvider(urp UnifiedResourceProvider) {
|
||||
|
|
@ -237,6 +251,7 @@ func (i *Intelligence) GetSummary() *IntelligenceSummary {
|
|||
findings := i.findings
|
||||
patternsDetector := i.patterns
|
||||
remediations := i.remediations
|
||||
runHistoryStore := i.runHistoryStore
|
||||
unifiedResourceProvider := i.unifiedResourceProvider
|
||||
i.mu.RUnlock()
|
||||
|
||||
|
|
@ -275,7 +290,7 @@ func (i *Intelligence) GetSummary() *IntelligenceSummary {
|
|||
summary.Learning = i.getLearningStats()
|
||||
|
||||
// Calculate overall health
|
||||
summary.OverallHealth = i.calculateOverallHealth(summary)
|
||||
summary.OverallHealth = i.calculateOverallHealth(summary, runHistoryStore)
|
||||
|
||||
// Resources at risk
|
||||
summary.ResourcesAtRisk = i.getResourcesAtRisk(5)
|
||||
|
|
@ -798,7 +813,7 @@ func (i *Intelligence) FormatCorrelationsContext(resourceID string) string {
|
|||
return detector.FormatForContext(resourceID)
|
||||
}
|
||||
|
||||
func (i *Intelligence) calculateOverallHealth(summary *IntelligenceSummary) HealthScore {
|
||||
func (i *Intelligence) calculateOverallHealth(summary *IntelligenceSummary, runHistoryStore *PatrolRunHistoryStore) HealthScore {
|
||||
health := HealthScore{
|
||||
Score: 100,
|
||||
Grade: HealthGradeA,
|
||||
|
|
@ -849,6 +864,16 @@ func (i *Intelligence) calculateOverallHealth(summary *IntelligenceSummary) Heal
|
|||
}
|
||||
}
|
||||
|
||||
if factor, ok := summarizeRecentPatrolCoverage(runHistoryStore, time.Now()); ok {
|
||||
health.Score -= factor.impact
|
||||
health.Factors = append(health.Factors, HealthFactor{
|
||||
Name: factor.name,
|
||||
Impact: -factor.impact / 100,
|
||||
Description: factor.description,
|
||||
Category: "coverage",
|
||||
})
|
||||
}
|
||||
|
||||
// Bonus for learning progress
|
||||
if summary.Learning.ResourcesWithKnowledge > 5 {
|
||||
bonus := 5.0
|
||||
|
|
@ -988,6 +1013,12 @@ func scoreToGrade(score float64) HealthGrade {
|
|||
}
|
||||
|
||||
func (i *Intelligence) generateHealthPrediction(health HealthScore, summary *IntelligenceSummary) string {
|
||||
for _, factor := range health.Factors {
|
||||
if factor.Category == "coverage" {
|
||||
return factor.Description
|
||||
}
|
||||
}
|
||||
|
||||
if health.Grade == HealthGradeA {
|
||||
return "Infrastructure is healthy with no significant issues detected."
|
||||
}
|
||||
|
|
@ -1009,6 +1040,86 @@ func (i *Intelligence) generateHealthPrediction(health HealthScore, summary *Int
|
|||
return "Infrastructure is stable with minor issues to monitor."
|
||||
}
|
||||
|
||||
// patrolCoverageFactor is a health penalty derived from recent Patrol run
// history.
type patrolCoverageFactor struct {
	// name is the short label and description the explanatory sentence; both
	// are copied into the HealthFactor built from this value.
	name, description string
	// impact is the number of points subtracted from the health score.
	impact float64
}
|
||||
|
||||
func summarizeRecentPatrolCoverage(
|
||||
runHistoryStore *PatrolRunHistoryStore,
|
||||
now time.Time,
|
||||
) (patrolCoverageFactor, bool) {
|
||||
if runHistoryStore == nil {
|
||||
return patrolCoverageFactor{}, false
|
||||
}
|
||||
|
||||
recentRuns := runHistoryStore.GetRecent(intelligenceRecentRunLimit)
|
||||
if len(recentRuns) == 0 {
|
||||
return patrolCoverageFactor{}, false
|
||||
}
|
||||
|
||||
cutoff := now.Add(-intelligencePatrolCoverageWindow)
|
||||
relevant := make([]PatrolRunRecord, 0, len(recentRuns))
|
||||
for _, run := range recentRuns {
|
||||
if run.CompletedAt.IsZero() || run.CompletedAt.Before(cutoff) {
|
||||
continue
|
||||
}
|
||||
relevant = append(relevant, run)
|
||||
}
|
||||
if len(relevant) == 0 {
|
||||
return patrolCoverageFactor{}, false
|
||||
}
|
||||
|
||||
var recentErrors int
|
||||
var hasSuccessfulFullRun bool
|
||||
var scopedRuns int
|
||||
for _, run := range relevant {
|
||||
if run.ErrorCount > 0 || strings.EqualFold(strings.TrimSpace(run.Status), "error") {
|
||||
recentErrors++
|
||||
}
|
||||
if isSuccessfulFullPatrolRun(run) {
|
||||
hasSuccessfulFullRun = true
|
||||
}
|
||||
if isScopedPatrolRun(run) {
|
||||
scopedRuns++
|
||||
}
|
||||
}
|
||||
|
||||
switch {
|
||||
case !hasSuccessfulFullRun && recentErrors > 0:
|
||||
return patrolCoverageFactor{
|
||||
name: "Patrol coverage incomplete",
|
||||
description: "Patrol coverage is incomplete: recent activity was limited to scoped runs and ended with errors, so overall health is not fully verified.",
|
||||
impact: 35,
|
||||
}, true
|
||||
case !hasSuccessfulFullRun && scopedRuns == len(relevant):
|
||||
return patrolCoverageFactor{
|
||||
name: "Patrol coverage incomplete",
|
||||
description: "Patrol coverage is incomplete: recent activity was limited to scoped runs, so overall infrastructure health is not fully verified.",
|
||||
impact: 20,
|
||||
}, true
|
||||
case recentErrors > 0:
|
||||
return patrolCoverageFactor{
|
||||
name: "Recent Patrol errors",
|
||||
description: "Recent Patrol runs encountered errors, so the current health summary may be incomplete.",
|
||||
impact: 10,
|
||||
}, true
|
||||
default:
|
||||
return patrolCoverageFactor{}, false
|
||||
}
|
||||
}
|
||||
|
||||
func isScopedPatrolRun(run PatrolRunRecord) bool {
|
||||
return strings.EqualFold(strings.TrimSpace(run.Type), "scoped")
|
||||
}
|
||||
|
||||
func isSuccessfulFullPatrolRun(run PatrolRunRecord) bool {
|
||||
return !isScopedPatrolRun(run) &&
|
||||
run.ErrorCount == 0 &&
|
||||
!strings.EqualFold(strings.TrimSpace(run.Status), "error")
|
||||
}
|
||||
|
||||
func (i *Intelligence) getResourcesAtRisk(limit int) []ResourceRiskSummary {
|
||||
if i.findings == nil {
|
||||
return nil
|
||||
|
|
|
|||
|
|
@ -153,6 +153,52 @@ func TestIntelligence_getTopFindings_Empty(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestIntelligence_GetSummary_DegradesWhenRecentPatrolCoverageIsScopedAndErroring(t *testing.T) {
|
||||
intel := NewIntelligence(IntelligenceConfig{})
|
||||
runHistory := NewPatrolRunHistoryStore(10)
|
||||
now := time.Now()
|
||||
runHistory.Add(PatrolRunRecord{
|
||||
ID: "scoped-error-1",
|
||||
Type: "scoped",
|
||||
TriggerReason: "alert_fired",
|
||||
CompletedAt: now.Add(-5 * time.Minute),
|
||||
ErrorCount: 1,
|
||||
Status: "error",
|
||||
ResourcesChecked: 1,
|
||||
})
|
||||
runHistory.Add(PatrolRunRecord{
|
||||
ID: "scoped-error-2",
|
||||
Type: "scoped",
|
||||
TriggerReason: "alert_fired",
|
||||
CompletedAt: now.Add(-15 * time.Minute),
|
||||
ErrorCount: 1,
|
||||
Status: "error",
|
||||
ResourcesChecked: 1,
|
||||
})
|
||||
intel.SetRunHistoryStore(runHistory)
|
||||
|
||||
summary := intel.GetSummary()
|
||||
if summary.OverallHealth.Score >= 100 {
|
||||
t.Fatalf("expected reduced health score, got %f", summary.OverallHealth.Score)
|
||||
}
|
||||
if summary.OverallHealth.Grade == HealthGradeA {
|
||||
t.Fatalf("expected non-A grade, got %s", summary.OverallHealth.Grade)
|
||||
}
|
||||
if !strings.Contains(summary.OverallHealth.Prediction, "not fully verified") {
|
||||
t.Fatalf("expected coverage warning prediction, got %q", summary.OverallHealth.Prediction)
|
||||
}
|
||||
foundCoverageFactor := false
|
||||
for _, factor := range summary.OverallHealth.Factors {
|
||||
if factor.Category == "coverage" {
|
||||
foundCoverageFactor = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !foundCoverageFactor {
|
||||
t.Fatal("expected coverage factor in overall health")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIntelligence_getLearningStats(t *testing.T) {
|
||||
intel := NewIntelligence(IntelligenceConfig{})
|
||||
knowledgeStore, err := knowledge.NewStore(t.TempDir())
|
||||
|
|
@ -388,14 +434,14 @@ func TestIntelligence_calculateOverallHealth_Clamps(t *testing.T) {
|
|||
negative := intel.calculateOverallHealth(&IntelligenceSummary{
|
||||
FindingsCount: FindingsCounts{Critical: 10, Warning: 10},
|
||||
UpcomingRisks: predictions,
|
||||
})
|
||||
}, nil)
|
||||
if negative.Score != 0 {
|
||||
t.Errorf("expected score clamped to 0, got %f", negative.Score)
|
||||
}
|
||||
|
||||
positive := intel.calculateOverallHealth(&IntelligenceSummary{
|
||||
Learning: LearningStats{ResourcesWithKnowledge: 10},
|
||||
})
|
||||
}, nil)
|
||||
if positive.Score != 100 {
|
||||
t.Errorf("expected score clamped to 100, got %f", positive.Score)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -782,6 +782,7 @@ func (p *PatrolService) GetIntelligence() *Intelligence {
|
|||
p.changeDetector,
|
||||
p.remediationLog,
|
||||
)
|
||||
p.intelligence.SetRunHistoryStore(p.runHistoryStore)
|
||||
if p.aiService != nil {
|
||||
p.aiService.mu.RLock()
|
||||
store := p.aiService.resourceExportStore
|
||||
|
|
|
|||
|
|
@ -117,19 +117,8 @@ func (p *PatrolService) patrolLoop(ctx context.Context) {
|
|||
select {
|
||||
case <-initialTimer.C:
|
||||
// Skip the startup patrol only when a successful full patrol completed within the last hour, to avoid wasting tokens on restarts
|
||||
runHistory := p.GetRunHistory(1)
|
||||
|
||||
skipInitial := false
|
||||
if len(runHistory) > 0 {
|
||||
lastRun := runHistory[0]
|
||||
timeSinceLastRun := time.Since(lastRun.CompletedAt)
|
||||
if timeSinceLastRun < 1*time.Hour {
|
||||
log.Info().
|
||||
Dur("time_since_last", timeSinceLastRun).
|
||||
Msg("AI Patrol: Skipping initial patrol - recent run exists")
|
||||
skipInitial = true
|
||||
}
|
||||
}
|
||||
runHistory := p.GetRunHistory(10)
|
||||
skipInitial := shouldSkipInitialFullPatrol(runHistory, time.Now())
|
||||
|
||||
if !skipInitial {
|
||||
p.runPatrolWithTrigger(ctx, TriggerReasonStartup, nil)
|
||||
|
|
@ -206,6 +195,27 @@ func (p *PatrolService) patrolLoop(ctx context.Context) {
|
|||
}
|
||||
}
|
||||
|
||||
func shouldSkipInitialFullPatrol(runHistory []PatrolRunRecord, now time.Time) bool {
|
||||
for _, run := range runHistory {
|
||||
if run.CompletedAt.IsZero() {
|
||||
continue
|
||||
}
|
||||
timeSinceLastRun := now.Sub(run.CompletedAt)
|
||||
if timeSinceLastRun >= 1*time.Hour {
|
||||
continue
|
||||
}
|
||||
if isSuccessfulFullPatrolRun(run) {
|
||||
log.Info().
|
||||
Dur("time_since_last", timeSinceLastRun).
|
||||
Str("run_type", run.Type).
|
||||
Str("trigger_reason", run.TriggerReason).
|
||||
Msg("AI Patrol: Skipping initial patrol - recent successful full run exists")
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// runPatrol executes a scheduled patrol run
|
||||
func (p *PatrolService) runPatrol(ctx context.Context) {
|
||||
p.runPatrolWithTrigger(ctx, TriggerReasonScheduled, nil)
|
||||
|
|
|
|||
|
|
@ -1514,6 +1514,40 @@ func TestGetStatus_ClearsStaleQuickstartBlockedStateWhenCreditsReturn(t *testing
|
|||
}
|
||||
}
|
||||
|
||||
func TestShouldSkipInitialFullPatrol_IgnoresRecentScopedErrorRun(t *testing.T) {
|
||||
now := time.Now()
|
||||
skip := shouldSkipInitialFullPatrol([]PatrolRunRecord{
|
||||
{
|
||||
ID: "scoped-error",
|
||||
Type: "scoped",
|
||||
TriggerReason: "alert_fired",
|
||||
CompletedAt: now.Add(-5 * time.Minute),
|
||||
ErrorCount: 1,
|
||||
Status: "error",
|
||||
},
|
||||
}, now)
|
||||
if skip {
|
||||
t.Fatal("expected recent scoped error run not to suppress initial full patrol")
|
||||
}
|
||||
}
|
||||
|
||||
func TestShouldSkipInitialFullPatrol_SkipsAfterRecentSuccessfulFullRun(t *testing.T) {
|
||||
now := time.Now()
|
||||
skip := shouldSkipInitialFullPatrol([]PatrolRunRecord{
|
||||
{
|
||||
ID: "full-success",
|
||||
Type: "patrol",
|
||||
TriggerReason: "scheduled",
|
||||
CompletedAt: now.Add(-10 * time.Minute),
|
||||
ErrorCount: 0,
|
||||
Status: "healthy",
|
||||
},
|
||||
}, now)
|
||||
if !skip {
|
||||
t.Fatal("expected recent successful full patrol to suppress initial full patrol")
|
||||
}
|
||||
}
|
||||
|
||||
// --- appendStreamContent ---
|
||||
|
||||
func TestAppendStreamContent(t *testing.T) {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue