// patrol_run.go implements the PatrolService runtime: Start/Stop lifecycle,
// the main patrol loop, scoped patrol execution, alert auto-resolution,
// live streaming to UI subscribers, and run history tracking.
package ai

import (
	"context"
	"crypto/sha256"
	"fmt"
	"strconv"
	"strings"
	"time"

	"github.com/google/uuid"
	"github.com/rcourtman/pulse-go-rewrite/internal/ai/circuit"
	"github.com/rcourtman/pulse-go-rewrite/internal/ai/providers"
	"github.com/rcourtman/pulse-go-rewrite/internal/alerts"
	"github.com/rcourtman/pulse-go-rewrite/internal/models"
	"github.com/rcourtman/pulse-go-rewrite/internal/unifiedresources"
	"github.com/rs/zerolog/log"
)

// Patrol run lifecycle constants.
const (
	initialPatrolStartDelay   = 30 * time.Second // Delay before first patrol after startup
	findingCleanupAge         = 24 * time.Hour   // Resolved findings older than this are purged
	scopedPatrolRetryBackoff1 = 5 * time.Second  // First retry backoff for dropped scoped patrols
	scopedPatrolRetryBackoff2 = 15 * time.Second // Second retry backoff for dropped scoped patrols
	scopedPatrolMaxRetries    = 2                // Maximum re-queue attempts for dropped scoped patrols
	scopedPatrolLogIDLimit    = 10               // Maximum number of effective scope IDs to log inline
)

// Start begins the background patrol loop
func (p *PatrolService) Start(ctx context.Context) {
	p.mu.Lock()
	if p.running {
		p.mu.Unlock()
		return
	}
	p.running = true
	p.stopCh = make(chan struct{})
	p.configChanged = make(chan struct{}, 1) // Buffered to allow non-blocking send
	p.mu.Unlock()

	log.Info().
		Dur("interval", p.config.GetInterval()).
		Msg("Starting AI Patrol Service")

	go p.patrolLoop(ctx)
}

// Stop stops the patrol service. It signals the patrol loop to exit, then
// waits up to 15 seconds for in-flight investigations to finish and
// force-saves findings/investigation state to disk.
func (p *PatrolService) Stop() {
	p.mu.Lock()
	if !p.running {
		p.mu.Unlock()
		return
	}
	p.running = false
	close(p.stopCh)
	orchestrator := p.investigationOrchestrator
	findings := p.findings
	p.mu.Unlock()

	log.Info().Msg("Stopping AI Patrol Service")

	// Give investigations 15 seconds to finish (leaves headroom within server's 30s budget)
	shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer shutdownCancel()

	// Signal orchestrator to cancel running investigations and persist state
	if orchestrator != nil {
		if err := orchestrator.Shutdown(shutdownCtx); err != nil {
			log.Warn().Err(err).Msg("AI Patrol: Investigation orchestrator shutdown returned error")
		}
	}

	// Wait for investigation goroutines tracked by PatrolService
	done := make(chan struct{})
	go func() {
		p.investigationWg.Wait()
		close(done)
	}()
	select {
	case <-done:
		// All investigation goroutines finished
	case <-shutdownCtx.Done():
		log.Warn().Msg("AI Patrol: Timed out waiting for investigation goroutines to finish")
	}

	// Force-save findings store
	if findings != nil {
		if err := findings.ForceSave(); err != nil {
			log.Error().Err(err).Msg("AI Patrol: Failed to force-save findings during shutdown")
		}
	}

	log.Info().Msg("AI Patrol Service stopped")
}

// patrolLoop is the main background loop
func (p *PatrolService) patrolLoop(ctx context.Context) {
	// Seed recency from persisted run history so the API can return Patrol timing
	// metadata immediately (before the first in-process patrol completes).
	if history := p.GetRunHistory(10); len(history) > 0 {
		lastActivity, lastFullPatrol := patrolRecencyFromHistory(history)
		p.mu.Lock()
		p.lastActivity = lastActivity
		p.lastFullPatrol = lastFullPatrol
		p.mu.Unlock()
	}

	// Run initial patrol shortly after startup, but only if one hasn't run recently
	initialDelay := initialPatrolStartDelay
	initialTimer := time.NewTimer(initialDelay)
	defer initialTimer.Stop()

	select {
	case <-initialTimer.C:
		// Check if a patrol ran recently (within last hour) to avoid wasting tokens on restarts
		runHistory := p.GetRunHistory(10)
		skipInitial := shouldSkipInitialFullPatrol(runHistory, time.Now())

		if !skipInitial {
			p.runPatrolWithTrigger(ctx, TriggerReasonStartup, nil)
		}
	case <-p.stopCh:
		if !initialTimer.Stop() {
			select {
			case <-initialTimer.C:
			default:
			}
		}
		return
	case <-ctx.Done():
		if !initialTimer.Stop() {
			select {
			case <-initialTimer.C:
			default:
			}
		}
		return
	}

	p.mu.RLock()
	interval := p.config.GetInterval()
	configCh := p.configChanged
	p.mu.RUnlock()

	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	p.mu.Lock()
	p.nextScheduledAt = time.Now().Add(interval)
	p.mu.Unlock()

	for {
		select {
		case <-ticker.C:
			// Update next scheduled time before the run starts — time.Now() closely
			// matches the tick time here, and the ticker will fire again at roughly
			// this moment + interval regardless of how long the run takes.
			p.mu.Lock()
			p.nextScheduledAt = time.Now().Add(interval)
			p.mu.Unlock()
			p.runPatrolWithTrigger(ctx, TriggerReasonScheduled, nil)

		case alert := <-p.adHocTrigger:
			// Run immediate targeted patrol for this alert
			log.Info().Str("alert_identifier", alert.ID).Msg("patrol triggered by alert")
			p.runTargetedPatrol(ctx, alert)

		case <-configCh:
			// Config changed - reset ticker with new interval
			p.mu.RLock()
			newInterval := p.config.GetInterval()
			p.mu.RUnlock()

			if newInterval != interval {
				interval = newInterval
				ticker.Reset(interval)
				p.mu.Lock()
				p.nextScheduledAt = time.Now().Add(interval)
				p.mu.Unlock()
				log.Info().
					Dur("interval", interval).
					Msg("Patrol ticker reset to new interval")
			}

		case <-p.stopCh:
			return

		case <-ctx.Done():
			return
		}
	}
}

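// shouldSkipInitialFullPatrol reports whether the startup patrol can be
// skipped because a successful full patrol completed within the last hour.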
func shouldSkipInitialFullPatrol(runHistory []PatrolRunRecord, now time.Time) bool {
	for _, run := range runHistory {
		if run.CompletedAt.IsZero() {
			continue
		}
		timeSinceLastRun := now.Sub(run.CompletedAt)
		if timeSinceLastRun >= 1*time.Hour {
			continue
		}
		if isSuccessfulFullPatrolRun(run) {
			log.Info().
				Dur("time_since_last", timeSinceLastRun).
				Str("run_type", run.Type).
				Str("trigger_reason", run.TriggerReason).
				Msg("AI Patrol: Skipping initial patrol - recent successful full run exists")
			return true
		}
	}
	return false
}

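// patrolRecencyFromHistory returns the completion time of the most recent run
// of any kind and of the most recent full patrol, derived from run history.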
func patrolRecencyFromHistory(runHistory []PatrolRunRecord) (time.Time, time.Time) {
	var lastActivity time.Time
	var lastFullPatrol time.Time
	for _, run := range runHistory {
		if run.CompletedAt.IsZero() {
			continue
		}
		if lastActivity.IsZero() || run.CompletedAt.After(lastActivity) {
			lastActivity = run.CompletedAt
		}
		if isFullPatrolRun(run) && (lastFullPatrol.IsZero() || run.CompletedAt.After(lastFullPatrol)) {
			lastFullPatrol = run.CompletedAt
		}
	}
	return lastActivity, lastFullPatrol
}

// runPatrol executes a scheduled patrol run
func (p *PatrolService) runPatrol(ctx context.Context) {
	p.runPatrolWithTrigger(ctx, TriggerReasonScheduled, nil)
}

// runPatrolWithTrigger executes a patrol run with trigger context
func (p *PatrolService) runPatrolWithTrigger(ctx context.Context, trigger TriggerReason, scope *PatrolScope) {
	p.mu.RLock()
	cfg := p.config
	breaker := p.circuitBreaker
	p.mu.RUnlock()

	if !cfg.Enabled {
		return
	}

	if !p.tryStartRun("full") {
		return
	}
	defer p.endRun()

	// Check if circuit breaker allows LLM calls.
	llmAllowed := breaker == nil || breaker.Allow()
	if !llmAllowed {
		log.Warn().Msg("AI Patrol: Circuit breaker is open (LLM calls blocked)")
	}

	start := time.Now()
	runID := fmt.Sprintf("%d", start.UnixNano())
	executionID := uuid.NewString()
	patrolType := "patrol"
	GetPatrolMetrics().RecordRun(string(trigger), "full")

	log.Debug().Msg("AI Patrol: Starting patrol run")

	// Track run statistics
	var runStats struct {
		resourceCount     int
		nodesChecked      int
		guestsChecked     int
		dockerChecked     int
		storageChecked    int
		hostsChecked      int
		trueNASChecked    int
		pbsChecked        int
		pmgChecked        int
		kubernetesChecked int
		newFindings       int
		existingFindings  int
		rejectedFindings  int
		triageFlags       int
		triageSkippedLLM  bool
		findingIDs        []string
		errors            int
		lastAIError       error             // Preserve original error for circuit breaker categorization
		aiAnalysis        *AIAnalysisResult // Stores the AI's analysis for the run record
	}
	var newFindings []*Finding

	// Get current state
	if !p.hasPatrolRuntimeInputs() {
		log.Warn().Msg("AI Patrol: No runtime state available")
		return
	}

	state := p.currentPatrolRuntimeState()

	// Helper to track findings
	// Note: Only warning+ severity findings count toward newFindings since watch/info are filtered from UI
	trackFinding := func(f *Finding) bool {
		isNew := p.recordFinding(f)
		if isNew {
			// Only count warning+ findings as "new" for user-facing stats
			if f.Severity == FindingSeverityWarning || f.Severity == FindingSeverityCritical {
				runStats.newFindings++
				newFindings = append(newFindings, f)
			}
		} else {
			runStats.existingFindings++
		}

		// Only track warning+ severity finding IDs in the run record
		if f.Severity == FindingSeverityWarning || f.Severity == FindingSeverityCritical {
			runStats.findingIDs = append(runStats.findingIDs, f.ID)
		}

		return isNew
	}

	// Count resources for statistics from the patrol runtime state so scoped and
	// unscoped runs share the same read-state-first semantics.
	resourceCounts := patrolRuntimeCountResources(state)
	if cfg.AnalyzeNodes {
		runStats.nodesChecked = resourceCounts.nodes
	}
	if cfg.AnalyzeGuests {
		runStats.guestsChecked = resourceCounts.guests
	}
	if cfg.AnalyzeDocker {
		runStats.dockerChecked = resourceCounts.docker
	}
	if cfg.AnalyzeStorage {
		runStats.storageChecked = resourceCounts.storage
	}
	if cfg.AnalyzePBS {
		runStats.pbsChecked = resourceCounts.pbs
	}
	if cfg.AnalyzePMG {
		runStats.pmgChecked = resourceCounts.pmg
	}
	if cfg.AnalyzeHosts {
		runStats.hostsChecked = resourceCounts.hosts
		runStats.trueNASChecked = resourceCounts.truenas
	}
	if cfg.AnalyzeKubernetes {
		runStats.kubernetesChecked = resourceCounts.kubernetes
	}
	runStats.resourceCount = runStats.nodesChecked + runStats.guestsChecked +
		runStats.dockerChecked + runStats.storageChecked + runStats.pbsChecked +
		runStats.pmgChecked + runStats.hostsChecked + runStats.trueNASChecked +
		runStats.kubernetesChecked

	// Determine if we can run LLM analysis (requires AI service + circuit breaker not open)
	aiServiceEnabled := p.aiService != nil && p.aiService.IsEnabled()
	canRunLLM := aiServiceEnabled && llmAllowed

	// Check quickstart credit status for messaging
	p.mu.RLock()
	qsMgr := p.quickstartCredits
	p.mu.RUnlock()

	// Check if we can run LLM analysis (AI-only patrol)
	if !canRunLLM {
		reason := "AI not configured - set up a provider in Settings > Pulse Assistant"
		if !aiServiceEnabled {
			if p.aiService != nil && strings.TrimSpace(p.aiService.QuickstartBlockedReason()) != "" {
				reason = p.aiService.QuickstartBlockedReason()
			} else if qsMgr != nil && !qsMgr.HasBYOK() && !qsMgr.HasCredits() {
				// Distinguish between exhausted credits and no AI configured.
				reason = patrolQuickstartCreditsExhaustedReason
			}
		} else if !llmAllowed {
			reason = "circuit breaker is open"
			GetPatrolMetrics().RecordCircuitBlock()
		}
		p.setBlockedReason(reason)
		log.Info().Str("reason", reason).Msg("AI Patrol: Skipping run - AI unavailable")
		return
	}

	// Check if using quickstart credits — verify credits remain before starting
	usingQuickstart := p.aiService != nil && p.aiService.IsUsingQuickstart()
	if usingQuickstart && p.aiService != nil && strings.TrimSpace(p.aiService.QuickstartBlockedReason()) != "" {
		p.setBlockedReason(p.aiService.QuickstartBlockedReason())
		log.Info().Str("reason", p.aiService.QuickstartBlockedReason()).Msg("AI Patrol: Skipping run - quickstart unavailable")
		return
	}
	if usingQuickstart && qsMgr != nil && !qsMgr.HasCredits() {
		p.setBlockedReason(patrolQuickstartCreditsExhaustedReason)
		log.Info().Msg("AI Patrol: Skipping run - quickstart credits exhausted")
		return
	}

	{
		p.clearBlockedReason()
		// Ensure stream state is clean for this run before the first streamed event.
		p.resetStreamForRun(runID)
		// Run agentic AI analysis — the LLM uses tools to investigate and reports findings
		aiResult, aiErr := p.runAIAnalysisState(ctx, state, scope, executionID)
		if aiErr != nil {
			log.Warn().Err(aiErr).Msg("AI Patrol: LLM analysis failed")
			runStats.errors++
			runStats.lastAIError = aiErr

			// Create a finding to surface this error to the user
			errMsg := aiErr.Error()
			var title, description, recommendation string
			if usingQuickstart && (strings.Contains(errMsg, "dial tcp") || strings.Contains(errMsg, "no such host") || strings.Contains(errMsg, "connection refused") || strings.Contains(errMsg, "i/o timeout")) {
				title = "Pulse Patrol: Quickstart credits require internet"
				description = "Pulse Patrol cannot reach the quickstart proxy server. Quickstart credits require an internet connection."
				recommendation = patrolQuickstartUnavailableReason
			} else if strings.Contains(errMsg, "Insufficient Balance") || strings.Contains(errMsg, "402") {
				title = "Pulse Patrol: Insufficient API credits"
				description = "Pulse Patrol cannot analyze your infrastructure because your provider account has insufficient credits."
				recommendation = "Add credits to your provider account (DeepSeek, OpenAI, etc.) or switch to a different provider in Pulse Assistant settings."
			} else if strings.Contains(errMsg, "401") || strings.Contains(errMsg, "Unauthorized") {
				title = "Pulse Patrol: Invalid API key"
				description = "Pulse Patrol cannot analyze your infrastructure because the API key is invalid or expired."
				recommendation = "Check your API key in Pulse Assistant settings and verify it is correct."
			} else if strings.Contains(errMsg, "rate limit") || strings.Contains(errMsg, "429") {
				title = "Pulse Patrol: Rate limited"
				description = "Pulse Patrol is being rate limited by your provider. Analysis will be retried on the next patrol run."
				recommendation = "Wait for the rate limit to reset, or consider upgrading your API plan for higher limits."
			} else {
				title = "Pulse Patrol: Analysis failed"
				description = fmt.Sprintf("Pulse Patrol encountered an error while analyzing your infrastructure: %s", errMsg)
				recommendation = "Check your Pulse Assistant settings and API key. If the problem persists, check the logs for more details."
			}

			errorFinding := &Finding{
				ID:             generateFindingID("ai-service", "reliability", "ai-patrol-error"),
				Key:            "ai-patrol-error",
				Severity:       "warning",
				Category:       "reliability",
				ResourceID:     "ai-service",
				ResourceName:   "Pulse Patrol Service",
				ResourceType:   "service",
				Title:          title,
				Description:    description,
				Recommendation: recommendation,
				Evidence:       fmt.Sprintf("Error: %s", errMsg),
				DetectedAt:     time.Now(),
				LastSeenAt:     time.Now(),
			}
			trackFinding(errorFinding)

			if usingQuickstart {
				switch {
				case providers.IsQuickstartCreditsExhausted(aiErr):
					p.setBlockedReason(patrolQuickstartCreditsExhaustedReason)
				case providers.IsQuickstartUnavailable(aiErr), quickstartBootstrapUnavailable(aiErr):
					p.setBlockedReason(patrolQuickstartUnavailableReason)
				}
			}
		} else if aiResult != nil {
			runStats.aiAnalysis = aiResult
			runStats.rejectedFindings = aiResult.RejectedFindings
			runStats.triageFlags = aiResult.TriageFlags
			runStats.triageSkippedLLM = aiResult.TriageSkippedLLM

			// Auto-resolve previous patrol error finding if this run succeeded
			errorFindingID := generateFindingID("ai-service", "reliability", "ai-patrol-error")
			if existing := p.findings.Get(errorFindingID); existing != nil && !existing.IsResolved() {
				p.findings.Resolve(errorFindingID, true) // auto-resolved
				if resolver := p.unifiedFindingResolver; resolver != nil {
					resolver(errorFindingID)
				}
				log.Info().Msg("AI Patrol: Auto-resolved previous patrol error finding after successful run")
			}

			// Findings are already recorded via patrol_report_finding tool calls.
			// Track stats from the collected findings.
			for _, f := range aiResult.Findings {
				if f.Severity == FindingSeverityWarning || f.Severity == FindingSeverityCritical {
					runStats.findingIDs = append(runStats.findingIDs, f.ID)
					// Check if this finding was new by looking at the store
					stored := p.findings.Get(f.ID)
					if stored != nil && stored.TimesRaised <= 1 {
						runStats.newFindings++
						newFindings = append(newFindings, f)
					} else {
						runStats.existingFindings++
					}
				}
			}
		}
	}

	// Count resolved findings: LLM-resolved (via tool) + auto-reconciled stale findings.
	var resolvedCount int
	if runStats.aiAnalysis != nil {
		resolvedCount = len(runStats.aiAnalysis.ResolvedIDs)

		// Auto-resolve stale findings: active findings that were presented to the LLM
		// in seed context but were neither re-reported nor explicitly resolved.
		// Only runs after successful full patrols (not scoped).
		autoResolved := p.reconcileStaleFindings(
			runStats.aiAnalysis.ReportedIDs,
			runStats.aiAnalysis.ResolvedIDs,
			runStats.aiAnalysis.SeededFindingIDs,
			runStats.errors > 0,
		)
		resolvedCount += autoResolved
		if autoResolved > 0 {
			log.Info().
				Int("auto_resolved", autoResolved).
				Msg("AI Patrol: Auto-resolved stale findings after full patrol")
		}
	}

	// Cleanup old resolved findings (always runs, doesn't require LLM)
	cleaned := p.findings.Cleanup(findingCleanupAge)
	if cleaned > 0 {
		log.Debug().Int("cleaned", cleaned).Msg("AI Patrol: Cleaned up old findings")
	}

	// Recover investigations stuck in "running" state (goroutine panicked or was killed)
	p.recoverStuckInvestigations()

	// Retry investigations that failed due to timeout (shorter cooldown than permanent failures)
	p.retryTimedOutInvestigations()

	// AI-based alert review: check active alerts against current state and auto-resolve fixed issues
	// Pass llmAllowed so it knows whether AI calls are allowed.
	alertsResolved := p.reviewAndResolveAlertsState(ctx, state, llmAllowed, executionID)
	if alertsResolved > 0 {
		log.Info().Int("alerts_resolved", alertsResolved).Msg("AI Patrol: Auto-resolved alerts where issues are fixed")
	}

	duration := time.Since(start)
	completedAt := time.Now()

	// Build findings summary string
	summary := p.findings.GetSummary()
	var findingsSummaryStr string
	var status string
	// Only count critical and warning as active issues (watch/info are filtered from UI)
	totalActive := summary.Critical + summary.Warning
	if totalActive == 0 {
		findingsSummaryStr = "All healthy"
		status = "healthy"
	} else {
		parts := []string{}
		if summary.Critical > 0 {
			parts = append(parts, fmt.Sprintf("%d critical", summary.Critical))
		}
		if summary.Warning > 0 {
			parts = append(parts, fmt.Sprintf("%d warning", summary.Warning))
		}
		findingsSummaryStr = joinParts(parts)
		if summary.Critical > 0 {
			status = "critical"
		} else {
			status = "issues_found"
		}
	}
	if runStats.errors > 0 {
		status = "error"
		// Don't claim "All healthy" if there were errors - the patrol didn't complete properly
		if findingsSummaryStr == "All healthy" {
			findingsSummaryStr = fmt.Sprintf("Analysis incomplete (%d errors)", runStats.errors)
		}
	}

	// Create run record
	runRecord := PatrolRunRecord{
		ID:                runID,
		StartedAt:         start,
		CompletedAt:       completedAt,
		Duration:          duration,
		DurationMs:        duration.Milliseconds(),
		Type:              patrolType,
		TriggerReason:     string(trigger),
		ResourcesChecked:  runStats.resourceCount,
		NodesChecked:      runStats.nodesChecked,
		GuestsChecked:     runStats.guestsChecked,
		DockerChecked:     runStats.dockerChecked,
		StorageChecked:    runStats.storageChecked,
		HostsChecked:      runStats.hostsChecked,
		TrueNASChecked:    runStats.trueNASChecked,
		PBSChecked:        runStats.pbsChecked,
		PMGChecked:        runStats.pmgChecked,
		KubernetesChecked: runStats.kubernetesChecked,
		NewFindings:       runStats.newFindings,
		ExistingFindings:  runStats.existingFindings,
		RejectedFindings:  runStats.rejectedFindings,
		ResolvedFindings:  resolvedCount,
		AutoFixCount:      0,
		FindingsSummary:   findingsSummaryStr,
		FindingIDs:        runStats.findingIDs,
		ErrorCount:        runStats.errors,
		Status:            status,
	}

	if scope != nil {
		runRecord.ScopeResourceIDs = scope.ResourceIDs
		runRecord.ScopeResourceTypes = scope.ResourceTypes
		runRecord.ScopeContext = scope.Context
		runRecord.AlertIdentifier = scope.AlertIdentifier
		runRecord.FindingID = scope.FindingID
	}

	// Add AI analysis details if available
	if runStats.aiAnalysis != nil {
		runRecord.AIAnalysis = runStats.aiAnalysis.Response
		runRecord.InputTokens = runStats.aiAnalysis.InputTokens
		runRecord.OutputTokens = runStats.aiAnalysis.OutputTokens
		runRecord.TriageFlags = runStats.triageFlags
		runRecord.TriageSkippedLLM = runStats.triageSkippedLLM
		toolCalls := runStats.aiAnalysis.ToolCalls
		if len(toolCalls) > MaxToolCallsPerRun {
			toolCalls = toolCalls[:MaxToolCallsPerRun]
		}
		runRecord.ToolCalls = toolCalls
		runRecord.ToolCallCount = len(runStats.aiAnalysis.ToolCalls)
		log.Debug().
			Int("response_length", len(runStats.aiAnalysis.Response)).
			Int("input_tokens", runStats.aiAnalysis.InputTokens).
			Int("output_tokens", runStats.aiAnalysis.OutputTokens).
			Int("tool_calls", runRecord.ToolCallCount).
			Msg("AI Patrol: Storing AI analysis in run record")
	} else {
		log.Debug().Msg("AI Patrol: No AI analysis to store (aiAnalysis is nil)")
	}

	p.mu.Lock()
	p.lastActivity = completedAt
	p.lastFullPatrol = completedAt
	p.lastDuration = duration
	p.resourcesChecked = runStats.resourceCount
	p.errorCount = runStats.errors
	p.mu.Unlock()

	// Record circuit breaker result only if we actually attempted LLM calls.
	// canRunLLM is true only when AI is enabled, licensed, AND breaker allowed.
	// Use error categorization so non-transient errors (auth failures, insufficient
	// credits) don't trip the breaker — those won't be fixed by waiting.
	if breaker != nil && canRunLLM {
		if runStats.errors > 0 {
			aiErr := runStats.lastAIError
			if aiErr == nil {
				aiErr = fmt.Errorf("patrol completed with %d errors", runStats.errors)
			}
			breaker.RecordFailureWithCategory(aiErr, circuit.CategorizeError(aiErr))
		} else {
			breaker.RecordSuccess()
		}
	}

	// Add to history store (handles persistence automatically)
	p.runHistoryStore.Add(runRecord)

	log.Info().
		Str("type", patrolType).
		Dur("duration", duration).
		Int("resources", runStats.resourceCount).
		Int("new_findings", runStats.newFindings).
		Int("resolved", resolvedCount).
		Int("critical", summary.Critical).
		Int("warning", summary.Warning).
		Int("watch", summary.Watch).
		Msg("AI Patrol: Completed patrol run")
}

// runScopedPatrol runs a patrol on a filtered subset of resources.
// This provides token-efficient analysis for event-driven patrols.
func (p *PatrolService) runScopedPatrol(ctx context.Context, scope PatrolScope) {
	p.mu.RLock()
	cfg := p.config
	breaker := p.circuitBreaker
	p.mu.RUnlock()

	if !cfg.Enabled {
		return
	}

	if !p.tryStartRun("scoped") {
		// Re-queue with backoff if retries remain
		if scope.RetryCount < scopedPatrolMaxRetries {
			scope.RetryCount++
			backoff := scopedPatrolRetryBackoff1
			if scope.RetryCount == scopedPatrolMaxRetries {
				backoff = scopedPatrolRetryBackoff2
			}
			scope.RetryAfter = time.Now().Add(backoff)
			if tm := p.GetTriggerManager(); tm != nil {
				tm.TriggerPatrol(scope)
				log.Info().
					Int("retry", scope.RetryCount).
					Dur("backoff", backoff).
					Strs("resources", scope.ResourceIDs).
					Msg("AI Patrol: Re-queued dropped scoped patrol with backoff")
			}
		} else {
			GetPatrolMetrics().RecordScopedDroppedFinal()
			log.Error().
				Strs("resources", scope.ResourceIDs).
				Str("reason", string(scope.Reason)).
				Msg("AI Patrol: Scoped patrol permanently dropped after 2 retries")
		}
		return
	}
	defer p.endRun()

	// Check if circuit breaker allows LLM calls.
	llmAllowed := breaker == nil || breaker.Allow()
	if !llmAllowed {
		log.Warn().Msg("AI Patrol: Circuit breaker is open for scoped patrol (LLM calls blocked)")
	}

	start := time.Now()
	runID := fmt.Sprintf("%d", start.UnixNano())
	executionID := uuid.NewString()
	GetPatrolMetrics().RecordRun(string(scope.Reason), "scoped")
	var runStats struct {
		resourceCount     int
		nodesChecked      int
		guestsChecked     int
		dockerChecked     int
		storageChecked    int
		hostsChecked      int
		trueNASChecked    int
		pbsChecked        int
		pmgChecked        int
		kubernetesChecked int
		newFindings       int
		existingFindings  int
		rejectedFindings  int
		triageFlags       int
		triageSkippedLLM  bool
		findingIDs        []string
		errors            int
		aiAnalysis        *AIAnalysisResult
	}

	// Get current state
	if !p.hasPatrolRuntimeInputs() {
		log.Warn().Msg("AI Patrol: No runtime state available for scoped patrol")
		return
	}

	fullState := p.currentPatrolRuntimeState()

	// Filter state based on scope
	filteredState := p.filterStateByScopeState(fullState, scope)
	effectiveScopeIDs := patrolRuntimeSortedResourceIDs(filteredState)

	resourceCounts := patrolRuntimeCountResources(filteredState)
	resourceCount := 0
	if cfg.AnalyzeNodes {
		resourceCount += resourceCounts.nodes
	}
	if cfg.AnalyzeGuests {
		resourceCount += resourceCounts.guests
	}
	if cfg.AnalyzeDocker {
		resourceCount += resourceCounts.docker
	}
	if cfg.AnalyzeStorage {
		resourceCount += resourceCounts.storage
	}
	if cfg.AnalyzePBS {
		resourceCount += resourceCounts.pbs
	}
	if cfg.AnalyzeHosts {
		resourceCount += resourceCounts.hosts
		resourceCount += resourceCounts.truenas
	}
	if cfg.AnalyzeKubernetes {
		resourceCount += resourceCounts.kubernetes
	}
	if cfg.AnalyzePMG {
		resourceCount += resourceCounts.pmg
	}

	if resourceCount == 0 {
		log.Debug().
			Strs("requested_ids", scope.ResourceIDs).
			Strs("requested_types", scope.ResourceTypes).
			Int("effective_scope_count", len(effectiveScopeIDs)).
			Msg("AI Patrol: No resources matched scope filter")
		return
	}

	log.Debug().
		Strs("requested_ids", scope.ResourceIDs).
		Strs("requested_types", scope.ResourceTypes).
		Strs("effective_scope_ids", patrolLogResourceIDs(effectiveScopeIDs)).
		Int("effective_scope_count", len(effectiveScopeIDs)).
		Int("resource_count", resourceCount).
		Str("reason", string(scope.Reason)).
		Msg("AI Patrol: Running scoped analysis")

	// Track run statistics
	if cfg.AnalyzeNodes {
		runStats.nodesChecked = resourceCounts.nodes
	}
	if cfg.AnalyzeGuests {
		runStats.guestsChecked = resourceCounts.guests
	}
	if cfg.AnalyzeDocker {
		runStats.dockerChecked = resourceCounts.docker
	}
	if cfg.AnalyzeStorage {
		runStats.storageChecked = resourceCounts.storage
	}
	if cfg.AnalyzePBS {
		runStats.pbsChecked = resourceCounts.pbs
	}
	if cfg.AnalyzeHosts {
		runStats.hostsChecked = resourceCounts.hosts
		runStats.trueNASChecked = resourceCounts.truenas
	}
	if cfg.AnalyzeKubernetes {
		runStats.kubernetesChecked = resourceCounts.kubernetes
	}
	if cfg.AnalyzePMG {
		runStats.pmgChecked = resourceCounts.pmg
	}
	runStats.resourceCount = resourceCount

	// Determine if we can run LLM analysis
	aiServiceEnabled := p.aiService != nil && p.aiService.IsEnabled()
	canRunLLM := aiServiceEnabled && llmAllowed

	// Check quickstart credit status for scoped runs
	p.mu.RLock()
	scopedQsMgr := p.quickstartCredits
	p.mu.RUnlock()
	scopedUsingQuickstart := p.aiService != nil && p.aiService.IsUsingQuickstart()

	if !canRunLLM {
		reason := "AI not configured - set up a provider in Settings > Pulse Assistant"
		if !aiServiceEnabled {
			if p.aiService != nil && strings.TrimSpace(p.aiService.QuickstartBlockedReason()) != "" {
				reason = p.aiService.QuickstartBlockedReason()
			} else if scopedQsMgr != nil && !scopedQsMgr.HasBYOK() && !scopedQsMgr.HasCredits() {
				reason = patrolQuickstartCreditsExhaustedReason
			}
		} else if !llmAllowed {
			reason = "circuit breaker is open"
			GetPatrolMetrics().RecordCircuitBlock()
		}
		p.setBlockedReason(reason)
		log.Info().Str("reason", reason).Msg("AI Patrol: Skipping scoped run - AI unavailable")
		return
	}

	// Check if using quickstart credits — verify credits remain before starting
	if scopedUsingQuickstart && p.aiService != nil && strings.TrimSpace(p.aiService.QuickstartBlockedReason()) != "" {
		p.setBlockedReason(p.aiService.QuickstartBlockedReason())
		log.Info().Str("reason", p.aiService.QuickstartBlockedReason()).Msg("AI Patrol: Skipping scoped run - quickstart unavailable")
		return
	}
	if scopedUsingQuickstart && scopedQsMgr != nil && !scopedQsMgr.HasCredits() {
		p.setBlockedReason(patrolQuickstartCreditsExhaustedReason)
		log.Info().Msg("AI Patrol: Skipping scoped run - quickstart credits exhausted")
		return
	}

	{
		p.clearBlockedReason()
		if !scope.NoStream {
			// Ensure stream state is clean for this run before the first streamed event.
			p.resetStreamForRun(runID)
		}
		// Run agentic AI analysis on filtered state with scope
		aiResult, aiErr := p.runAIAnalysisState(ctx, filteredState, &scope, executionID)
		if aiErr != nil {
			log.Warn().Err(aiErr).Msg("AI Patrol (scoped): LLM analysis failed")
			runStats.errors++
			if scopedUsingQuickstart {
				switch {
				case providers.IsQuickstartCreditsExhausted(aiErr):
					p.setBlockedReason(patrolQuickstartCreditsExhaustedReason)
				case providers.IsQuickstartUnavailable(aiErr), quickstartBootstrapUnavailable(aiErr):
					p.setBlockedReason(patrolQuickstartUnavailableReason)
				}
			}
		} else if aiResult != nil {
			runStats.aiAnalysis = aiResult
			runStats.rejectedFindings = aiResult.RejectedFindings
			runStats.triageFlags = aiResult.TriageFlags
			runStats.triageSkippedLLM = aiResult.TriageSkippedLLM

			// Findings are already recorded via patrol_report_finding tool calls.
			for _, f := range aiResult.Findings {
				if f.Severity == FindingSeverityWarning || f.Severity == FindingSeverityCritical {
					runStats.findingIDs = append(runStats.findingIDs, f.ID)
					stored := p.findings.Get(f.ID)
					if stored != nil && stored.TimesRaised <= 1 {
						runStats.newFindings++
					} else {
						runStats.existingFindings++
					}
				}
			}
		}
	}

	duration := time.Since(start)
	completedAt := time.Now()

	// Build findings summary string
	summary := p.findings.GetSummary()
	var findingsSummaryStr string
	var status string
	totalActive := summary.Critical + summary.Warning
	if totalActive == 0 {
		findingsSummaryStr = "All healthy"
		status = "healthy"
	} else {
		parts := []string{}
		if summary.Critical > 0 {
			parts = append(parts, fmt.Sprintf("%d critical", summary.Critical))
		}
		if summary.Warning > 0 {
			parts = append(parts, fmt.Sprintf("%d warning", summary.Warning))
		}
		findingsSummaryStr = joinParts(parts)
		if summary.Critical > 0 {
			status = "critical"
		} else {
			status = "issues_found"
		}
	}
	if runStats.errors > 0 {
		status = "error"
		if findingsSummaryStr == "All healthy" {
			findingsSummaryStr = fmt.Sprintf("Analysis incomplete (%d errors)", runStats.errors)
		}
	}

	runRecord := PatrolRunRecord{
		ID:                        runID,
		StartedAt:                 start,
		CompletedAt:               completedAt,
		Duration:                  duration,
		DurationMs:                duration.Milliseconds(),
		Type:                      "scoped",
		TriggerReason:             string(scope.Reason),
		ScopeResourceIDs:          scope.ResourceIDs,
		EffectiveScopeResourceIDs: effectiveScopeIDs,
		ScopeResourceTypes:        scope.ResourceTypes,
		ScopeContext:              scope.Context,
		AlertIdentifier:           scope.AlertIdentifier,
		FindingID:                 scope.FindingID,
		ResourcesChecked:          runStats.resourceCount,
		NodesChecked:              runStats.nodesChecked,
		GuestsChecked:             runStats.guestsChecked,
		DockerChecked:             runStats.dockerChecked,
		StorageChecked:            runStats.storageChecked,
		HostsChecked:              runStats.hostsChecked,
		TrueNASChecked:            runStats.trueNASChecked,
		PBSChecked:                runStats.pbsChecked,
		PMGChecked:                runStats.pmgChecked,
		KubernetesChecked:         runStats.kubernetesChecked,
		NewFindings:               runStats.newFindings,
		ExistingFindings:          runStats.existingFindings,
		RejectedFindings:          runStats.rejectedFindings,
		FindingsSummary:           findingsSummaryStr,
		FindingIDs:                runStats.findingIDs,
		ErrorCount:                runStats.errors,
		Status:                    status,
	}

	if runStats.aiAnalysis != nil {
		runRecord.AIAnalysis = runStats.aiAnalysis.Response
		runRecord.InputTokens = runStats.aiAnalysis.InputTokens
		runRecord.OutputTokens = runStats.aiAnalysis.OutputTokens
		runRecord.TriageFlags = runStats.triageFlags
		runRecord.TriageSkippedLLM = runStats.triageSkippedLLM
		toolCalls := runStats.aiAnalysis.ToolCalls
		if len(toolCalls) > MaxToolCallsPerRun {
			toolCalls = toolCalls[:MaxToolCallsPerRun]
		}
		runRecord.ToolCalls = toolCalls
		runRecord.ToolCallCount = len(runStats.aiAnalysis.ToolCalls)
	}

	p.mu.Lock()
	p.lastActivity = completedAt
	p.lastDuration = duration
	p.resourcesChecked = runStats.resourceCount
	p.errorCount = runStats.errors
	p.mu.Unlock()

	p.runHistoryStore.Add(runRecord)

	log.Info().
		Strs("requested_ids", scope.ResourceIDs).
		Strs("requested_types", scope.ResourceTypes).
		Strs("effective_scope_ids", patrolLogResourceIDs(effectiveScopeIDs)).
		Int("effective_scope_count", len(effectiveScopeIDs)).
		Dur("duration", duration).
		Int("resources", resourceCount).
		Str("reason", string(scope.Reason)).
		Msg("AI Patrol: Scoped patrol complete")
}

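// patrolLogResourceIDs truncates long ID lists for logging, keeping the first
// scopedPatrolLogIDLimit entries and appending a "+N more" marker.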
func patrolLogResourceIDs(ids []string) []string {
	if len(ids) <= scopedPatrolLogIDLimit {
		return ids
	}

	trimmed := append([]string(nil), ids[:scopedPatrolLogIDLimit]...)
	trimmed = append(trimmed, fmt.Sprintf("... +%d more", len(ids)-scopedPatrolLogIDLimit))
	return trimmed
}

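// patrolScopeMatcher holds the normalized resource-ID and resource-type sets
// of a PatrolScope; an empty set matches everything for that dimension.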
type patrolScopeMatcher struct {
	resourceIDSet map[string]bool
	typeSet       map[string]bool
	hasIDs        bool
	hasTypes      bool
}

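// newPatrolScopeMatcher builds a matcher from a scope, trimming blank IDs and
// lower-casing resource types so matching is case-insensitive.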
func newPatrolScopeMatcher(scope PatrolScope) patrolScopeMatcher {
	resourceIDSet := make(map[string]bool)
	for _, id := range scope.ResourceIDs {
		trimmed := strings.TrimSpace(id)
		if trimmed == "" {
			continue
		}
		resourceIDSet[trimmed] = true
	}

	typeSet := make(map[string]bool)
	addScopeType := func(t string) {
		trimmed := strings.TrimSpace(strings.ToLower(t))
		if trimmed == "" {
			return
		}
		switch trimmed {
		case "docker-host", "app-container":
			typeSet["docker-host"] = true
			typeSet["app-container"] = true
		case "k8s-cluster":
			typeSet["k8s-cluster"] = true
		case "system-container", "vm", "node", "storage", "agent", "pbs", "pmg", "physical_disk":
			typeSet[trimmed] = true
		default:
			typeSet[trimmed] = true
		}
	}
	for _, t := range scope.ResourceTypes {
		addScopeType(t)
	}

	return patrolScopeMatcher{
		resourceIDSet: resourceIDSet,
		typeSet:       typeSet,
		hasIDs:        len(resourceIDSet) > 0,
		hasTypes:      len(typeSet) > 0,
	}
}

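// matchesType reports whether any candidate type is in scope; an empty type
// set matches all candidates.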
func (m patrolScopeMatcher) matchesType(candidates ...string) bool {
	if !m.hasTypes {
		return true
	}
	for _, candidate := range candidates {
		if candidate == "" {
			continue
		}
		if m.typeSet[strings.ToLower(candidate)] {
			return true
		}
	}
	return false
}

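// matchesID reports whether any candidate ID is in scope; an empty ID set
// matches all candidates.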
func (m patrolScopeMatcher) matchesID(candidates ...string) bool {
	if !m.hasIDs {
		return true
	}
	for _, candidate := range candidates {
		if candidate == "" {
			continue
		}
		if m.resourceIDSet[candidate] {
			return true
		}
	}
	return false
}

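// patrolScopedFilterState accumulates the filtered runtime state together with
// the resource IDs and guest VMIDs that survived scope filtering.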
type patrolScopedFilterState struct {
	filtered            patrolRuntimeState
	includedResourceIDs map[string]bool
	includedGuestVMIDs  map[int]bool
}

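// newPatrolScopedFilterState seeds a filter state that shares the snapshot's
// read state and unified resource provider but starts with empty inclusions.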
func newPatrolScopedFilterState(snap patrolRuntimeState) patrolScopedFilterState {
	return patrolScopedFilterState{
		filtered: patrolRuntimeState{
			readState:               snap.readState,
			unifiedResourceProvider: snap.unifiedResourceProvider,
		},
		includedResourceIDs: make(map[string]bool),
		includedGuestVMIDs:  make(map[int]bool),
	}
}

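// includeResourceID and includeGuestVMID record which resource IDs and guest
// VMIDs made it through filtering so dependent metadata can be narrowed later.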
func (s *patrolScopedFilterState) includeResourceID(ids ...string) {
	for _, id := range ids {
		if strings.TrimSpace(id) == "" {
			continue
		}
		s.includedResourceIDs[id] = true
	}
}

func (s *patrolScopedFilterState) includeGuestVMID(vmid int) {
	if vmid > 0 {
		s.includedGuestVMIDs[vmid] = true
	}
}

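// patrolDockerScopeName picks the best display name for a Docker host,
// preferring the custom display name, then the display name, then the hostname.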
func patrolDockerScopeName(d models.DockerHost) string {
	hostName := d.CustomDisplayName
	if hostName == "" {
		hostName = d.DisplayName
	}
	if hostName == "" {
		hostName = d.Hostname
	}
	return hostName
}

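// scopePatrolDockerHost filters one Docker host against the matcher. It
// returns the (possibly container-filtered) host, the resource IDs to record
// as included, and whether the host belongs in the scoped state at all.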
func scopePatrolDockerHost(d models.DockerHost, matcher patrolScopeMatcher) (models.DockerHost, []string, bool) {
	if !matcher.matchesType("docker-host", "app-container") {
		return models.DockerHost{}, nil, false
	}

	hostMatches := matcher.matchesID(d.ID, patrolDockerScopeName(d), d.Hostname, d.DisplayName, d.CustomDisplayName)
	if !matcher.hasIDs {
		included := make([]string, 0, len(d.Containers)+1)
		if matcher.typeSet["docker-host"] || !matcher.hasTypes {
			included = append(included, d.ID)
		}
		if matcher.typeSet["app-container"] || !matcher.hasTypes {
			for _, c := range d.Containers {
				included = append(included, c.ID)
			}
		}
		return d, included, true
	}

	matchedContainers := make([]models.DockerContainer, 0)
	for _, c := range d.Containers {
		if matcher.matchesID(c.ID, c.Name) {
			matchedContainers = append(matchedContainers, c)
		}
	}

	if hostMatches {
		included := make([]string, 0, len(d.Containers)+1)
		if matcher.typeSet["docker-host"] || !matcher.hasTypes {
			included = append(included, d.ID)
		}
		if matcher.typeSet["app-container"] || !matcher.hasTypes {
			for _, c := range d.Containers {
				included = append(included, c.ID)
			}
		}
		return d, included, true
	}
	if len(matchedContainers) > 0 {
		hostCopy := d
		hostCopy.Containers = matchedContainers
		included := make([]string, 0, len(matchedContainers))
		for _, c := range matchedContainers {
			included = append(included, c.ID)
		}
		return hostCopy, included, true
	}

	return models.DockerHost{}, nil, false
}

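// collectPatrolScopedDockerHosts applies scopePatrolDockerHost to each host
// and gathers the surviving hosts plus their included resource IDs.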
func collectPatrolScopedDockerHosts(hosts []models.DockerHost, matcher patrolScopeMatcher) ([]models.DockerHost, []string) {
	filtered := make([]models.DockerHost, 0, len(hosts))
	ids := make([]string, 0, len(hosts))
	for _, host := range hosts {
		scopedHost, includeIDs, ok := scopePatrolDockerHost(host, matcher)
		if !ok {
			continue
		}
		filtered = append(filtered, scopedHost)
		ids = append(ids, includeIDs...)
	}
	return filtered, ids
}

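// scopePatrolPBSInstance reports whether a PBS instance matches the scope,
// either directly or through one of its datastores, backup jobs, or verify jobs.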
func scopePatrolPBSInstance(pbs models.PBSInstance, matcher patrolScopeMatcher) bool {
	if !matcher.matchesType("pbs") {
		return false
	}

	pbsName := pbs.Name
	if pbsName == "" {
		pbsName = pbs.Host
	}
	pbsMatches := matcher.matchesID(pbs.ID, pbs.Name, pbsName, pbs.Host)
	if !matcher.hasIDs {
		return true
	}
	if !pbsMatches {
		for _, ds := range pbs.Datastores {
			if matcher.matchesID(pbs.ID+":"+ds.Name, ds.Name) {
				pbsMatches = true
				break
			}
		}
	}
	if !pbsMatches {
		for _, job := range pbs.BackupJobs {
			if matcher.matchesID(pbs.ID+":job:"+job.ID, job.ID) {
				pbsMatches = true
				break
			}
		}
	}
	if !pbsMatches {
		for _, job := range pbs.VerifyJobs {
			if matcher.matchesID(pbs.ID+":verify:"+job.ID, job.ID) {
				pbsMatches = true
				break
			}
		}
	}
	return pbsMatches
}

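// The scopePatrol* predicates below report whether a single resource matches
// both the scope's type filter and its ID filter.
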
func scopePatrolNode(n models.Node, matcher patrolScopeMatcher) bool {
	return matcher.matchesType("node") && matcher.matchesID(n.ID, n.Name)
}

func scopePatrolVM(vm models.VM, matcher patrolScopeMatcher) bool {
	return matcher.matchesType("vm") && matcher.matchesID(vm.ID, vm.Name)
}

func scopePatrolContainer(ct models.Container, matcher patrolScopeMatcher) bool {
	return matcher.matchesType("system-container") && matcher.matchesID(ct.ID, ct.Name)
}

func scopePatrolStorage(storage models.Storage, matcher patrolScopeMatcher) bool {
	return matcher.matchesType("storage") && matcher.matchesID(storage.ID, storage.Name)
}

func scopePatrolPhysicalDisk(disk models.PhysicalDisk, matcher patrolScopeMatcher) bool {
	return matcher.matchesType("physical_disk") && matcher.matchesID(disk.ID, disk.DevPath, disk.Model)
}

func scopePatrolPMGInstance(pmg models.PMGInstance, matcher patrolScopeMatcher) bool {
	return matcher.matchesType("pmg") && matcher.matchesID(pmg.ID, pmg.Name, pmg.Host)
}

func scopePatrolHost(h models.Host, matcher patrolScopeMatcher) bool {
	return matcher.matchesType("agent") && matcher.matchesID(h.ID, h.DisplayName, h.Hostname)
}

func scopePatrolKubernetesCluster(k models.KubernetesCluster, matcher patrolScopeMatcher) bool {
	return matcher.matchesType("k8s-cluster") && matcher.matchesID(k.ID, patrolKubernetesScopeName(k))
}

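// The collectPatrolScoped* helpers below run each resource slice through its
// scope predicate and return the kept resources plus their resource IDs (and,
// for guests, their VMIDs).
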
func collectPatrolScopedNodes(nodes []models.Node, matcher patrolScopeMatcher) ([]models.Node, []string) {
	filtered := make([]models.Node, 0, len(nodes))
	ids := make([]string, 0, len(nodes))
	for _, n := range nodes {
		if !scopePatrolNode(n, matcher) {
			continue
		}
		filtered = append(filtered, n)
		ids = append(ids, n.ID)
	}
	return filtered, ids
}

func collectPatrolScopedVMs(vms []models.VM, matcher patrolScopeMatcher) ([]models.VM, []string, []int) {
	filtered := make([]models.VM, 0, len(vms))
	ids := make([]string, 0, len(vms))
	vmids := make([]int, 0, len(vms))
	for _, vm := range vms {
		if !scopePatrolVM(vm, matcher) {
			continue
		}
		filtered = append(filtered, vm)
		ids = append(ids, vm.ID)
		vmids = append(vmids, vm.VMID)
	}
	return filtered, ids, vmids
}

func collectPatrolScopedContainers(containers []models.Container, matcher patrolScopeMatcher) ([]models.Container, []string, []int) {
	filtered := make([]models.Container, 0, len(containers))
	ids := make([]string, 0, len(containers))
	vmids := make([]int, 0, len(containers))
	for _, ct := range containers {
		if !scopePatrolContainer(ct, matcher) {
			continue
		}
		filtered = append(filtered, ct)
		ids = append(ids, ct.ID)
		vmids = append(vmids, ct.VMID)
	}
	return filtered, ids, vmids
}

func collectPatrolScopedStorage(storage []models.Storage, matcher patrolScopeMatcher) ([]models.Storage, []string) {
	filtered := make([]models.Storage, 0, len(storage))
	ids := make([]string, 0, len(storage))
	for _, s := range storage {
		if !scopePatrolStorage(s, matcher) {
			continue
		}
		filtered = append(filtered, s)
		ids = append(ids, s.ID)
	}
	return filtered, ids
}

func collectPatrolScopedPhysicalDisks(disks []models.PhysicalDisk, matcher patrolScopeMatcher) ([]models.PhysicalDisk, []string) {
	filtered := make([]models.PhysicalDisk, 0, len(disks))
	ids := make([]string, 0, len(disks)*2)
	for _, disk := range disks {
		if !scopePatrolPhysicalDisk(disk, matcher) {
			continue
		}
		filtered = append(filtered, disk)
		ids = append(ids, disk.ID, disk.DevPath)
	}
	return filtered, ids
}

func collectPatrolScopedPBSInstances(instances []models.PBSInstance, matcher patrolScopeMatcher) ([]models.PBSInstance, []string) {
	filtered := make([]models.PBSInstance, 0, len(instances))
	ids := make([]string, 0, len(instances))
	for _, pbs := range instances {
		if !scopePatrolPBSInstance(pbs, matcher) {
			continue
		}
		filtered = append(filtered, pbs)
		ids = append(ids, pbs.ID)
	}
	return filtered, ids
}

func collectPatrolScopedPMGInstances(instances []models.PMGInstance, matcher patrolScopeMatcher) ([]models.PMGInstance, []string) {
	filtered := make([]models.PMGInstance, 0, len(instances))
	ids := make([]string, 0, len(instances))
	for _, pmg := range instances {
		if !scopePatrolPMGInstance(pmg, matcher) {
			continue
		}
		filtered = append(filtered, pmg)
		ids = append(ids, pmg.ID)
	}
	return filtered, ids
}

func collectPatrolScopedHosts(hosts []models.Host, matcher patrolScopeMatcher) ([]models.Host, []string) {
	filtered := make([]models.Host, 0, len(hosts))
	ids := make([]string, 0, len(hosts))
	for _, h := range hosts {
		if !scopePatrolHost(h, matcher) {
			continue
		}
		filtered = append(filtered, h)
		ids = append(ids, h.ID)
	}
	return filtered, ids
}

func collectPatrolScopedKubernetesClusters(clusters []models.KubernetesCluster, matcher patrolScopeMatcher) ([]models.KubernetesCluster, []string) {
	filtered := make([]models.KubernetesCluster, 0, len(clusters))
	ids := make([]string, 0, len(clusters))
	for _, k := range clusters {
		if !scopePatrolKubernetesCluster(k, matcher) {
			continue
		}
		filtered = append(filtered, k)
		ids = append(ids, k.ID)
	}
	return filtered, ids
}

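// patrolKubernetesScopeName picks the best display name for a Kubernetes
// cluster, preferring the custom display name, then the display name, then
// the cluster name.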
func patrolKubernetesScopeName(k models.KubernetesCluster) string {
	clusterName := k.CustomDisplayName
	if clusterName == "" {
		clusterName = k.DisplayName
	}
	if clusterName == "" {
		clusterName = k.Name
	}
	return clusterName
}

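// The helpers below narrow alert, connection-health, and backup metadata to
// the resource IDs and guest VMIDs that survived scope filtering.
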
func collectPatrolScopedActiveAlerts(alerts []models.Alert, includedResourceIDs map[string]bool) []models.Alert {
	filtered := make([]models.Alert, 0, len(alerts))
	for _, alert := range alerts {
		if includedResourceIDs[alert.ResourceID] {
			filtered = append(filtered, alert)
		}
	}
	return filtered
}

func collectPatrolScopedResolvedAlerts(alerts []models.ResolvedAlert, includedResourceIDs map[string]bool) []models.ResolvedAlert {
	filtered := make([]models.ResolvedAlert, 0, len(alerts))
	for _, resolved := range alerts {
		if includedResourceIDs[resolved.ResourceID] {
			filtered = append(filtered, resolved)
		}
	}
	return filtered
}

func collectPatrolScopedConnectionHealth(connectionHealth map[string]bool, includedResourceIDs map[string]bool) map[string]bool {
	filtered := make(map[string]bool, len(connectionHealth))
	for resourceID, healthy := range connectionHealth {
		if includedResourceIDs[resourceID] {
			filtered[resourceID] = healthy
		}
	}
	return filtered
}

func collectPatrolScopedBackupTasks(tasks []models.BackupTask, includedGuestVMIDs map[int]bool) []models.BackupTask {
	filtered := make([]models.BackupTask, 0, len(tasks))
	for _, backupTask := range tasks {
		if includedGuestVMIDs[backupTask.VMID] {
			filtered = append(filtered, backupTask)
		}
	}
	return filtered
}

func collectPatrolScopedStorageBackups(backups []models.StorageBackup, includedGuestVMIDs map[int]bool) []models.StorageBackup {
	filtered := make([]models.StorageBackup, 0, len(backups))
	for _, storageBackup := range backups {
		if includedGuestVMIDs[storageBackup.VMID] {
			filtered = append(filtered, storageBackup)
		}
	}
	return filtered
}

func collectPatrolScopedGuestSnapshots(snapshots []models.GuestSnapshot, includedGuestVMIDs map[int]bool) []models.GuestSnapshot {
	filtered := make([]models.GuestSnapshot, 0, len(snapshots))
	for _, guestSnapshot := range snapshots {
		if includedGuestVMIDs[guestSnapshot.VMID] {
			filtered = append(filtered, guestSnapshot)
		}
	}
	return filtered
}

func collectPatrolScopedPBSBackups(backups []models.PBSBackup, includedGuestVMIDs map[int]bool) []models.PBSBackup {
	filtered := make([]models.PBSBackup, 0, len(backups))
	for _, backup := range backups {
		vmid, err := strconv.Atoi(backup.VMID)
		if err == nil && includedGuestVMIDs[vmid] {
			filtered = append(filtered, backup)
		}
	}
	return filtered
}

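// copyScopedPatrolMetadata carries scoped alerts, connection health, and
// backup metadata from the full snapshot into the filtered state.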
func copyScopedPatrolMetadata(dst *patrolRuntimeState, snap patrolRuntimeState, includedResourceIDs map[string]bool, includedGuestVMIDs map[int]bool) {
	if len(snap.ActiveAlerts) > 0 {
		dst.ActiveAlerts = collectPatrolScopedActiveAlerts(snap.ActiveAlerts, includedResourceIDs)
	}
	if len(snap.RecentlyResolved) > 0 {
		dst.RecentlyResolved = collectPatrolScopedResolvedAlerts(snap.RecentlyResolved, includedResourceIDs)
	}
	if len(snap.ConnectionHealth) > 0 {
		dst.ConnectionHealth = collectPatrolScopedConnectionHealth(snap.ConnectionHealth, includedResourceIDs)
	}
	if len(includedGuestVMIDs) == 0 {
		return
	}

	dst.PVEBackups.BackupTasks = collectPatrolScopedBackupTasks(snap.PVEBackups.BackupTasks, includedGuestVMIDs)
	dst.PVEBackups.StorageBackups = collectPatrolScopedStorageBackups(snap.PVEBackups.StorageBackups, includedGuestVMIDs)
	dst.PVEBackups.GuestSnapshots = collectPatrolScopedGuestSnapshots(snap.PVEBackups.GuestSnapshots, includedGuestVMIDs)
	dst.PBSBackups = collectPatrolScopedPBSBackups(snap.PBSBackups, includedGuestVMIDs)
}

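// filterStateByScopeState narrows a runtime state snapshot to the resources
// matched by the scope, then rebuilds derived providers over the result.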
func (p *PatrolService) filterStateByScopeState(snap patrolRuntimeState, scope PatrolScope) patrolRuntimeState {
	matcher := newPatrolScopeMatcher(scope)
	filterState := newPatrolScopedFilterState(snap)

	filteredNodes, nodeIDs := collectPatrolScopedNodes(snap.Nodes, matcher)
	filterState.filtered.Nodes = filteredNodes
	filterState.includeResourceID(nodeIDs...)

	filteredVMs, vmIDs, guestVMIDs := collectPatrolScopedVMs(snap.VMs, matcher)
	filterState.filtered.VMs = filteredVMs
	filterState.includeResourceID(vmIDs...)
	for _, vmid := range guestVMIDs {
		filterState.includeGuestVMID(vmid)
	}

	filteredContainers, containerIDs, containerVMIDs := collectPatrolScopedContainers(snap.Containers, matcher)
	filterState.filtered.Containers = filteredContainers
	filterState.includeResourceID(containerIDs...)
	for _, vmid := range containerVMIDs {
		filterState.includeGuestVMID(vmid)
	}

	filteredDockerHosts, dockerIDs := collectPatrolScopedDockerHosts(snap.DockerHosts, matcher)
	filterState.filtered.DockerHosts = filteredDockerHosts
	filterState.includeResourceID(dockerIDs...)

	filteredStorage, storageIDs := collectPatrolScopedStorage(snap.Storage, matcher)
	filterState.filtered.Storage = filteredStorage
	filterState.includeResourceID(storageIDs...)

	filteredDisks, diskIDs := collectPatrolScopedPhysicalDisks(snap.PhysicalDisks, matcher)
	filterState.filtered.PhysicalDisks = filteredDisks
	filterState.includeResourceID(diskIDs...)

	filteredPBS, pbsIDs := collectPatrolScopedPBSInstances(snap.PBSInstances, matcher)
	filterState.filtered.PBSInstances = filteredPBS
	filterState.includeResourceID(pbsIDs...)

	filteredPMG, pmgIDs := collectPatrolScopedPMGInstances(snap.PMGInstances, matcher)
	filterState.filtered.PMGInstances = filteredPMG
	filterState.includeResourceID(pmgIDs...)

	filteredHosts, hostIDs := collectPatrolScopedHosts(snap.Hosts, matcher)
	filterState.filtered.Hosts = filteredHosts
	filterState.includeResourceID(hostIDs...)

	filteredK8sClusters, k8sClusterIDs := collectPatrolScopedKubernetesClusters(snap.KubernetesClusters, matcher)
	filterState.filtered.KubernetesClusters = filteredK8sClusters
	filterState.includeResourceID(k8sClusterIDs...)

	copyScopedPatrolMetadata(&filterState.filtered, snap, filterState.includedResourceIDs, filterState.includedGuestVMIDs)

	return filterState.filtered.withDerivedProviders()
}

// GetStatus returns the current patrol status
func (p *PatrolService) GetStatus() PatrolStatus {
	p.mu.RLock()
	defer p.mu.RUnlock()

	interval := p.config.GetInterval()
	intervalMs := int64(interval / time.Millisecond)

	// "Running" means an analysis is currently in progress, not just the service loop
	analysisInProgress := p.runInProgress

	status := PatrolStatus{
		RuntimeState:     PatrolRuntimeStateActive,
		Running:          analysisInProgress,
		Enabled:          p.config.Enabled,
		LastDuration:     p.lastDuration,
		ResourcesChecked: p.resourcesChecked,
		FindingsCount:    len(p.findings.GetActive(FindingSeverityInfo)),
		ErrorCount:       p.errorCount,
		IntervalMs:       intervalMs,
		BlockedReason:    p.lastBlockedReason,
	}
	if p.triggerManager != nil {
		triggerStatus := p.triggerManager.GetStatus()
		status.TriggerStatus = &triggerStatus
	}

	if p.quickstartCredits != nil {
		status.QuickstartCreditsRemaining = p.quickstartCredits.CreditsRemaining()
		status.QuickstartCreditsTotal = p.quickstartCredits.CreditsTotal()
		status.UsingQuickstart = p.aiService != nil && p.aiService.IsUsingQuickstart()
	}

	quickstartReason := ""
	if p.aiService != nil {
		quickstartReason = strings.TrimSpace(p.aiService.QuickstartBlockedReason())
	}
	if quickstartReason == "" &&
		p.config.Enabled &&
		p.quickstartCredits != nil &&
		!p.quickstartCredits.HasBYOK() &&
		!p.quickstartCredits.HasCredits() {
		quickstartReason = patrolQuickstartCreditsExhaustedReason
	}
	if quickstartReason != "" {
		status.BlockedReason = quickstartReason
	} else if strings.TrimSpace(status.BlockedReason) == patrolQuickstartCreditsExhaustedReason ||
		strings.TrimSpace(status.BlockedReason) == patrolQuickstartActivationRequiredReason ||
		strings.TrimSpace(status.BlockedReason) == patrolQuickstartUnavailableReason {
		status.BlockedReason = ""
	}

	switch {
	case analysisInProgress:
		status.RuntimeState = PatrolRuntimeStateRunning
	case !p.config.Enabled:
		status.RuntimeState = PatrolRuntimeStateDisabled
	case strings.TrimSpace(status.BlockedReason) != "":
		status.RuntimeState = PatrolRuntimeStateBlocked
	default:
		status.RuntimeState = PatrolRuntimeStateActive
	}

	if !p.lastFullPatrol.IsZero() {
		status.LastPatrolAt = &p.lastFullPatrol
	}
	if !p.lastActivity.IsZero() {
		status.LastActivityAt = &p.lastActivity
	}
	if strings.TrimSpace(status.BlockedReason) != "" && !p.lastBlockedAt.IsZero() {
		status.BlockedAt = &p.lastBlockedAt
	}

	// Use the tracked next scheduled time (accounts for ticker resets on interval changes)
	if p.config.Enabled && interval > 0 && !p.nextScheduledAt.IsZero() {
		next := p.nextScheduledAt
		status.NextPatrolAt = &next
	}

	summary := p.findings.GetSummary()
	status.Healthy = summary.IsHealthy()
	if status.RuntimeState == PatrolRuntimeStateBlocked {
		status.Healthy = false
	}

	return status
}

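// Illustrative only: a hypothetical caller reacting to the runtime state
// derived above. The constants and fields are this package's; the handler
// shape is an assumption, not part of Pulse.
//
//	st := patrol.GetStatus()
//	switch st.RuntimeState {
//	case PatrolRuntimeStateRunning:
//		// an analysis is in flight right now
//	case PatrolRuntimeStateBlocked:
//		log.Warn().Str("reason", st.BlockedReason).Msg("patrol blocked")
//	case PatrolRuntimeStateDisabled:
//		// patrol is switched off in config
//	}
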
// SubscribeToStream returns a channel that will receive streaming patrol events
func (p *PatrolService) SubscribeToStream() chan PatrolStreamEvent {
	return p.SubscribeToStreamFrom(0)
}

// SubscribeToStreamFrom subscribes a client to patrol streaming events and optionally replays
// events with Seq > lastSeq (best-effort). This allows SSE clients to resume after disconnects
// using the Last-Event-ID header.
func (p *PatrolService) SubscribeToStreamFrom(lastSeq int64) chan PatrolStreamEvent {
	ch := make(chan PatrolStreamEvent, 100) // Buffered to prevent blocking
	sub := &streamSubscriber{ch: ch}
	replayedCount := 0
	snapshotReasons := make([]string, 0, 2)
	snapshotReasonSeen := make(map[string]struct{}, 2)

	p.streamMu.Lock()
	p.streamSubscribers[ch] = sub

	trySendSnapshot := func(reason string) bool {
		if _, seen := snapshotReasonSeen[reason]; seen {
			return true
		}
		snap := p.makeSnapshotLocked(reason)
		select {
		case ch <- snap:
			snapshotReasonSeen[reason] = struct{}{}
			snapshotReasons = append(snapshotReasons, reason)
			return true
		default:
			return false
		}
	}

	bufferStart, bufferEnd := p.streamBufferWindowLocked()
	// If the client is behind the buffered window, proactively emit a snapshot that
	// advertises truncation. (We may still replay what we have.)
	if lastSeq > 0 && bufferStart > 0 && lastSeq < bufferStart && p.streamPhase != "idle" {
		trySendSnapshot("buffer_rotated")
	}

	// Best-effort replay / snapshot:
	// - If client provides lastSeq, replay newer buffered events (Seq > lastSeq).
	// - If lastSeq is stale/ahead (e.g. from a different run), send a snapshot so UI can resync.
	// - If no lastSeq, send a snapshot (late-joiner).
	if lastSeq > 0 && len(p.streamEvents) > 0 {
		events := p.streamEventsSinceLocked(lastSeq)
	replayLoop:
		for _, ev := range events {
			select {
			case ch <- ev:
				replayedCount++
			default:
				// If subscriber can't catch up, stop replaying and let it receive live events.
				break replayLoop
			}
		}
	}
	if replayedCount == 0 && len(snapshotReasons) == 0 && lastSeq > 0 && p.streamPhase != "idle" {
		// lastSeq is likely stale (ahead of this run) or we're missing buffered events.
		// Provide a snapshot to allow the UI to resync.
		reason := "stale_last_event_id"
		if bufferEnd > 0 && lastSeq > bufferEnd {
			// Client is ahead of everything buffered (likely a Last-Event-ID from a
			// different run); keep the default reason, stated explicitly for clarity.
			reason = "stale_last_event_id"
		} else if bufferStart > 0 && lastSeq < bufferStart {
			reason = "buffer_rotated"
		}
		trySendSnapshot(reason)
	}
	if lastSeq == 0 && p.streamPhase != "idle" {
		trySendSnapshot("late_joiner")
	}
	p.streamMu.Unlock()

	metrics := GetPatrolMetrics()
	if replayedCount > 0 {
		metrics.RecordStreamReplay(replayedCount)
		log.Debug().Int64("last_seq", lastSeq).Int("replayed_events", replayedCount).Msg("patrol stream replayed buffered events")
	}
	for _, reason := range snapshotReasons {
		metrics.RecordStreamSnapshot(reason)
		log.Debug().Int64("last_seq", lastSeq).Str("resync_reason", reason).Msg("patrol stream sent synthetic snapshot")
	}
	if lastSeq > 0 && replayedCount == 0 && len(snapshotReasons) == 0 {
		metrics.RecordStreamMiss()
		log.Debug().Int64("last_seq", lastSeq).Msg("patrol stream resume had no replay or snapshot")
	}

	return ch
}

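// Illustrative only: a minimal SSE-style consumer sketch. writeSSE and
// lastEventID are assumed helpers, not part of this package; emitting ev.Seq
// as the SSE id is what lets clients resume via Last-Event-ID.
//
//	ch := p.SubscribeToStreamFrom(lastEventID(r))
//	defer p.UnsubscribeFromStream(ch)
//	for ev := range ch {
//		writeSSE(w, ev)
//	}
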
// UnsubscribeFromStream removes a subscriber
func (p *PatrolService) UnsubscribeFromStream(ch chan PatrolStreamEvent) {
	p.streamMu.Lock()
	sub, exists := p.streamSubscribers[ch]
	delete(p.streamSubscribers, ch)
	p.streamMu.Unlock()

	// Use atomic CAS to ensure exactly one goroutine closes the channel,
	// even if broadcast and unsubscribe race.
	if exists && sub.closed.CompareAndSwap(false, true) {
		close(ch)
	}
}

// broadcast sends an event to all subscribers.
// Subscribers with full channels are automatically removed to prevent memory leaks.
func (p *PatrolService) broadcast(event PatrolStreamEvent) {
	p.streamMu.Lock()
	defer p.streamMu.Unlock()

	// Track a couple pieces of best-effort state for snapshots/resync.
	switch event.Type {
	case "tool_start":
		if event.ToolName != "" {
			p.streamCurrentTool = event.ToolName
		}
	case "tool_end":
		p.streamCurrentTool = ""
	}

	// Bound payload sizes so streaming and replay buffers can't balloon due to a single tool
	// output or oversized content chunk.
	event = truncateStreamEvent(event)

	// Decorate once so every subscriber sees identical meta.
	event = p.decorateStreamEventLocked(event)
	p.appendStreamEventLocked(event)

	var staleChannels []chan PatrolStreamEvent
	dropReasons := make(map[chan PatrolStreamEvent]string)
	for ch, sub := range p.streamSubscribers {
		if sub == nil || sub.closed.Load() {
			staleChannels = append(staleChannels, ch)
			dropReasons[ch] = "closed"
			continue
		}
		select {
		case ch <- event:
			// Successfully sent
			sub.fullCount = 0
		default:
			// Channel full. Tolerate bursts, but disconnect subscribers that are
			// consistently unable to receive events (likely dead/slow clients).
			sub.fullCount++
			if sub.fullCount >= 25 {
				staleChannels = append(staleChannels, ch)
				dropReasons[ch] = "backpressure"
			}
		}
	}

	// Clean up stale subscribers using atomic CAS for safe close
	for _, ch := range staleChannels {
		sub := p.streamSubscribers[ch]
		delete(p.streamSubscribers, ch)
		reason := dropReasons[ch]
		GetPatrolMetrics().RecordStreamSubscriberDrop(reason)
		log.Debug().Str("reason", reason).Msg("patrol stream subscriber dropped")
		if sub != nil && sub.closed.CompareAndSwap(false, true) {
			close(ch)
		}
	}
}

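// Backpressure in numbers: each subscriber channel buffers 100 events (see
// SubscribeToStreamFrom), and a subscriber is only dropped after 25
// consecutive failed sends above, so a client must fall well behind a
// sustained burst before it is disconnected.
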
// resetStreamForRun resets stream state for a new run so late-joiners don't see stale output.
// This should only be called for runs that will actually stream events (NoStream=false).
func (p *PatrolService) resetStreamForRun(runID string) {
	p.streamMu.Lock()
	p.streamRunID = runID
	p.streamSeq = 0
	p.streamPhase = "idle"
	p.streamCurrentTool = ""
	p.currentOutput.Reset()
	p.streamEvents = nil
	p.streamMu.Unlock()
}

func (p *PatrolService) decorateStreamEventLocked(event PatrolStreamEvent) PatrolStreamEvent {
	if event.RunID == "" {
		event.RunID = p.streamRunID
	}
	if event.Seq == 0 {
		p.streamSeq++
		event.Seq = p.streamSeq
	}
	if event.TsMs == 0 {
		event.TsMs = time.Now().UnixMilli()
	}
	return event
}

const patrolStreamReplayBufferSize = 200
const patrolStreamMaxEventFieldBytes = 8 * 1024

func truncateStreamEvent(event PatrolStreamEvent) PatrolStreamEvent {
	event.Content = truncateStreamField(event.Content, patrolStreamMaxEventFieldBytes)
	event.ToolInput = truncateStreamField(event.ToolInput, patrolStreamMaxEventFieldBytes)
	event.ToolRawInput = truncateStreamField(event.ToolRawInput, patrolStreamMaxEventFieldBytes)
	event.ToolOutput = truncateStreamField(event.ToolOutput, patrolStreamMaxEventFieldBytes)
	return event
}

func truncateStreamField(s string, max int) string {
	if max <= 0 || len(s) <= max {
		return s
	}
	const suffix = "...[truncated]"
	if max <= len(suffix) {
		return s[:max]
	}
	return s[:max-len(suffix)] + suffix
}

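// Worked example of the truncation rule above: with max=20 and a 25-byte
// input, the 14-byte suffix "...[truncated]" leaves room for 6 payload bytes,
// so the result is exactly 20 bytes:
//
//	truncateStreamField("0123456789012345678901234", 20) // "012345...[truncated]"
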
func (p *PatrolService) appendStreamEventLocked(event PatrolStreamEvent) {
	// Keep a bounded buffer for Last-Event-ID replay (best-effort).
	p.streamEvents = append(p.streamEvents, event)
	if len(p.streamEvents) > patrolStreamReplayBufferSize {
		p.streamEvents = p.streamEvents[len(p.streamEvents)-patrolStreamReplayBufferSize:]
	}
}

func (p *PatrolService) streamEventsSinceLocked(lastSeq int64) []PatrolStreamEvent {
	// Seq is monotonic within a run; we reset buffer on new run.
	for i := len(p.streamEvents) - 1; i >= 0; i-- {
		if p.streamEvents[i].Seq <= lastSeq {
			// Return events after i
			out := make([]PatrolStreamEvent, len(p.streamEvents)-(i+1))
			copy(out, p.streamEvents[i+1:])
			return out
		}
	}
	// All buffered events are newer
	out := make([]PatrolStreamEvent, len(p.streamEvents))
	copy(out, p.streamEvents)
	return out
}

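// Replay window example: with buffered Seqs [5, 6, 7, 8],
// streamEventsSinceLocked(6) returns the events with Seq 7 and 8, while
// streamEventsSinceLocked(2) returns all four buffered events.
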
func (p *PatrolService) streamBufferWindowLocked() (start, end int64) {
	if len(p.streamEvents) == 0 {
		return 0, 0
	}
	return p.streamEvents[0].Seq, p.streamEvents[len(p.streamEvents)-1].Seq
}

func (p *PatrolService) makeSnapshotLocked(reason string) PatrolStreamEvent {
	start, end := p.streamBufferWindowLocked()
	phase := p.streamPhase
	if phase == "idle" {
		phase = ""
	}
	tr := p.currentOutput.Truncated()
	var trPtr *bool
	if tr {
		trPtr = &tr
	}
	// Snapshot is synthetic and should not advance seq; use the most recent real event seq
	// so clients can resume from a meaningful Last-Event-ID.
	seq := end
	return PatrolStreamEvent{
		Type:             "snapshot",
		RunID:            p.streamRunID,
		Seq:              seq,
		TsMs:             time.Now().UnixMilli(),
		ResyncReason:     reason,
		BufferStart:      start,
		BufferEnd:        end,
		ContentTruncated: trPtr,
		Phase:            phase,
		Content:          p.currentOutput.String(),
		ToolName:         p.streamCurrentTool,
	}
}

// appendStreamContent adds content to the current output and broadcasts it
func (p *PatrolService) appendStreamContent(content string) {
	p.streamMu.Lock()
	p.currentOutput.WriteString(content)
	p.streamMu.Unlock()

	p.broadcast(PatrolStreamEvent{
		Type:    "content",
		Content: content,
	})
}

// setStreamPhase updates the current phase and broadcasts it to all subscribers.
// The frontend only updates its phase display when it receives a 'phase' event,
// so we must broadcast phase changes to keep the UI in sync.
func (p *PatrolService) setStreamPhase(phase string) {
	p.streamMu.Lock()
	oldPhase := p.streamPhase
	p.streamPhase = phase
	p.streamMu.Unlock()

	// Broadcast phase change (except for idle, which just clears state).
	// This ensures late joiners and continuous watchers see the current phase.
	if phase != "idle" && phase != oldPhase {
		p.broadcast(PatrolStreamEvent{
			Type:  "phase",
			Phase: phase,
		})
	}
}

// GetCurrentStreamOutput returns the current buffered output and phase (for late joiners)
func (p *PatrolService) GetCurrentStreamOutput() (string, string) {
	p.streamMu.RLock()
	defer p.streamMu.RUnlock()
	return p.currentOutput.String(), p.streamPhase
}

// reviewAndResolveAlertsState uses AI to review active alerts and resolve those where the issue is fixed.
// This is the core of autonomous alert management - the AI looks at each alert, checks current state,
// and determines if the underlying issue has been resolved.
func (p *PatrolService) reviewAndResolveAlertsState(ctx context.Context, state patrolRuntimeState, llmAllowed bool, executionID string) int {
	p.mu.RLock()
	resolver := p.alertResolver
	aiService := p.aiService
	p.mu.RUnlock()

	if resolver == nil {
		return 0
	}

	activeAlerts := resolver.GetActiveAlerts()
	if len(activeAlerts) == 0 {
		return 0
	}

	// Only review alerts that have been active for at least 10 minutes.
	// This avoids thrashing on transient alerts.
	minAge := 10 * time.Minute
	var alertsToReview []AlertInfo
	for _, alert := range activeAlerts {
		if time.Since(alert.StartTime) >= minAge {
			alertsToReview = append(alertsToReview, alert)
		}
	}

	if len(alertsToReview) == 0 {
		return 0
	}

	log.Info().
		Int("total_active", len(activeAlerts)).
		Int("to_review", len(alertsToReview)).
		Msg("AI Patrol: Reviewing alerts for auto-resolution")

	resolvedCount := 0

	// Pass nil for aiService if LLM is not allowed (use heuristic checks only).
	aiSvc := aiService
	if !llmAllowed {
		aiSvc = nil
	}

	for _, alert := range alertsToReview {
		shouldResolve, reason := p.shouldResolveAlertState(ctx, alert, state, aiSvc, executionID)
		if shouldResolve {
			if resolver.ResolveAlert(alert.ID) {
				resolvedCount++
				log.Info().
					Str("alertID", alert.ID).
					Str("resource", alert.ResourceName).
					Str("reason", reason).
					Dur("age", time.Since(alert.StartTime)).
					Msg("AI Patrol: Auto-resolved alert - issue no longer detected")
			}
		}
	}

	if resolvedCount > 0 {
		log.Info().
			Int("resolved", resolvedCount).
			Msg("AI Patrol: Completed alert review")
	}

	return resolvedCount
}

// shouldResolveAlertState determines if an alert should be auto-resolved based on current state.
// Returns (shouldResolve, reason).
func (p *PatrolService) shouldResolveAlertState(ctx context.Context, alert AlertInfo, snap patrolRuntimeState, aiService *Service, executionID string) (bool, string) {
	// First, try smart heuristic checks based on alert type
	switch alert.Type {
	case "usage": // Storage usage alert
		resource := lookupPatrolAlertResourceState(alert, snap)
		if resource.found {
			if resource.disk < alert.Threshold*0.95 { // 5% margin below threshold
				return true, fmt.Sprintf("storage usage dropped from %.1f%% to %.1f%% (threshold: %.1f%%)",
					alert.Value, resource.disk, alert.Threshold)
			}
			return false, ""
		}
		// Storage not found in current snapshot - might have been removed.
		// Resolve after 24 hours if resource is gone.
		if time.Since(alert.StartTime) > 24*time.Hour {
			return true, "resource no longer present in infrastructure"
		}

	case "cpu", "memory": // Resource utilization alerts
		// Check if this is a node, VM, container, or docker container
		currentValue := p.getCurrentMetricValueState(alert, snap)
		if currentValue >= 0 && currentValue < alert.Threshold*0.95 {
			return true, fmt.Sprintf("%s dropped from %.1f%% to %.1f%% (threshold: %.1f%%)",
				alert.Type, alert.Value, currentValue, alert.Threshold)
		}

	case "offline", "stopped", "docker-offline":
		// Check if the resource is now online
		if p.isResourceOnlineState(alert, snap) {
			return true, "resource is now online/running"
		}
	}

	// For complex cases or when heuristics don't apply, use AI judgment if available
	if aiService != nil && aiService.IsEnabled() {
		return p.askAIAboutAlertState(ctx, alert, snap, aiService, executionID)
	}

	return false, ""
}

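// Hysteresis example for the 0.95 margin above: an alert with an 80%
// threshold only auto-resolves once the metric drops below 80 * 0.95 = 76%,
// so values hovering just under the threshold don't cause resolve/refire churn.
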
// patrolAlertResourceState captures the slice of a resource's current state
// needed to judge whether an alert still applies.
type patrolAlertResourceState struct {
	resourceType string
	platform     string
	name         string
	status       string
	cpu          float64
	memory       float64
	disk         float64
	found        bool
}

func patrolAlertNameMatches(alert AlertInfo, ids ...string) bool {
	for _, id := range ids {
		if strings.TrimSpace(id) == "" {
			continue
		}
		if id == alert.ResourceID || id == alert.ResourceName {
			return true
		}
	}
	return false
}

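// Example: patrolAlertNameMatches(alert, "", "node-1") is true when either
// alert.ResourceID or alert.ResourceName equals "node-1"; blank candidate IDs
// are skipped rather than treated as wildcards.
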
func patrolAlertLookupType(alert AlertInfo) string {
	resourceType := canonicalizeAICompatibilityResourceType(alert.ResourceType)
	if alert.Type == "usage" && (resourceType == "" || strings.EqualFold(resourceType, "usage")) {
		return "storage"
	}
	return resourceType
}

// lookupPatrolAlertResourceState resolves the alerting resource in the current
// runtime state. alert is passed by value, so normalizing its ResourceType
// here does not affect the caller.
func lookupPatrolAlertResourceState(alert AlertInfo, snap patrolRuntimeState) patrolAlertResourceState {
	alert.ResourceType = patrolAlertLookupType(alert)
	if alert.ResourceType == "app-container" {
		if resource, ok := patrolLookupAppContainerAlertResourceState(alert, snap); ok {
			return resource
		}
	}
	switch alert.ResourceType {
	case "storage":
		if resource, ok := patrolLookupStorageAlertResourceState(alert, snap); ok {
			return resource
		}
	case "node":
		if resource, ok := patrolLookupNodeAlertResourceState(alert, snap); ok {
			return resource
		}
	case "vm":
		if resource, ok := patrolLookupGuestAlertResourceState(alert, snap, "VM", "vm"); ok {
			return resource
		}
	case "system-container":
		if resource, ok := patrolLookupGuestAlertResourceState(alert, snap, "Container", "system-container"); ok {
			return resource
		}
	case "agent":
		if resource, ok := patrolLookupHostAlertResourceState(alert, snap); ok {
			return resource
		}
	}
	return patrolAlertResourceState{}
}

func patrolLookupAppContainerAlertResourceState(alert AlertInfo, snap patrolRuntimeState) (patrolAlertResourceState, bool) {
	for _, container := range patrolAppContainerRows(snap, nil) {
		if !patrolAlertNameMatches(alert, container.id, container.name) {
			continue
		}
		return patrolAlertResourceState{
			resourceType: "app-container",
			name:         container.name,
			status:       container.status,
			cpu:          container.cpu,
			memory:       container.memory,
			found:        true,
		}, true
	}
	return patrolAlertResourceState{}, false
}

func patrolLookupStorageAlertResourceState(alert AlertInfo, snap patrolRuntimeState) (patrolAlertResourceState, bool) {
	for _, storage := range patrolStoragePoolRows(snap, nil) {
		if !patrolAlertNameMatches(alert, storage.id, storage.name) {
			continue
		}
		return patrolAlertResourceState{
			resourceType: "storage",
			name:         storage.name,
			status:       storage.status,
			disk:         storage.usage,
			found:        true,
		}, true
	}
	return patrolAlertResourceState{}, false
}

func patrolLookupNodeAlertResourceState(alert AlertInfo, snap patrolRuntimeState) (patrolAlertResourceState, bool) {
	for _, node := range patrolNodeInventoryRows(snap, nil) {
		if !patrolAlertNameMatches(alert, node.id, node.name) {
			continue
		}
		return patrolAlertResourceState{
			resourceType: "node",
			name:         node.name,
			status:       node.status,
			cpu:          node.cpu,
			memory:       node.mem,
			found:        true,
		}, true
	}
	return patrolAlertResourceState{}, false
}

func patrolLookupGuestAlertResourceState(alert AlertInfo, snap patrolRuntimeState, guestType, resourceType string) (patrolAlertResourceState, bool) {
	for _, guest := range patrolGuestInventoryRows(snap, nil, nil) {
		if guest.gType != guestType || !patrolAlertNameMatches(alert, guest.id, guest.name) {
			continue
		}
		return patrolAlertResourceState{
			resourceType: resourceType,
			name:         guest.name,
			status:       guest.status,
			cpu:          guest.cpu,
			memory:       guest.mem,
			found:        true,
		}, true
	}
	return patrolAlertResourceState{}, false
}

type patrolHostAlertRow struct {
	id, name, hostname, status, platform string
	cpu, memory                          float64
}

func patrolHostAlertRows(snap patrolRuntimeState) []patrolHostAlertRow {
	if snap.readState != nil {
		hosts := snap.readState.Hosts()
		rows := make([]patrolHostAlertRow, 0, len(hosts))
		for _, host := range hosts {
			rows = append(rows, patrolHostAlertRow{
				id:       host.ID(),
				name:     host.Name(),
				hostname: host.Hostname(),
				status:   string(host.Status()),
				platform: host.Platform(),
				cpu:      host.CPUPercent(),
				memory:   host.MemoryPercent(),
			})
		}
		return rows
	}

	rows := make([]patrolHostAlertRow, 0, len(snap.Hosts))
	for _, host := range snap.Hosts {
		rows = append(rows, patrolHostAlertRow{
			id:       host.ID,
			name:     host.DisplayName,
			hostname: host.Hostname,
			status:   host.Status,
			platform: host.Platform,
			cpu:      host.CPUUsage,
			memory:   host.Memory.Usage,
		})
	}
	return rows
}

// patrolHostResourceType maps a host platform to its alert resource type.
// Every host platform currently resolves to "agent"; the parameter is kept so
// platform-specific types can be added without changing call sites.
func patrolHostResourceType(platform string) string {
	return "agent"
}

func patrolLookupHostAlertResourceState(alert AlertInfo, snap patrolRuntimeState) (patrolAlertResourceState, bool) {
	for _, host := range patrolHostAlertRows(snap) {
		if !patrolAlertNameMatches(alert, host.id, host.name, host.hostname) {
			continue
		}
		name := host.hostname
		if name == "" {
			name = host.name
		}
		return patrolAlertResourceState{
			resourceType: patrolHostResourceType(host.platform),
			platform:     host.platform,
			name:         name,
			status:       host.status,
			cpu:          host.cpu,
			memory:       host.memory,
			found:        true,
		}, true
	}
	return patrolAlertResourceState{}, false
}

// getCurrentMetricValueState gets the current value of the metric that
// triggered the alert, or -1 if the resource or metric is unavailable.
func (p *PatrolService) getCurrentMetricValueState(alert AlertInfo, snap patrolRuntimeState) float64 {
	resource := lookupPatrolAlertResourceState(alert, snap)
	if !resource.found {
		return -1
	}
	switch alert.Type {
	case "cpu":
		return resource.cpu
	case "memory":
		return resource.memory
	default:
		if resource.resourceType == "storage" {
			return resource.disk
		}
	}
	return -1
}

// isResourceOnlineState reports whether a resource that triggered an offline alert is now online
func (p *PatrolService) isResourceOnlineState(alert AlertInfo, snap patrolRuntimeState) bool {
	resource := lookupPatrolAlertResourceState(alert, snap)
	if !resource.found {
		return false
	}
	switch resource.resourceType {
	case "node", "agent":
		return resource.status == "online"
	case "vm", "system-container", "app-container":
		return resource.status == "running" || resource.status == string(unifiedresources.StatusOnline)
	default:
		return false
	}
}

// askAIAboutAlertState uses the AI to determine if an alert should be resolved
func (p *PatrolService) askAIAboutAlertState(ctx context.Context, alert AlertInfo, snap patrolRuntimeState, aiService *Service, executionID string) (bool, string) {
	alertType := patrolAlertLookupType(alert)
	// Build a focused prompt for the AI
	prompt := fmt.Sprintf(`Review this alert and determine if it should be auto-resolved based on current state.

ALERT:
- ID: %s
- Type: %s
- Resource: %s (%s)
- Message: %s
- Value when triggered: %.1f
- Threshold: %.1f
- Active for: %s

CURRENT STATE OF THIS RESOURCE:
%s

Should this alert be RESOLVED because the underlying issue is fixed?
Respond with ONLY one of:
- RESOLVE: <brief reason>
- KEEP: <brief reason>`,
		alert.ID, alert.Type, alert.ResourceName, alertType,
		alert.Message, alert.Value, alert.Threshold, alert.Duration,
		p.getResourceCurrentStateState(alert, snap))

	// Use a quick, low-cost AI call
	response, err := aiService.QuickAnalysis(ctx, QuickAnalysisRequest{
		Prompt:      prompt,
		ExecutionID: executionID,
		UseCase:     "patrol",
	})
	if err != nil {
		log.Debug().Err(err).Str("alertID", alert.ID).Msg("AI Patrol: Failed to get AI judgment on alert")
		return false, ""
	}

	response = strings.TrimSpace(response)
	if strings.HasPrefix(strings.ToUpper(response), "RESOLVE:") {
		// Strip the prefix case-insensitively so a lowercase "resolve:" reply
		// still yields a clean reason instead of echoing the prefix back.
		reason := strings.TrimSpace(response[len("RESOLVE:"):])
		if reason == "" {
			reason = "AI confirmed issue resolved"
		}
		return true, "Patrol: " + reason
	}

	return false, ""
}

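// Example model replies and how the parser above treats them:
//
//	"RESOLVE: usage back under threshold" -> (true, "Patrol: usage back under threshold")
//	"resolve: usage back under threshold" -> (true, "Patrol: usage back under threshold")
//	"KEEP: still climbing"                -> (false, "")
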
// getResourceCurrentStateState returns a description of the resource's current state
func (p *PatrolService) getResourceCurrentStateState(alert AlertInfo, snap patrolRuntimeState) string {
	resource := lookupPatrolAlertResourceState(alert, snap)
	if !resource.found {
		switch patrolAlertLookupType(alert) {
		case "storage":
			return "Storage not found in current state (may have been removed)"
		case "node":
			return "Node not found in current state"
		case "agent":
			return "Agent host not found in current state"
		case "vm":
			return "VM not found in current state"
		case "system-container":
			return "Container not found in current state"
		case "app-container":
			return "Docker container not found in current state"
		default:
			return "Resource state unknown"
		}
	}
	switch resource.resourceType {
	case "storage":
		return fmt.Sprintf("Storage '%s': %.1f%% used, status: %s", resource.name, resource.disk, resource.status)
	case "node":
		return fmt.Sprintf("Node '%s': CPU %.1f%%, Memory %.1f%%, Status: %s",
			resource.name, resource.cpu, resource.memory, resource.status)
	case "agent":
		if strings.EqualFold(strings.TrimSpace(resource.platform), "truenas") {
			return fmt.Sprintf("TrueNAS system '%s': CPU %.1f%%, Memory %.1f%%, Status: %s",
				resource.name, resource.cpu, resource.memory, resource.status)
		}
		return fmt.Sprintf("Agent host '%s': CPU %.1f%%, Memory %.1f%%, Status: %s",
			resource.name, resource.cpu, resource.memory, resource.status)
	case "vm":
		return fmt.Sprintf("VM '%s': CPU %.1f%%, Memory %.1f%%, Status: %s",
			resource.name, resource.cpu, resource.memory, resource.status)
	case "system-container":
		return fmt.Sprintf("Container '%s': CPU %.1f%%, Memory %.1f%%, Status: %s",
			resource.name, resource.cpu, resource.memory, resource.status)
	case "app-container":
		return fmt.Sprintf("Docker container '%s': CPU %.1f%%, Memory %.1f%%, State: %s",
			resource.name, resource.cpu, resource.memory, resource.status)
	default:
		return "Resource state unknown"
	}
}

// TriggerPatrolForAlert triggers an immediate patrol for a specific alert
func (p *PatrolService) TriggerPatrolForAlert(alert *alerts.Alert) {
	if alert == nil {
		return
	}

	p.mu.RLock()
	triggerManager := p.triggerManager
	eventTriggerConfig := p.eventTriggerConfig
	p.mu.RUnlock()

	// Gate: skip if alert-driven scoped patrols are disabled
	if !eventTriggerConfig.AlertTriggersEnabled {
		log.Debug().
			Str("alert_identifier", alert.ID).
			Msg("alert-triggered patrol skipped: alert trigger source disabled")
		return
	}

	resourceType := inferResourceType(alert.Type, alert.Metadata)

	if triggerManager != nil {
		scope := AlertTriggeredPatrolScope(alert.ID, alert.ResourceID, resourceType, alert.Type)
		if triggerManager.TriggerPatrol(scope) {
			log.Debug().Str("alert_identifier", alert.ID).Msg("queued alert-triggered patrol via trigger manager")
		} else {
			log.Warn().Str("alert_identifier", alert.ID).Msg("alert-triggered patrol rejected by trigger manager")
		}
		return
	}

	// Non-blocking send
	select {
	case p.adHocTrigger <- alert:
		log.Debug().Str("alert_identifier", alert.ID).Msg("queued ad-hoc patrol trigger")
	default:
		log.Warn().Str("alert_identifier", alert.ID).Msg("patrol trigger queue full, dropping trigger")
	}
}

func (p *PatrolService) tryStartRun(kind string) bool {
	p.mu.Lock()
	if p.runInProgress {
		// Detect stuck runs: if the current run has been going for >20 minutes,
		// force-clear the flag so a new run can proceed.
		if !p.runStartedAt.IsZero() && time.Since(p.runStartedAt) > 20*time.Minute {
			log.Warn().
				Str("kind", kind).
				Time("started_at", p.runStartedAt).
				Dur("elapsed", time.Since(p.runStartedAt)).
				Msg("AI Patrol: Previous run appears stuck (>20min), force-clearing runInProgress")
			p.runInProgress = false
			// Fall through to start new run
		} else {
			p.mu.Unlock()
			if kind == "scoped" {
				GetPatrolMetrics().RecordScopedDropped()
				log.Warn().Str("kind", kind).Msg("AI Patrol: Run already in progress, dropping scoped patrol")
			} else {
				log.Debug().Str("kind", kind).Msg("AI Patrol: Run already in progress, skipping")
			}
			return false
		}
	}
	p.runInProgress = true
	p.runStartedAt = time.Now()
	p.mu.Unlock()
	return true
}

func (p *PatrolService) endRun() {
	p.mu.Lock()
	p.runInProgress = false
	orch := p.investigationOrchestrator
	p.mu.Unlock()

	// Periodic investigation store maintenance after each run
	if maintainer, ok := orch.(InvestigationStoreMaintainer); ok {
		maintainer.CleanupInvestigationStore(24*time.Hour, 1000)
	}
}

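// Illustrative pairing of the two guards above (caller shape assumed):
//
//	if !p.tryStartRun("scoped") {
//		return // another run is active and not stuck
//	}
//	defer p.endRun()
//	// ... execute the patrol ...
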
// runTargetedPatrol executes a focused patrol for a specific alert
func (p *PatrolService) runTargetedPatrol(ctx context.Context, alert *alerts.Alert) {
	log.Info().
		Str("alert_identifier", alert.ID).
		Str("resource_id", alert.ResourceID).
		Msg("Running targeted AI patrol for alert")

	resourceType := inferResourceType(alert.Type, alert.Metadata)
	scope := AlertTriggeredPatrolScope(alert.ID, alert.ResourceID, resourceType, alert.Type)
	p.TriggerScopedPatrol(ctx, scope)
}

// joinParts joins string parts with commas and "and" for the last element
func joinParts(parts []string) string {
	if len(parts) == 0 {
		return ""
	}
	if len(parts) == 1 {
		return parts[0]
	}
	if len(parts) == 2 {
		return parts[0] + " and " + parts[1]
	}
	return strings.Join(parts[:len(parts)-1], ", ") + ", and " + parts[len(parts)-1]
}

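// Examples:
//
//	joinParts([]string{"a"})           // "a"
//	joinParts([]string{"a", "b"})      // "a and b"
//	joinParts([]string{"a", "b", "c"}) // "a, b, and c"
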
// generateFindingID creates a stable ID for a finding based on resource, category, and issue.
// All three components are included to ensure distinct issues on the same resource remain separate.
func generateFindingID(resourceID, category, issue string) string {
	hash := sha256.Sum256([]byte(fmt.Sprintf("%s:%s:%s", resourceID, category, issue)))
	return fmt.Sprintf("%x", hash[:8])
}
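
// Example call shape (argument values are illustrative; the ID is the first
// 8 bytes of the SHA-256 digest of "resourceID:category:issue", hex-encoded,
// so it is stable across runs for the same inputs):
//
//	id := generateFindingID("pve/node1", "storage", "local-lvm above 90%")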