// patrol_run.go implements the PatrolService runtime: Start/Stop lifecycle,
// the main patrol loop, scoped patrol execution, alert auto-resolution,
// live streaming to UI subscribers, and run history tracking.
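//
// Typical lifecycle (a sketch; service construction lives elsewhere in this
// package, so the variable below is illustrative):
//
//	var svc *PatrolService // obtained from this package's constructor
//	svc.Start(ctx)         // begin the background patrol loop
//	defer svc.Stop()       // cancel investigations and force-save findings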
package ai
import (
"context"
"crypto/sha256"
"fmt"
"strconv"
"strings"
"time"
"github.com/google/uuid"
"github.com/rcourtman/pulse-go-rewrite/internal/ai/circuit"
"github.com/rcourtman/pulse-go-rewrite/internal/ai/providers"
"github.com/rcourtman/pulse-go-rewrite/internal/alerts"
"github.com/rcourtman/pulse-go-rewrite/internal/models"
"github.com/rcourtman/pulse-go-rewrite/internal/unifiedresources"
"github.com/rs/zerolog/log"
)
// Patrol run lifecycle constants.
const (
initialPatrolStartDelay = 30 * time.Second // Delay before first patrol after startup
findingCleanupAge = 24 * time.Hour // Resolved findings older than this are purged
scopedPatrolRetryBackoff1 = 5 * time.Second // First retry backoff for dropped scoped patrols
scopedPatrolRetryBackoff2 = 15 * time.Second // Second retry backoff for dropped scoped patrols
scopedPatrolMaxRetries = 2 // Maximum re-queue attempts for dropped scoped patrols
scopedPatrolLogIDLimit = 10 // Maximum number of effective scope IDs to log inline
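// Together these mean a dropped scoped patrol is re-queued after 5s, then
// after 15s, and dropped permanently once scopedPatrolMaxRetries is exhausted.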
)
// Start begins the background patrol loop
func (p *PatrolService) Start(ctx context.Context) {
p.mu.Lock()
if p.running {
p.mu.Unlock()
return
}
p.running = true
p.stopCh = make(chan struct{})
p.configChanged = make(chan struct{}, 1) // Buffered to allow non-blocking send
p.mu.Unlock()
log.Info().
Dur("interval", p.config.GetInterval()).
Msg("Starting AI Patrol Service")
go p.patrolLoop(ctx)
}
// Stop stops the patrol service. It signals the patrol loop to exit, then
// waits up to 15 seconds for in-flight investigations to finish and
// force-saves findings/investigation state to disk.
func (p *PatrolService) Stop() {
p.mu.Lock()
if !p.running {
p.mu.Unlock()
return
}
p.running = false
close(p.stopCh)
orchestrator := p.investigationOrchestrator
findings := p.findings
p.mu.Unlock()
log.Info().Msg("stopping AI Patrol Service")
// Give investigations 15 seconds to finish (leaves headroom within server's 30s budget)
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 15*time.Second)
defer shutdownCancel()
// Signal orchestrator to cancel running investigations and persist state
if orchestrator != nil {
if err := orchestrator.Shutdown(shutdownCtx); err != nil {
log.Warn().Err(err).Msg("AI Patrol: Investigation orchestrator shutdown returned error")
}
}
// Wait for investigation goroutines tracked by PatrolService
done := make(chan struct{})
go func() {
p.investigationWg.Wait()
close(done)
}()
select {
case <-done:
// All investigation goroutines finished
case <-shutdownCtx.Done():
log.Warn().Msg("AI Patrol: Timed out waiting for investigation goroutines to finish")
}
// Force-save findings store
if findings != nil {
if err := findings.ForceSave(); err != nil {
log.Error().Err(err).Msg("AI Patrol: Failed to force-save findings during shutdown")
}
}
log.Info().Msg("AI Patrol Service stopped")
}
// patrolLoop is the main background loop
func (p *PatrolService) patrolLoop(ctx context.Context) {
// Seed recency from persisted run history so the API can return Patrol timing
// metadata immediately (before the first in-process patrol completes).
if history := p.GetRunHistory(10); len(history) > 0 {
lastActivity, lastFullPatrol := patrolRecencyFromHistory(history)
p.mu.Lock()
p.lastActivity = lastActivity
p.lastFullPatrol = lastFullPatrol
p.mu.Unlock()
}
// Run initial patrol shortly after startup, but only if one hasn't run recently
initialDelay := initialPatrolStartDelay
initialTimer := time.NewTimer(initialDelay)
defer initialTimer.Stop()
select {
case <-initialTimer.C:
// Check if a patrol ran recently (within last hour) to avoid wasting tokens on restarts
runHistory := p.GetRunHistory(10)
skipInitial := shouldSkipInitialFullPatrol(runHistory, time.Now())
if !skipInitial {
p.runPatrolWithTrigger(ctx, TriggerReasonStartup, nil)
}
case <-p.stopCh:
if !initialTimer.Stop() {
select {
case <-initialTimer.C:
default:
}
}
return
case <-ctx.Done():
if !initialTimer.Stop() {
select {
case <-initialTimer.C:
default:
}
}
return
}
p.mu.RLock()
interval := p.config.GetInterval()
configCh := p.configChanged
p.mu.RUnlock()
ticker := time.NewTicker(interval)
defer ticker.Stop()
p.mu.Lock()
p.nextScheduledAt = time.Now().Add(interval)
p.mu.Unlock()
for {
select {
case <-ticker.C:
// Update next scheduled time before the run starts — time.Now() closely
// matches the tick time here, and the ticker will fire again at roughly
// this moment + interval regardless of how long the run takes.
p.mu.Lock()
p.nextScheduledAt = time.Now().Add(interval)
p.mu.Unlock()
p.runPatrolWithTrigger(ctx, TriggerReasonScheduled, nil)
case alert := <-p.adHocTrigger:
// Run immediate targeted patrol for this alert
log.Info().Str("alert_identifier", alert.ID).Msg("patrol triggered by alert")
p.runTargetedPatrol(ctx, alert)
case <-configCh:
// Config changed - reset ticker with new interval
p.mu.RLock()
newInterval := p.config.GetInterval()
p.mu.RUnlock()
if newInterval != interval {
interval = newInterval
ticker.Reset(interval)
p.mu.Lock()
p.nextScheduledAt = time.Now().Add(interval)
p.mu.Unlock()
log.Info().
Dur("interval", interval).
Msg("Patrol ticker reset to new interval")
}
case <-p.stopCh:
return
case <-ctx.Done():
return
}
}
}
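// shouldSkipInitialFullPatrol reports whether a successful full patrol
// completed within the last hour, in which case the startup patrol is
// skipped to avoid spending tokens after a quick restart.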
func shouldSkipInitialFullPatrol(runHistory []PatrolRunRecord, now time.Time) bool {
for _, run := range runHistory {
if run.CompletedAt.IsZero() {
continue
}
timeSinceLastRun := now.Sub(run.CompletedAt)
if timeSinceLastRun >= 1*time.Hour {
continue
}
if isSuccessfulFullPatrolRun(run) {
log.Info().
Dur("time_since_last", timeSinceLastRun).
Str("run_type", run.Type).
Str("trigger_reason", run.TriggerReason).
Msg("AI Patrol: Skipping initial patrol - recent successful full run exists")
return true
}
}
return false
}
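// patrolRecencyFromHistory derives the most recent completion time of any
// run and of the most recent full patrol run from persisted history.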
func patrolRecencyFromHistory(runHistory []PatrolRunRecord) (time.Time, time.Time) {
var lastActivity time.Time
var lastFullPatrol time.Time
for _, run := range runHistory {
if run.CompletedAt.IsZero() {
continue
}
if lastActivity.IsZero() || run.CompletedAt.After(lastActivity) {
lastActivity = run.CompletedAt
}
if isFullPatrolRun(run) && (lastFullPatrol.IsZero() || run.CompletedAt.After(lastFullPatrol)) {
lastFullPatrol = run.CompletedAt
}
}
return lastActivity, lastFullPatrol
}
// runPatrol executes a scheduled patrol run
func (p *PatrolService) runPatrol(ctx context.Context) {
p.runPatrolWithTrigger(ctx, TriggerReasonScheduled, nil)
}
// runPatrolWithTrigger executes a patrol run with trigger context
func (p *PatrolService) runPatrolWithTrigger(ctx context.Context, trigger TriggerReason, scope *PatrolScope) {
p.mu.RLock()
cfg := p.config
breaker := p.circuitBreaker
p.mu.RUnlock()
if !cfg.Enabled {
return
}
if !p.tryStartRun("full") {
return
}
defer p.endRun()
// Check if circuit breaker allows LLM calls.
llmAllowed := breaker == nil || breaker.Allow()
if !llmAllowed {
log.Warn().Msg("AI Patrol: Circuit breaker is open (LLM calls blocked)")
}
start := time.Now()
runID := fmt.Sprintf("%d", start.UnixNano())
executionID := uuid.NewString()
patrolType := "patrol"
GetPatrolMetrics().RecordRun(string(trigger), "full")
log.Debug().Msg("AI Patrol: Starting patrol run")
// Track run statistics
var runStats struct {
resourceCount int
nodesChecked int
guestsChecked int
dockerChecked int
storageChecked int
hostsChecked int
trueNASChecked int
pbsChecked int
pmgChecked int
kubernetesChecked int
newFindings int
existingFindings int
rejectedFindings int
triageFlags int
triageSkippedLLM bool
findingIDs []string
errors int
lastAIError error // Preserve original error for circuit breaker categorization
aiAnalysis *AIAnalysisResult // Stores the AI's analysis for the run record
}
var newFindings []*Finding
// Get current state
if !p.hasPatrolRuntimeInputs() {
log.Warn().Msg("AI Patrol: No runtime state available")
return
}
state := p.currentPatrolRuntimeState()
// Helper to track findings
// Note: Only warning+ severity findings count toward newFindings since watch/info are filtered from UI
trackFinding := func(f *Finding) bool {
isNew := p.recordFinding(f)
if isNew {
// Only count warning+ findings as "new" for user-facing stats
if f.Severity == FindingSeverityWarning || f.Severity == FindingSeverityCritical {
runStats.newFindings++
newFindings = append(newFindings, f)
}
} else {
runStats.existingFindings++
}
// Only track warning+ severity finding IDs in the run record
if f.Severity == FindingSeverityWarning || f.Severity == FindingSeverityCritical {
runStats.findingIDs = append(runStats.findingIDs, f.ID)
}
return isNew
}
// Count resources for statistics from the patrol runtime state so scoped and
// unscoped runs share the same read-state-first semantics.
resourceCounts := patrolRuntimeCountResources(state)
if cfg.AnalyzeNodes {
runStats.nodesChecked = resourceCounts.nodes
}
if cfg.AnalyzeGuests {
runStats.guestsChecked = resourceCounts.guests
}
if cfg.AnalyzeDocker {
runStats.dockerChecked = resourceCounts.docker
}
if cfg.AnalyzeStorage {
runStats.storageChecked = resourceCounts.storage
}
if cfg.AnalyzePBS {
runStats.pbsChecked = resourceCounts.pbs
}
if cfg.AnalyzePMG {
runStats.pmgChecked = resourceCounts.pmg
}
if cfg.AnalyzeHosts {
runStats.hostsChecked = resourceCounts.hosts
runStats.trueNASChecked = resourceCounts.truenas
}
if cfg.AnalyzeKubernetes {
runStats.kubernetesChecked = resourceCounts.kubernetes
}
runStats.resourceCount = runStats.nodesChecked + runStats.guestsChecked +
runStats.dockerChecked + runStats.storageChecked + runStats.pbsChecked + runStats.pmgChecked + runStats.hostsChecked +
runStats.trueNASChecked +
runStats.kubernetesChecked
// Determine if we can run LLM analysis (requires AI service + circuit breaker not open)
aiServiceEnabled := p.aiService != nil && p.aiService.IsEnabled()
canRunLLM := aiServiceEnabled && llmAllowed
// Check quickstart credit status for messaging
p.mu.RLock()
qsMgr := p.quickstartCredits
p.mu.RUnlock()
// Patrol requires the LLM; if analysis cannot run, record a user-facing reason and skip the run.
if !canRunLLM {
reason := "AI not configured - set up a provider in Settings > Pulse Assistant"
if !aiServiceEnabled {
if p.aiService != nil && strings.TrimSpace(p.aiService.QuickstartBlockedReason()) != "" {
reason = p.aiService.QuickstartBlockedReason()
} else if qsMgr != nil && !qsMgr.HasBYOK() && !qsMgr.HasCredits() {
// Distinguish between exhausted credits and no AI configured.
reason = patrolQuickstartCreditsExhaustedReason
}
} else if !llmAllowed {
reason = "circuit breaker is open"
GetPatrolMetrics().RecordCircuitBlock()
}
p.setBlockedReason(reason)
log.Info().Str("reason", reason).Msg("AI Patrol: Skipping run - AI unavailable")
return
}
// Check if using quickstart credits — verify credits remain before starting
usingQuickstart := p.aiService != nil && p.aiService.IsUsingQuickstart()
if usingQuickstart && p.aiService != nil && strings.TrimSpace(p.aiService.QuickstartBlockedReason()) != "" {
p.setBlockedReason(p.aiService.QuickstartBlockedReason())
log.Info().Str("reason", p.aiService.QuickstartBlockedReason()).Msg("AI Patrol: Skipping run - quickstart unavailable")
return
}
if usingQuickstart && qsMgr != nil && !qsMgr.HasCredits() {
p.setBlockedReason(patrolQuickstartCreditsExhaustedReason)
log.Info().Msg("AI Patrol: Skipping run - quickstart credits exhausted")
return
}
{
p.clearBlockedReason()
// Ensure stream state is clean for this run before the first streamed event.
p.resetStreamForRun(runID)
// Run agentic AI analysis — the LLM uses tools to investigate and reports findings
aiResult, aiErr := p.runAIAnalysisState(ctx, state, scope, executionID)
if aiErr != nil {
log.Warn().Err(aiErr).Msg("AI Patrol: LLM analysis failed")
runStats.errors++
runStats.lastAIError = aiErr
// Create a finding to surface this error to the user
errMsg := aiErr.Error()
var title, description, recommendation string
if usingQuickstart && (strings.Contains(errMsg, "dial tcp") || strings.Contains(errMsg, "no such host") || strings.Contains(errMsg, "connection refused") || strings.Contains(errMsg, "i/o timeout")) {
title = "Pulse Patrol: Quickstart credits require internet"
description = "Pulse Patrol cannot reach the quickstart proxy server. Quickstart credits require an internet connection."
recommendation = patrolQuickstartUnavailableReason
} else if strings.Contains(errMsg, "Insufficient Balance") || strings.Contains(errMsg, "402") {
title = "Pulse Patrol: Insufficient API credits"
description = "Pulse Patrol cannot analyze your infrastructure because your provider account has insufficient credits."
recommendation = "Add credits to your provider account (DeepSeek, OpenAI, etc.) or switch to a different provider in Pulse Assistant settings."
} else if strings.Contains(errMsg, "401") || strings.Contains(errMsg, "Unauthorized") {
title = "Pulse Patrol: Invalid API key"
description = "Pulse Patrol cannot analyze your infrastructure because the API key is invalid or expired."
recommendation = "Check your API key in Pulse Assistant settings and verify it is correct."
} else if strings.Contains(errMsg, "rate limit") || strings.Contains(errMsg, "429") {
title = "Pulse Patrol: Rate limited"
description = "Pulse Patrol is being rate limited by your provider. Analysis will be retried on the next patrol run."
recommendation = "Wait for the rate limit to reset, or consider upgrading your API plan for higher limits."
} else {
title = "Pulse Patrol: Analysis failed"
description = fmt.Sprintf("Pulse Patrol encountered an error while analyzing your infrastructure: %s", errMsg)
recommendation = "Check your Pulse Assistant settings and API key. If the problem persists, check the logs for more details."
}
errorFinding := &Finding{
ID: generateFindingID("ai-service", "reliability", "ai-patrol-error"),
Key: "ai-patrol-error",
Severity: FindingSeverityWarning,
Category: "reliability",
ResourceID: "ai-service",
ResourceName: "Pulse Patrol Service",
ResourceType: "service",
Title: title,
Description: description,
Recommendation: recommendation,
Evidence: fmt.Sprintf("Error: %s", errMsg),
DetectedAt: time.Now(),
LastSeenAt: time.Now(),
}
trackFinding(errorFinding)
if usingQuickstart {
switch {
case providers.IsQuickstartCreditsExhausted(aiErr):
p.setBlockedReason(patrolQuickstartCreditsExhaustedReason)
case providers.IsQuickstartUnavailable(aiErr), quickstartBootstrapUnavailable(aiErr):
p.setBlockedReason(patrolQuickstartUnavailableReason)
}
}
} else if aiResult != nil {
runStats.aiAnalysis = aiResult
runStats.rejectedFindings = aiResult.RejectedFindings
runStats.triageFlags = aiResult.TriageFlags
runStats.triageSkippedLLM = aiResult.TriageSkippedLLM
// Auto-resolve previous patrol error finding if this run succeeded
errorFindingID := generateFindingID("ai-service", "reliability", "ai-patrol-error")
if existing := p.findings.Get(errorFindingID); existing != nil && !existing.IsResolved() {
p.findings.Resolve(errorFindingID, true) // auto-resolved
if resolver := p.unifiedFindingResolver; resolver != nil {
resolver(errorFindingID)
}
log.Info().Msg("AI Patrol: Auto-resolved previous patrol error finding after successful run")
}
// Findings are already recorded via patrol_report_finding tool calls.
// Track stats from the collected findings.
for _, f := range aiResult.Findings {
if f.Severity == FindingSeverityWarning || f.Severity == FindingSeverityCritical {
runStats.findingIDs = append(runStats.findingIDs, f.ID)
// Check if this finding was new by looking at the store
stored := p.findings.Get(f.ID)
if stored != nil && stored.TimesRaised <= 1 {
runStats.newFindings++
newFindings = append(newFindings, f)
} else {
runStats.existingFindings++
}
}
}
}
}
// Count resolved findings: LLM-resolved (via tool) + auto-reconciled stale findings.
var resolvedCount int
if runStats.aiAnalysis != nil {
resolvedCount = len(runStats.aiAnalysis.ResolvedIDs)
// Auto-resolve stale findings: active findings that were presented to the LLM
// in seed context but were neither re-reported nor explicitly resolved.
// Only runs after successful full patrols (not scoped).
autoResolved := p.reconcileStaleFindings(
runStats.aiAnalysis.ReportedIDs,
runStats.aiAnalysis.ResolvedIDs,
runStats.aiAnalysis.SeededFindingIDs,
runStats.errors > 0,
)
resolvedCount += autoResolved
if autoResolved > 0 {
log.Info().
Int("auto_resolved", autoResolved).
Msg("AI Patrol: Auto-resolved stale findings after full patrol")
}
}
// Cleanup old resolved findings (always runs, doesn't require LLM)
cleaned := p.findings.Cleanup(findingCleanupAge)
if cleaned > 0 {
log.Debug().Int("cleaned", cleaned).Msg("AI Patrol: Cleaned up old findings")
}
// Recover investigations stuck in "running" state (goroutine panicked or was killed)
p.recoverStuckInvestigations()
// Retry investigations that failed due to timeout (shorter cooldown than permanent failures)
p.retryTimedOutInvestigations()
// AI-based alert review: check active alerts against current state and auto-resolve fixed issues
// Pass llmAllowed so it knows whether AI calls are allowed.
alertsResolved := p.reviewAndResolveAlertsState(ctx, state, llmAllowed, executionID)
if alertsResolved > 0 {
log.Info().Int("alerts_resolved", alertsResolved).Msg("AI Patrol: Auto-resolved alerts where issues are fixed")
}
duration := time.Since(start)
completedAt := time.Now()
// Build findings summary string
summary := p.findings.GetSummary()
var findingsSummaryStr string
var status string
// Only count critical and warning as active issues (watch/info are filtered from UI)
totalActive := summary.Critical + summary.Warning
if totalActive == 0 {
findingsSummaryStr = "All healthy"
status = "healthy"
} else {
parts := []string{}
if summary.Critical > 0 {
parts = append(parts, fmt.Sprintf("%d critical", summary.Critical))
}
if summary.Warning > 0 {
parts = append(parts, fmt.Sprintf("%d warning", summary.Warning))
}
findingsSummaryStr = joinParts(parts)
if summary.Critical > 0 {
status = "critical"
} else {
status = "issues_found"
}
}
if runStats.errors > 0 {
status = "error"
// Don't claim "All healthy" if there were errors - the patrol didn't complete properly
if findingsSummaryStr == "All healthy" {
findingsSummaryStr = fmt.Sprintf("Analysis incomplete (%d errors)", runStats.errors)
}
}
// Create run record
runRecord := PatrolRunRecord{
ID: runID,
StartedAt: start,
CompletedAt: completedAt,
Duration: duration,
DurationMs: duration.Milliseconds(),
Type: patrolType,
TriggerReason: string(trigger),
ResourcesChecked: runStats.resourceCount,
NodesChecked: runStats.nodesChecked,
GuestsChecked: runStats.guestsChecked,
DockerChecked: runStats.dockerChecked,
StorageChecked: runStats.storageChecked,
HostsChecked: runStats.hostsChecked,
TrueNASChecked: runStats.trueNASChecked,
PBSChecked: runStats.pbsChecked,
PMGChecked: runStats.pmgChecked,
KubernetesChecked: runStats.kubernetesChecked,
NewFindings: runStats.newFindings,
ExistingFindings: runStats.existingFindings,
RejectedFindings: runStats.rejectedFindings,
ResolvedFindings: resolvedCount,
AutoFixCount: 0,
FindingsSummary: findingsSummaryStr,
FindingIDs: runStats.findingIDs,
ErrorCount: runStats.errors,
Status: status,
}
if scope != nil {
runRecord.ScopeResourceIDs = scope.ResourceIDs
runRecord.ScopeResourceTypes = scope.ResourceTypes
runRecord.ScopeContext = scope.Context
runRecord.AlertIdentifier = scope.AlertIdentifier
runRecord.FindingID = scope.FindingID
}
// Add AI analysis details if available
if runStats.aiAnalysis != nil {
runRecord.AIAnalysis = runStats.aiAnalysis.Response
runRecord.InputTokens = runStats.aiAnalysis.InputTokens
runRecord.OutputTokens = runStats.aiAnalysis.OutputTokens
runRecord.TriageFlags = runStats.triageFlags
runRecord.TriageSkippedLLM = runStats.triageSkippedLLM
toolCalls := runStats.aiAnalysis.ToolCalls
if len(toolCalls) > MaxToolCallsPerRun {
toolCalls = toolCalls[:MaxToolCallsPerRun]
}
runRecord.ToolCalls = toolCalls
runRecord.ToolCallCount = len(runStats.aiAnalysis.ToolCalls)
log.Debug().
Int("response_length", len(runStats.aiAnalysis.Response)).
Int("input_tokens", runStats.aiAnalysis.InputTokens).
Int("output_tokens", runStats.aiAnalysis.OutputTokens).
Int("tool_calls", runRecord.ToolCallCount).
Msg("AI Patrol: Storing AI analysis in run record")
} else {
log.Debug().Msg("AI Patrol: No AI analysis to store (aiAnalysis is nil)")
}
p.mu.Lock()
p.lastActivity = completedAt
p.lastFullPatrol = completedAt
p.lastDuration = duration
p.resourcesChecked = runStats.resourceCount
p.errorCount = runStats.errors
p.mu.Unlock()
// Record circuit breaker result only if we actually attempted LLM calls.
// canRunLLM is true only when the AI service is enabled AND the breaker allowed the call.
// Use error categorization so non-transient errors (auth failures, insufficient
// credits) don't trip the breaker — those won't be fixed by waiting.
if breaker != nil && canRunLLM {
if runStats.errors > 0 {
aiErr := runStats.lastAIError
if aiErr == nil {
aiErr = fmt.Errorf("patrol completed with %d errors", runStats.errors)
}
breaker.RecordFailureWithCategory(aiErr, circuit.CategorizeError(aiErr))
} else {
breaker.RecordSuccess()
}
}
// Add to history store (handles persistence automatically)
p.runHistoryStore.Add(runRecord)
log.Info().
Str("type", patrolType).
Dur("duration", duration).
Int("resources", runStats.resourceCount).
Int("new_findings", runStats.newFindings).
Int("resolved", resolvedCount).
Int("critical", summary.Critical).
Int("warning", summary.Warning).
Int("watch", summary.Watch).
Msg("AI Patrol: Completed patrol run")
}
// runScopedPatrol runs a patrol on a filtered subset of resources.
// This provides token-efficient analysis for event-driven patrols.
func (p *PatrolService) runScopedPatrol(ctx context.Context, scope PatrolScope) {
p.mu.RLock()
cfg := p.config
breaker := p.circuitBreaker
p.mu.RUnlock()
if !cfg.Enabled {
return
}
if !p.tryStartRun("scoped") {
// Re-queue with backoff if retries remain
if scope.RetryCount < scopedPatrolMaxRetries {
scope.RetryCount++
backoff := scopedPatrolRetryBackoff1
if scope.RetryCount == scopedPatrolMaxRetries {
backoff = scopedPatrolRetryBackoff2
}
scope.RetryAfter = time.Now().Add(backoff)
if tm := p.GetTriggerManager(); tm != nil {
tm.TriggerPatrol(scope)
log.Info().
Int("retry", scope.RetryCount).
Dur("backoff", backoff).
Strs("resources", scope.ResourceIDs).
Msg("AI Patrol: Re-queued dropped scoped patrol with backoff")
}
} else {
GetPatrolMetrics().RecordScopedDroppedFinal()
log.Error().
Strs("resources", scope.ResourceIDs).
Str("reason", string(scope.Reason)).
Int("max_retries", scopedPatrolMaxRetries).
Msg("AI Patrol: Scoped patrol permanently dropped after exhausting retries")
}
return
}
defer p.endRun()
// Check if circuit breaker allows LLM calls.
llmAllowed := breaker == nil || breaker.Allow()
if !llmAllowed {
log.Warn().Msg("AI Patrol: Circuit breaker is open for scoped patrol (LLM calls blocked)")
}
start := time.Now()
runID := fmt.Sprintf("%d", start.UnixNano())
executionID := uuid.NewString()
GetPatrolMetrics().RecordRun(string(scope.Reason), "scoped")
var runStats struct {
resourceCount int
nodesChecked int
guestsChecked int
dockerChecked int
storageChecked int
hostsChecked int
trueNASChecked int
pbsChecked int
pmgChecked int
kubernetesChecked int
newFindings int
existingFindings int
rejectedFindings int
triageFlags int
triageSkippedLLM bool
findingIDs []string
errors int
aiAnalysis *AIAnalysisResult
}
// Get current state
if !p.hasPatrolRuntimeInputs() {
log.Warn().Msg("AI Patrol: No runtime state available for scoped patrol")
return
}
fullState := p.currentPatrolRuntimeState()
// Filter state based on scope
filteredState := p.filterStateByScopeState(fullState, scope)
effectiveScopeIDs := patrolRuntimeSortedResourceIDs(filteredState)
resourceCounts := patrolRuntimeCountResources(filteredState)
resourceCount := 0
if cfg.AnalyzeNodes {
resourceCount += resourceCounts.nodes
}
if cfg.AnalyzeGuests {
resourceCount += resourceCounts.guests
}
if cfg.AnalyzeDocker {
resourceCount += resourceCounts.docker
}
if cfg.AnalyzeStorage {
resourceCount += resourceCounts.storage
}
if cfg.AnalyzePBS {
resourceCount += resourceCounts.pbs
}
if cfg.AnalyzeHosts {
resourceCount += resourceCounts.hosts
resourceCount += resourceCounts.truenas
}
if cfg.AnalyzeKubernetes {
resourceCount += resourceCounts.kubernetes
}
if cfg.AnalyzePMG {
resourceCount += resourceCounts.pmg
}
if resourceCount == 0 {
log.Debug().
Strs("requested_ids", scope.ResourceIDs).
Strs("requested_types", scope.ResourceTypes).
Int("effective_scope_count", len(effectiveScopeIDs)).
Msg("AI Patrol: No resources matched scope filter")
return
}
log.Debug().
Strs("requested_ids", scope.ResourceIDs).
Strs("requested_types", scope.ResourceTypes).
Strs("effective_scope_ids", patrolLogResourceIDs(effectiveScopeIDs)).
Int("effective_scope_count", len(effectiveScopeIDs)).
Int("resource_count", resourceCount).
Str("reason", string(scope.Reason)).
Msg("AI Patrol: Running scoped analysis")
// Track run statistics
if cfg.AnalyzeNodes {
runStats.nodesChecked = resourceCounts.nodes
}
if cfg.AnalyzeGuests {
runStats.guestsChecked = resourceCounts.guests
}
if cfg.AnalyzeDocker {
runStats.dockerChecked = resourceCounts.docker
}
if cfg.AnalyzeStorage {
runStats.storageChecked = resourceCounts.storage
}
if cfg.AnalyzePBS {
runStats.pbsChecked = resourceCounts.pbs
}
if cfg.AnalyzeHosts {
runStats.hostsChecked = resourceCounts.hosts
runStats.trueNASChecked = resourceCounts.truenas
}
if cfg.AnalyzeKubernetes {
runStats.kubernetesChecked = resourceCounts.kubernetes
}
if cfg.AnalyzePMG {
runStats.pmgChecked = resourceCounts.pmg
}
runStats.resourceCount = resourceCount
// Determine if we can run LLM analysis
aiServiceEnabled := p.aiService != nil && p.aiService.IsEnabled()
canRunLLM := aiServiceEnabled && llmAllowed
// Check quickstart credit status for scoped runs
p.mu.RLock()
scopedQsMgr := p.quickstartCredits
p.mu.RUnlock()
scopedUsingQuickstart := p.aiService != nil && p.aiService.IsUsingQuickstart()
if !canRunLLM {
reason := "AI not configured - set up a provider in Settings > Pulse Assistant"
if !aiServiceEnabled {
if p.aiService != nil && strings.TrimSpace(p.aiService.QuickstartBlockedReason()) != "" {
reason = p.aiService.QuickstartBlockedReason()
} else if scopedQsMgr != nil && !scopedQsMgr.HasBYOK() && !scopedQsMgr.HasCredits() {
reason = patrolQuickstartCreditsExhaustedReason
}
} else if !llmAllowed {
reason = "circuit breaker is open"
GetPatrolMetrics().RecordCircuitBlock()
}
p.setBlockedReason(reason)
log.Info().Str("reason", reason).Msg("AI Patrol: Skipping scoped run - AI unavailable")
return
}
// Check if using quickstart credits — verify credits remain before starting
if scopedUsingQuickstart && p.aiService != nil && strings.TrimSpace(p.aiService.QuickstartBlockedReason()) != "" {
p.setBlockedReason(p.aiService.QuickstartBlockedReason())
log.Info().Str("reason", p.aiService.QuickstartBlockedReason()).Msg("AI Patrol: Skipping scoped run - quickstart unavailable")
return
}
if scopedUsingQuickstart && scopedQsMgr != nil && !scopedQsMgr.HasCredits() {
p.setBlockedReason(patrolQuickstartCreditsExhaustedReason)
log.Info().Msg("AI Patrol: Skipping scoped run - quickstart credits exhausted")
return
}
{
p.clearBlockedReason()
if !scope.NoStream {
// Ensure stream state is clean for this run before the first streamed event.
p.resetStreamForRun(runID)
}
// Run agentic AI analysis on filtered state with scope
aiResult, aiErr := p.runAIAnalysisState(ctx, filteredState, &scope, executionID)
if aiErr != nil {
log.Warn().Err(aiErr).Msg("AI Patrol (scoped): LLM analysis failed")
runStats.errors++
if scopedUsingQuickstart {
switch {
case providers.IsQuickstartCreditsExhausted(aiErr):
p.setBlockedReason(patrolQuickstartCreditsExhaustedReason)
case providers.IsQuickstartUnavailable(aiErr), quickstartBootstrapUnavailable(aiErr):
p.setBlockedReason(patrolQuickstartUnavailableReason)
}
}
} else if aiResult != nil {
runStats.aiAnalysis = aiResult
runStats.rejectedFindings = aiResult.RejectedFindings
runStats.triageFlags = aiResult.TriageFlags
runStats.triageSkippedLLM = aiResult.TriageSkippedLLM
// Findings are already recorded via patrol_report_finding tool calls.
for _, f := range aiResult.Findings {
if f.Severity == FindingSeverityWarning || f.Severity == FindingSeverityCritical {
runStats.findingIDs = append(runStats.findingIDs, f.ID)
stored := p.findings.Get(f.ID)
if stored != nil && stored.TimesRaised <= 1 {
runStats.newFindings++
} else {
runStats.existingFindings++
}
}
}
}
}
duration := time.Since(start)
completedAt := time.Now()
// Build findings summary string
summary := p.findings.GetSummary()
var findingsSummaryStr string
var status string
totalActive := summary.Critical + summary.Warning
if totalActive == 0 {
findingsSummaryStr = "All healthy"
status = "healthy"
} else {
parts := []string{}
if summary.Critical > 0 {
parts = append(parts, fmt.Sprintf("%d critical", summary.Critical))
}
if summary.Warning > 0 {
parts = append(parts, fmt.Sprintf("%d warning", summary.Warning))
}
findingsSummaryStr = joinParts(parts)
if summary.Critical > 0 {
status = "critical"
} else {
status = "issues_found"
}
}
if runStats.errors > 0 {
status = "error"
if findingsSummaryStr == "All healthy" {
findingsSummaryStr = fmt.Sprintf("Analysis incomplete (%d errors)", runStats.errors)
}
}
runRecord := PatrolRunRecord{
ID: runID,
StartedAt: start,
CompletedAt: completedAt,
Duration: duration,
DurationMs: duration.Milliseconds(),
Type: "scoped",
TriggerReason: string(scope.Reason),
ScopeResourceIDs: scope.ResourceIDs,
EffectiveScopeResourceIDs: effectiveScopeIDs,
ScopeResourceTypes: scope.ResourceTypes,
ScopeContext: scope.Context,
AlertIdentifier: scope.AlertIdentifier,
FindingID: scope.FindingID,
ResourcesChecked: runStats.resourceCount,
NodesChecked: runStats.nodesChecked,
GuestsChecked: runStats.guestsChecked,
DockerChecked: runStats.dockerChecked,
StorageChecked: runStats.storageChecked,
HostsChecked: runStats.hostsChecked,
TrueNASChecked: runStats.trueNASChecked,
PBSChecked: runStats.pbsChecked,
PMGChecked: runStats.pmgChecked,
KubernetesChecked: runStats.kubernetesChecked,
NewFindings: runStats.newFindings,
ExistingFindings: runStats.existingFindings,
RejectedFindings: runStats.rejectedFindings,
FindingsSummary: findingsSummaryStr,
FindingIDs: runStats.findingIDs,
ErrorCount: runStats.errors,
Status: status,
}
if runStats.aiAnalysis != nil {
runRecord.AIAnalysis = runStats.aiAnalysis.Response
runRecord.InputTokens = runStats.aiAnalysis.InputTokens
runRecord.OutputTokens = runStats.aiAnalysis.OutputTokens
runRecord.TriageFlags = runStats.triageFlags
runRecord.TriageSkippedLLM = runStats.triageSkippedLLM
toolCalls := runStats.aiAnalysis.ToolCalls
if len(toolCalls) > MaxToolCallsPerRun {
toolCalls = toolCalls[:MaxToolCallsPerRun]
}
runRecord.ToolCalls = toolCalls
runRecord.ToolCallCount = len(runStats.aiAnalysis.ToolCalls)
}
p.mu.Lock()
p.lastActivity = completedAt
p.lastDuration = duration
p.resourcesChecked = runStats.resourceCount
p.errorCount = runStats.errors
p.mu.Unlock()
p.runHistoryStore.Add(runRecord)
log.Info().
Strs("requested_ids", scope.ResourceIDs).
Strs("requested_types", scope.ResourceTypes).
Strs("effective_scope_ids", patrolLogResourceIDs(effectiveScopeIDs)).
Int("effective_scope_count", len(effectiveScopeIDs)).
Dur("duration", duration).
Int("resources", resourceCount).
Str("reason", string(scope.Reason)).
Msg("AI Patrol: Scoped patrol complete")
}
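// patrolLogResourceIDs bounds the scope IDs logged inline, replacing the
// overflow beyond scopedPatrolLogIDLimit with a "+N more" marker.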
func patrolLogResourceIDs(ids []string) []string {
if len(ids) <= scopedPatrolLogIDLimit {
return ids
}
trimmed := append([]string(nil), ids[:scopedPatrolLogIDLimit]...)
trimmed = append(trimmed, fmt.Sprintf("... +%d more", len(ids)-scopedPatrolLogIDLimit))
return trimmed
}
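// patrolScopeMatcher matches resources against a PatrolScope by ID and by
// normalized (lowercased) type. An empty ID set or type set is treated as
// "match everything" for that dimension.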
type patrolScopeMatcher struct {
resourceIDSet map[string]bool
typeSet map[string]bool
hasIDs bool
hasTypes bool
}
func newPatrolScopeMatcher(scope PatrolScope) patrolScopeMatcher {
resourceIDSet := make(map[string]bool)
for _, id := range scope.ResourceIDs {
trimmed := strings.TrimSpace(id)
if trimmed == "" {
continue
}
resourceIDSet[trimmed] = true
}
typeSet := make(map[string]bool)
addScopeType := func(t string) {
trimmed := strings.TrimSpace(strings.ToLower(t))
if trimmed == "" {
return
}
switch trimmed {
case "docker-host", "app-container":
// Docker scope types imply each other: scoping to either includes both
// the host and its app containers.
typeSet["docker-host"] = true
typeSet["app-container"] = true
default:
// All other types (system-container, vm, node, storage, agent, pbs, pmg,
// physical_disk, k8s-cluster, and any future types) map through unchanged.
typeSet[trimmed] = true
}
}
for _, t := range scope.ResourceTypes {
addScopeType(t)
}
return patrolScopeMatcher{
resourceIDSet: resourceIDSet,
typeSet: typeSet,
hasIDs: len(resourceIDSet) > 0,
hasTypes: len(typeSet) > 0,
}
}
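// matchesType reports whether any candidate type is in scope. An empty
// type set matches all types.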
func (m patrolScopeMatcher) matchesType(candidates ...string) bool {
if !m.hasTypes {
return true
}
for _, candidate := range candidates {
if candidate == "" {
continue
}
if m.typeSet[strings.ToLower(candidate)] {
return true
}
}
return false
}
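// matchesID reports whether any candidate ID is in scope. An empty ID set
// matches all IDs; candidates are compared verbatim (case-sensitive).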
func (m patrolScopeMatcher) matchesID(candidates ...string) bool {
if !m.hasIDs {
return true
}
for _, candidate := range candidates {
if candidate == "" {
continue
}
if m.resourceIDSet[candidate] {
return true
}
}
return false
}
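// patrolScopedFilterState accumulates the filtered runtime state plus the
// resource IDs and guest VMIDs that survived the scope filter, so dependent
// metadata (alerts, backups, snapshots) can be filtered consistently.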
type patrolScopedFilterState struct {
filtered patrolRuntimeState
includedResourceIDs map[string]bool
includedGuestVMIDs map[int]bool
}
func newPatrolScopedFilterState(snap patrolRuntimeState) patrolScopedFilterState {
return patrolScopedFilterState{
filtered: patrolRuntimeState{
readState: snap.readState,
unifiedResourceProvider: snap.unifiedResourceProvider,
},
includedResourceIDs: make(map[string]bool),
includedGuestVMIDs: make(map[int]bool),
}
}
func (s *patrolScopedFilterState) includeResourceID(ids ...string) {
for _, id := range ids {
if strings.TrimSpace(id) == "" {
continue
}
s.includedResourceIDs[id] = true
}
}
func (s *patrolScopedFilterState) includeGuestVMID(vmid int) {
if vmid > 0 {
s.includedGuestVMIDs[vmid] = true
}
}
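// patrolDockerScopeName returns the first non-empty display name for a
// Docker host, preferring the custom name, then display name, then hostname.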
func patrolDockerScopeName(d models.DockerHost) string {
hostName := d.CustomDisplayName
if hostName == "" {
hostName = d.DisplayName
}
if hostName == "" {
hostName = d.Hostname
}
return hostName
}
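// scopePatrolDockerHost applies the matcher to a Docker host. It returns the
// (possibly container-filtered) host, the resource IDs to mark as included,
// and whether the host belongs in the scoped state at all. A host-level match
// keeps every container; otherwise only matching containers are kept.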
func scopePatrolDockerHost(d models.DockerHost, matcher patrolScopeMatcher) (models.DockerHost, []string, bool) {
if !matcher.matchesType("docker-host", "app-container") {
return models.DockerHost{}, nil, false
}
hostMatches := matcher.matchesID(d.ID, patrolDockerScopeName(d), d.Hostname, d.DisplayName, d.CustomDisplayName)
if !matcher.hasIDs {
included := make([]string, 0, len(d.Containers)+1)
if matcher.typeSet["docker-host"] || !matcher.hasTypes {
included = append(included, d.ID)
}
if matcher.typeSet["app-container"] || !matcher.hasTypes {
for _, c := range d.Containers {
included = append(included, c.ID)
}
}
return d, included, true
}
matchedContainers := make([]models.DockerContainer, 0)
for _, c := range d.Containers {
if matcher.matchesID(c.ID, c.Name) {
matchedContainers = append(matchedContainers, c)
}
}
if hostMatches {
included := make([]string, 0, len(d.Containers)+1)
if matcher.typeSet["docker-host"] || !matcher.hasTypes {
included = append(included, d.ID)
}
if matcher.typeSet["app-container"] || !matcher.hasTypes {
for _, c := range d.Containers {
included = append(included, c.ID)
}
}
return d, included, true
}
if len(matchedContainers) > 0 {
hostCopy := d
hostCopy.Containers = matchedContainers
included := make([]string, 0, len(matchedContainers))
for _, c := range matchedContainers {
included = append(included, c.ID)
}
return hostCopy, included, true
}
return models.DockerHost{}, nil, false
}
func collectPatrolScopedDockerHosts(hosts []models.DockerHost, matcher patrolScopeMatcher) ([]models.DockerHost, []string) {
filtered := make([]models.DockerHost, 0, len(hosts))
ids := make([]string, 0, len(hosts))
for _, host := range hosts {
scopedHost, includeIDs, ok := scopePatrolDockerHost(host, matcher)
if !ok {
continue
}
filtered = append(filtered, scopedHost)
ids = append(ids, includeIDs...)
}
return filtered, ids
}
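// scopePatrolPBSInstance reports whether a PBS instance is in scope, matching
// the instance by ID/name/host or any of its datastores, backup jobs, or
// verify jobs.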
func scopePatrolPBSInstance(pbs models.PBSInstance, matcher patrolScopeMatcher) bool {
if !matcher.matchesType("pbs") {
return false
}
pbsName := pbs.Name
if pbsName == "" {
pbsName = pbs.Host
}
pbsMatches := matcher.matchesID(pbs.ID, pbs.Name, pbsName, pbs.Host)
if !matcher.hasIDs {
return true
}
if !pbsMatches {
for _, ds := range pbs.Datastores {
if matcher.matchesID(pbs.ID+":"+ds.Name, ds.Name) {
pbsMatches = true
break
}
}
}
if !pbsMatches {
for _, job := range pbs.BackupJobs {
if matcher.matchesID(pbs.ID+":job:"+job.ID, job.ID) {
pbsMatches = true
break
}
}
}
if !pbsMatches {
for _, job := range pbs.VerifyJobs {
if matcher.matchesID(pbs.ID+":verify:"+job.ID, job.ID) {
pbsMatches = true
break
}
}
}
return pbsMatches
}
func scopePatrolNode(n models.Node, matcher patrolScopeMatcher) bool {
return matcher.matchesType("node") && matcher.matchesID(n.ID, n.Name)
}
func scopePatrolVM(vm models.VM, matcher patrolScopeMatcher) bool {
return matcher.matchesType("vm") && matcher.matchesID(vm.ID, vm.Name)
}
func scopePatrolContainer(ct models.Container, matcher patrolScopeMatcher) bool {
return matcher.matchesType("system-container") && matcher.matchesID(ct.ID, ct.Name)
}
func scopePatrolStorage(storage models.Storage, matcher patrolScopeMatcher) bool {
return matcher.matchesType("storage") && matcher.matchesID(storage.ID, storage.Name)
}
func scopePatrolPhysicalDisk(disk models.PhysicalDisk, matcher patrolScopeMatcher) bool {
return matcher.matchesType("physical_disk") && matcher.matchesID(disk.ID, disk.DevPath, disk.Model)
}
func scopePatrolPMGInstance(pmg models.PMGInstance, matcher patrolScopeMatcher) bool {
return matcher.matchesType("pmg") && matcher.matchesID(pmg.ID, pmg.Name, pmg.Host)
}
func scopePatrolHost(h models.Host, matcher patrolScopeMatcher) bool {
return matcher.matchesType("agent") && matcher.matchesID(h.ID, h.DisplayName, h.Hostname)
}
func scopePatrolKubernetesCluster(k models.KubernetesCluster, matcher patrolScopeMatcher) bool {
return matcher.matchesType("k8s-cluster") && matcher.matchesID(k.ID, patrolKubernetesScopeName(k))
}
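// The collectPatrolScoped* helpers below each filter one resource slice
// through the matcher and return the kept resources alongside the resource
// IDs that should be marked as included for metadata filtering.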
func collectPatrolScopedNodes(nodes []models.Node, matcher patrolScopeMatcher) ([]models.Node, []string) {
filtered := make([]models.Node, 0, len(nodes))
ids := make([]string, 0, len(nodes))
for _, n := range nodes {
if !scopePatrolNode(n, matcher) {
continue
}
filtered = append(filtered, n)
ids = append(ids, n.ID)
}
return filtered, ids
}
func collectPatrolScopedVMs(vms []models.VM, matcher patrolScopeMatcher) ([]models.VM, []string, []int) {
filtered := make([]models.VM, 0, len(vms))
ids := make([]string, 0, len(vms))
vmids := make([]int, 0, len(vms))
for _, vm := range vms {
if !scopePatrolVM(vm, matcher) {
continue
}
filtered = append(filtered, vm)
ids = append(ids, vm.ID)
vmids = append(vmids, vm.VMID)
}
return filtered, ids, vmids
}
func collectPatrolScopedContainers(containers []models.Container, matcher patrolScopeMatcher) ([]models.Container, []string, []int) {
filtered := make([]models.Container, 0, len(containers))
ids := make([]string, 0, len(containers))
vmids := make([]int, 0, len(containers))
for _, ct := range containers {
if !scopePatrolContainer(ct, matcher) {
continue
}
filtered = append(filtered, ct)
ids = append(ids, ct.ID)
vmids = append(vmids, ct.VMID)
}
return filtered, ids, vmids
}
func collectPatrolScopedStorage(storage []models.Storage, matcher patrolScopeMatcher) ([]models.Storage, []string) {
filtered := make([]models.Storage, 0, len(storage))
ids := make([]string, 0, len(storage))
for _, s := range storage {
if !scopePatrolStorage(s, matcher) {
continue
}
filtered = append(filtered, s)
ids = append(ids, s.ID)
}
return filtered, ids
}
func collectPatrolScopedPhysicalDisks(disks []models.PhysicalDisk, matcher patrolScopeMatcher) ([]models.PhysicalDisk, []string) {
filtered := make([]models.PhysicalDisk, 0, len(disks))
ids := make([]string, 0, len(disks)*2)
for _, disk := range disks {
if !scopePatrolPhysicalDisk(disk, matcher) {
continue
}
filtered = append(filtered, disk)
ids = append(ids, disk.ID, disk.DevPath)
}
return filtered, ids
}
func collectPatrolScopedPBSInstances(instances []models.PBSInstance, matcher patrolScopeMatcher) ([]models.PBSInstance, []string) {
filtered := make([]models.PBSInstance, 0, len(instances))
ids := make([]string, 0, len(instances))
for _, pbs := range instances {
if !scopePatrolPBSInstance(pbs, matcher) {
continue
}
filtered = append(filtered, pbs)
ids = append(ids, pbs.ID)
}
return filtered, ids
}
func collectPatrolScopedPMGInstances(instances []models.PMGInstance, matcher patrolScopeMatcher) ([]models.PMGInstance, []string) {
filtered := make([]models.PMGInstance, 0, len(instances))
ids := make([]string, 0, len(instances))
for _, pmg := range instances {
if !scopePatrolPMGInstance(pmg, matcher) {
continue
}
filtered = append(filtered, pmg)
ids = append(ids, pmg.ID)
}
return filtered, ids
}
func collectPatrolScopedHosts(hosts []models.Host, matcher patrolScopeMatcher) ([]models.Host, []string) {
filtered := make([]models.Host, 0, len(hosts))
ids := make([]string, 0, len(hosts))
for _, h := range hosts {
if !scopePatrolHost(h, matcher) {
continue
}
filtered = append(filtered, h)
ids = append(ids, h.ID)
}
return filtered, ids
}
func collectPatrolScopedKubernetesClusters(clusters []models.KubernetesCluster, matcher patrolScopeMatcher) ([]models.KubernetesCluster, []string) {
filtered := make([]models.KubernetesCluster, 0, len(clusters))
ids := make([]string, 0, len(clusters))
for _, k := range clusters {
if !scopePatrolKubernetesCluster(k, matcher) {
continue
}
filtered = append(filtered, k)
ids = append(ids, k.ID)
}
return filtered, ids
}
func patrolKubernetesScopeName(k models.KubernetesCluster) string {
clusterName := k.CustomDisplayName
if clusterName == "" {
clusterName = k.DisplayName
}
if clusterName == "" {
clusterName = k.Name
}
return clusterName
}
func collectPatrolScopedActiveAlerts(alerts []models.Alert, includedResourceIDs map[string]bool) []models.Alert {
filtered := make([]models.Alert, 0, len(alerts))
for _, alert := range alerts {
if includedResourceIDs[alert.ResourceID] {
filtered = append(filtered, alert)
}
}
return filtered
}
func collectPatrolScopedResolvedAlerts(alerts []models.ResolvedAlert, includedResourceIDs map[string]bool) []models.ResolvedAlert {
filtered := make([]models.ResolvedAlert, 0, len(alerts))
for _, resolved := range alerts {
if includedResourceIDs[resolved.ResourceID] {
filtered = append(filtered, resolved)
}
}
return filtered
}
func collectPatrolScopedConnectionHealth(connectionHealth map[string]bool, includedResourceIDs map[string]bool) map[string]bool {
filtered := make(map[string]bool, len(connectionHealth))
for resourceID, healthy := range connectionHealth {
if includedResourceIDs[resourceID] {
filtered[resourceID] = healthy
}
}
return filtered
}
func collectPatrolScopedBackupTasks(tasks []models.BackupTask, includedGuestVMIDs map[int]bool) []models.BackupTask {
filtered := make([]models.BackupTask, 0, len(tasks))
for _, backupTask := range tasks {
if includedGuestVMIDs[backupTask.VMID] {
filtered = append(filtered, backupTask)
}
}
return filtered
}
func collectPatrolScopedStorageBackups(backups []models.StorageBackup, includedGuestVMIDs map[int]bool) []models.StorageBackup {
filtered := make([]models.StorageBackup, 0, len(backups))
for _, storageBackup := range backups {
if includedGuestVMIDs[storageBackup.VMID] {
filtered = append(filtered, storageBackup)
}
}
return filtered
}
func collectPatrolScopedGuestSnapshots(snapshots []models.GuestSnapshot, includedGuestVMIDs map[int]bool) []models.GuestSnapshot {
filtered := make([]models.GuestSnapshot, 0, len(snapshots))
for _, guestSnapshot := range snapshots {
if includedGuestVMIDs[guestSnapshot.VMID] {
filtered = append(filtered, guestSnapshot)
}
}
return filtered
}
func collectPatrolScopedPBSBackups(backups []models.PBSBackup, includedGuestVMIDs map[int]bool) []models.PBSBackup {
filtered := make([]models.PBSBackup, 0, len(backups))
for _, backup := range backups {
vmid, err := strconv.Atoi(backup.VMID)
if err == nil && includedGuestVMIDs[vmid] {
filtered = append(filtered, backup)
}
}
return filtered
}
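// copyScopedPatrolMetadata carries alert, connection-health, and backup
// metadata into the filtered state, keeping only entries tied to resources
// or guest VMIDs that survived the scope filter.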
func copyScopedPatrolMetadata(dst *patrolRuntimeState, snap patrolRuntimeState, includedResourceIDs map[string]bool, includedGuestVMIDs map[int]bool) {
if len(snap.ActiveAlerts) > 0 {
dst.ActiveAlerts = collectPatrolScopedActiveAlerts(snap.ActiveAlerts, includedResourceIDs)
}
if len(snap.RecentlyResolved) > 0 {
dst.RecentlyResolved = collectPatrolScopedResolvedAlerts(snap.RecentlyResolved, includedResourceIDs)
}
if len(snap.ConnectionHealth) > 0 {
dst.ConnectionHealth = collectPatrolScopedConnectionHealth(snap.ConnectionHealth, includedResourceIDs)
}
if len(includedGuestVMIDs) == 0 {
return
}
dst.PVEBackups.BackupTasks = collectPatrolScopedBackupTasks(snap.PVEBackups.BackupTasks, includedGuestVMIDs)
dst.PVEBackups.StorageBackups = collectPatrolScopedStorageBackups(snap.PVEBackups.StorageBackups, includedGuestVMIDs)
dst.PVEBackups.GuestSnapshots = collectPatrolScopedGuestSnapshots(snap.PVEBackups.GuestSnapshots, includedGuestVMIDs)
dst.PBSBackups = collectPatrolScopedPBSBackups(snap.PBSBackups, includedGuestVMIDs)
}
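// filterStateByScopeState produces a scoped copy of the runtime state
// containing only resources matched by the scope, with dependent metadata
// filtered to match.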
func (p *PatrolService) filterStateByScopeState(snap patrolRuntimeState, scope PatrolScope) patrolRuntimeState {
matcher := newPatrolScopeMatcher(scope)
filterState := newPatrolScopedFilterState(snap)
filteredNodes, nodeIDs := collectPatrolScopedNodes(snap.Nodes, matcher)
filterState.filtered.Nodes = filteredNodes
filterState.includeResourceID(nodeIDs...)
filteredVMs, vmIDs, guestVMIDs := collectPatrolScopedVMs(snap.VMs, matcher)
filterState.filtered.VMs = filteredVMs
filterState.includeResourceID(vmIDs...)
for _, vmid := range guestVMIDs {
filterState.includeGuestVMID(vmid)
}
filteredContainers, containerIDs, containerVMIDs := collectPatrolScopedContainers(snap.Containers, matcher)
filterState.filtered.Containers = filteredContainers
filterState.includeResourceID(containerIDs...)
for _, vmid := range containerVMIDs {
filterState.includeGuestVMID(vmid)
}
filteredDockerHosts, dockerIDs := collectPatrolScopedDockerHosts(snap.DockerHosts, matcher)
filterState.filtered.DockerHosts = filteredDockerHosts
filterState.includeResourceID(dockerIDs...)
filteredStorage, storageIDs := collectPatrolScopedStorage(snap.Storage, matcher)
filterState.filtered.Storage = filteredStorage
filterState.includeResourceID(storageIDs...)
filteredDisks, diskIDs := collectPatrolScopedPhysicalDisks(snap.PhysicalDisks, matcher)
filterState.filtered.PhysicalDisks = filteredDisks
filterState.includeResourceID(diskIDs...)
filteredPBS, pbsIDs := collectPatrolScopedPBSInstances(snap.PBSInstances, matcher)
filterState.filtered.PBSInstances = filteredPBS
filterState.includeResourceID(pbsIDs...)
filteredPMG, pmgIDs := collectPatrolScopedPMGInstances(snap.PMGInstances, matcher)
filterState.filtered.PMGInstances = filteredPMG
filterState.includeResourceID(pmgIDs...)
filteredHosts, hostIDs := collectPatrolScopedHosts(snap.Hosts, matcher)
filterState.filtered.Hosts = filteredHosts
filterState.includeResourceID(hostIDs...)
filteredK8sClusters, k8sClusterIDs := collectPatrolScopedKubernetesClusters(snap.KubernetesClusters, matcher)
filterState.filtered.KubernetesClusters = filteredK8sClusters
filterState.includeResourceID(k8sClusterIDs...)
copyScopedPatrolMetadata(&filterState.filtered, snap, filterState.includedResourceIDs, filterState.includedGuestVMIDs)
return filterState.filtered.withDerivedProviders()
}
// GetStatus returns the current patrol status
func (p *PatrolService) GetStatus() PatrolStatus {
p.mu.RLock()
defer p.mu.RUnlock()
interval := p.config.GetInterval()
intervalMs := interval.Milliseconds()
// "Running" means an analysis is currently in progress, not just the service loop
analysisInProgress := p.runInProgress
status := PatrolStatus{
RuntimeState: PatrolRuntimeStateActive,
Running: analysisInProgress,
Enabled: p.config.Enabled,
LastDuration: p.lastDuration,
ResourcesChecked: p.resourcesChecked,
FindingsCount: len(p.findings.GetActive(FindingSeverityInfo)),
ErrorCount: p.errorCount,
IntervalMs: intervalMs,
BlockedReason: p.lastBlockedReason,
}
if p.triggerManager != nil {
triggerStatus := p.triggerManager.GetStatus()
status.TriggerStatus = &triggerStatus
}
if p.quickstartCredits != nil {
status.QuickstartCreditsRemaining = p.quickstartCredits.CreditsRemaining()
status.QuickstartCreditsTotal = p.quickstartCredits.CreditsTotal()
status.UsingQuickstart = p.aiService != nil && p.aiService.IsUsingQuickstart()
}
quickstartReason := ""
if p.aiService != nil {
quickstartReason = strings.TrimSpace(p.aiService.QuickstartBlockedReason())
}
if quickstartReason == "" &&
p.config.Enabled &&
p.quickstartCredits != nil &&
!p.quickstartCredits.HasBYOK() &&
!p.quickstartCredits.HasCredits() {
quickstartReason = patrolQuickstartCreditsExhaustedReason
}
if quickstartReason != "" {
status.BlockedReason = quickstartReason
} else if strings.TrimSpace(status.BlockedReason) == patrolQuickstartCreditsExhaustedReason ||
strings.TrimSpace(status.BlockedReason) == patrolQuickstartActivationRequiredReason ||
strings.TrimSpace(status.BlockedReason) == patrolQuickstartUnavailableReason {
status.BlockedReason = ""
}
switch {
case analysisInProgress:
status.RuntimeState = PatrolRuntimeStateRunning
case !p.config.Enabled:
status.RuntimeState = PatrolRuntimeStateDisabled
case strings.TrimSpace(status.BlockedReason) != "":
status.RuntimeState = PatrolRuntimeStateBlocked
default:
status.RuntimeState = PatrolRuntimeStateActive
}
if !p.lastFullPatrol.IsZero() {
status.LastPatrolAt = &p.lastFullPatrol
}
if !p.lastActivity.IsZero() {
status.LastActivityAt = &p.lastActivity
}
if strings.TrimSpace(status.BlockedReason) != "" && !p.lastBlockedAt.IsZero() {
status.BlockedAt = &p.lastBlockedAt
}
// Use the tracked next scheduled time (accounts for ticker resets on interval changes)
if p.config.Enabled && interval > 0 && !p.nextScheduledAt.IsZero() {
next := p.nextScheduledAt
status.NextPatrolAt = &next
}
summary := p.findings.GetSummary()
status.Healthy = summary.IsHealthy()
if status.RuntimeState == PatrolRuntimeStateBlocked {
status.Healthy = false
}
return status
}
// SubscribeToStream returns a channel that will receive streaming patrol events
func (p *PatrolService) SubscribeToStream() chan PatrolStreamEvent {
return p.SubscribeToStreamFrom(0)
}
// SubscribeToStreamFrom subscribes a client to patrol streaming events and optionally replays
// events with Seq > lastSeq (best-effort). This allows SSE clients to resume after disconnects
// using the Last-Event-ID header.
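//
// A minimal consumer loop (sketch; SSE framing is up to the caller):
//
//	ch := p.SubscribeToStreamFrom(lastSeq)
//	defer p.UnsubscribeFromStream(ch)
//	for ev := range ch {
//		// write ev as an SSE frame, using ev.Seq as the event ID so the
//		// client can resume via Last-Event-ID after a disconnect
//	}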
func (p *PatrolService) SubscribeToStreamFrom(lastSeq int64) chan PatrolStreamEvent {
ch := make(chan PatrolStreamEvent, 100) // Buffered to prevent blocking
sub := &streamSubscriber{ch: ch}
replayedCount := 0
snapshotReasons := make([]string, 0, 2)
snapshotReasonSeen := make(map[string]struct{}, 2)
p.streamMu.Lock()
p.streamSubscribers[ch] = sub
trySendSnapshot := func(reason string) bool {
if _, seen := snapshotReasonSeen[reason]; seen {
return true
}
snap := p.makeSnapshotLocked(reason)
select {
case ch <- snap:
snapshotReasonSeen[reason] = struct{}{}
snapshotReasons = append(snapshotReasons, reason)
return true
default:
return false
}
}
bufferStart, _ := p.streamBufferWindowLocked() // only the window start matters here
// If the client is behind the buffered window, proactively emit a snapshot that
// advertises truncation. (We may still replay what we have.)
if lastSeq > 0 && bufferStart > 0 && lastSeq < bufferStart && p.streamPhase != "idle" {
trySendSnapshot("buffer_rotated")
}
// Best-effort replay / snapshot:
// - If client provides lastSeq, replay newer buffered events (Seq > lastSeq).
// - If lastSeq is stale/ahead (e.g. from a different run), send a snapshot so UI can resync.
// - If no lastSeq, send a snapshot (late-joiner).
if lastSeq > 0 && len(p.streamEvents) > 0 {
events := p.streamEventsSinceLocked(lastSeq)
replayLoop:
for _, ev := range events {
select {
case ch <- ev:
replayedCount++
default:
// If subscriber can't catch up, stop replaying and let it receive live events.
break replayLoop
}
}
}
if replayedCount == 0 && len(snapshotReasons) == 0 && lastSeq > 0 && p.streamPhase != "idle" {
// lastSeq is likely stale (ahead of this run) or we're missing buffered events.
// Provide a snapshot to allow the UI to resync.
reason := "stale_last_event_id"
if bufferEnd > 0 && lastSeq > bufferEnd {
reason = "stale_last_event_id"
} else if bufferStart > 0 && lastSeq < bufferStart {
reason = "buffer_rotated"
}
trySendSnapshot(reason)
}
if lastSeq == 0 && p.streamPhase != "idle" {
trySendSnapshot("late_joiner")
}
p.streamMu.Unlock()
metrics := GetPatrolMetrics()
if replayedCount > 0 {
metrics.RecordStreamReplay(replayedCount)
log.Debug().Int64("last_seq", lastSeq).Int("replayed_events", replayedCount).Msg("patrol stream replayed buffered events")
}
for _, reason := range snapshotReasons {
metrics.RecordStreamSnapshot(reason)
log.Debug().Int64("last_seq", lastSeq).Str("resync_reason", reason).Msg("patrol stream sent synthetic snapshot")
}
if lastSeq > 0 && replayedCount == 0 && len(snapshotReasons) == 0 {
metrics.RecordStreamMiss()
log.Debug().Int64("last_seq", lastSeq).Msg("patrol stream resume had no replay or snapshot")
}
return ch
}
// UnsubscribeFromStream removes a subscriber
func (p *PatrolService) UnsubscribeFromStream(ch chan PatrolStreamEvent) {
p.streamMu.Lock()
sub, exists := p.streamSubscribers[ch]
delete(p.streamSubscribers, ch)
p.streamMu.Unlock()
// Use atomic CAS to ensure exactly one goroutine closes the channel,
// even if broadcast and unsubscribe race.
if exists && sub.closed.CompareAndSwap(false, true) {
close(ch)
}
}
// broadcast sends an event to all subscribers
// Subscribers with full channels are automatically removed to prevent memory leaks
func (p *PatrolService) broadcast(event PatrolStreamEvent) {
p.streamMu.Lock()
defer p.streamMu.Unlock()
// Track a few pieces of best-effort state for snapshots/resync.
switch event.Type {
case "tool_start":
if event.ToolName != "" {
p.streamCurrentTool = event.ToolName
}
case "tool_end":
p.streamCurrentTool = ""
}
// Bound payload sizes so streaming and replay buffers can't balloon due to a single tool
// output or oversized content chunk.
event = truncateStreamEvent(event)
// Decorate once so every subscriber sees identical meta.
event = p.decorateStreamEventLocked(event)
p.appendStreamEventLocked(event)
var staleChannels []chan PatrolStreamEvent
dropReasons := make(map[chan PatrolStreamEvent]string)
for ch, sub := range p.streamSubscribers {
if sub == nil || sub.closed.Load() {
staleChannels = append(staleChannels, ch)
dropReasons[ch] = "closed"
continue
}
select {
case ch <- event:
// Successfully sent
sub.fullCount = 0
default:
// Channel full. Tolerate bursts, but disconnect subscribers that are
// consistently unable to receive events (likely dead/slow clients).
sub.fullCount++
if sub.fullCount >= 25 {
staleChannels = append(staleChannels, ch)
dropReasons[ch] = "backpressure"
}
}
}
// Clean up stale subscribers using atomic CAS for safe close
for _, ch := range staleChannels {
sub := p.streamSubscribers[ch]
delete(p.streamSubscribers, ch)
reason := dropReasons[ch]
GetPatrolMetrics().RecordStreamSubscriberDrop(reason)
log.Debug().Str("reason", reason).Msg("patrol stream subscriber dropped")
if sub != nil && sub.closed.CompareAndSwap(false, true) {
close(ch)
}
}
}
// resetStreamForRun resets stream state for a new run so late-joiners don't see stale output.
// This should only be called for runs that will actually stream events (NoStream=false).
func (p *PatrolService) resetStreamForRun(runID string) {
p.streamMu.Lock()
p.streamRunID = runID
p.streamSeq = 0
p.streamPhase = "idle"
p.streamCurrentTool = ""
p.currentOutput.Reset()
p.streamEvents = nil
p.streamMu.Unlock()
}
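// decorateStreamEventLocked stamps a missing run ID, sequence number, and
// timestamp onto an event so all subscribers see identical metadata.
// Callers must hold streamMu.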
func (p *PatrolService) decorateStreamEventLocked(event PatrolStreamEvent) PatrolStreamEvent {
if event.RunID == "" {
event.RunID = p.streamRunID
}
if event.Seq == 0 {
p.streamSeq++
event.Seq = p.streamSeq
}
if event.TsMs == 0 {
event.TsMs = time.Now().UnixMilli()
}
return event
}
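// Streaming limits: how many events are buffered for Last-Event-ID replay,
// and how large any single event field may grow before truncation.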
const patrolStreamReplayBufferSize = 200
const patrolStreamMaxEventFieldBytes = 8 * 1024
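// truncateStreamEvent bounds every free-form field of an event so a single
// oversized tool payload cannot bloat the replay buffer.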
func truncateStreamEvent(event PatrolStreamEvent) PatrolStreamEvent {
event.Content = truncateStreamField(event.Content, patrolStreamMaxEventFieldBytes)
event.ToolInput = truncateStreamField(event.ToolInput, patrolStreamMaxEventFieldBytes)
event.ToolRawInput = truncateStreamField(event.ToolRawInput, patrolStreamMaxEventFieldBytes)
event.ToolOutput = truncateStreamField(event.ToolOutput, patrolStreamMaxEventFieldBytes)
return event
}
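// truncateStreamField trims s to at most max bytes, appending a truncation
// marker when it fits. Truncation is byte-based, so a multi-byte UTF-8 rune
// at the cut point may be split.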

// truncateStreamField caps s at max bytes, appending a truncation marker when
// room allows and backing the cut up to a UTF-8 rune boundary so a multi-byte
// rune is never split.
func truncateStreamField(s string, max int) string {
	if max <= 0 || len(s) <= max {
		return s
	}
	const suffix = "...[truncated]"
	cut := max
	if max > len(suffix) {
		cut = max - len(suffix)
	}
	// 0b10xxxxxx bytes are UTF-8 continuations; step back to the rune start.
	for cut > 0 && s[cut]&0xC0 == 0x80 {
		cut--
	}
	if max <= len(suffix) {
		return s[:cut]
	}
	return s[:cut] + suffix
}
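
// Illustratively (ASCII input, so the rune-boundary backup is a no-op):
//
//	truncateStreamField("abcdefghij", 5)                   // "abcde" (max <= len(suffix), marker omitted)
//	truncateStreamField(strings.Repeat("x", 9000), 8*1024) // 8192 bytes ending in "...[truncated]"
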
func (p *PatrolService) appendStreamEventLocked(event PatrolStreamEvent) {
// Keep a bounded buffer for Last-Event-ID replay (best-effort).
p.streamEvents = append(p.streamEvents, event)
if len(p.streamEvents) > patrolStreamReplayBufferSize {
p.streamEvents = p.streamEvents[len(p.streamEvents)-patrolStreamReplayBufferSize:]
}
}

// streamEventsSinceLocked returns buffered events with Seq > lastSeq. For
// example, with buffered seqs [5..12], lastSeq=7 replays events 8..12, while
// lastSeq=1 returns the whole buffer (a caller can pair that case with a
// snapshot, since lastSeq predates the window).
func (p *PatrolService) streamEventsSinceLocked(lastSeq int64) []PatrolStreamEvent {
	// Seq is monotonic within a run; the buffer is reset on each new run.
for i := len(p.streamEvents) - 1; i >= 0; i-- {
if p.streamEvents[i].Seq <= lastSeq {
// Return events after i
out := make([]PatrolStreamEvent, len(p.streamEvents)-(i+1))
copy(out, p.streamEvents[i+1:])
return out
}
}
// All buffered events are newer
out := make([]PatrolStreamEvent, len(p.streamEvents))
copy(out, p.streamEvents)
return out
}
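
// streamBufferWindowLocked reports the first and last sequence numbers held in
// the replay buffer, or (0, 0) when the buffer is empty.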
func (p *PatrolService) streamBufferWindowLocked() (start, end int64) {
if len(p.streamEvents) == 0 {
return 0, 0
}
return p.streamEvents[0].Seq, p.streamEvents[len(p.streamEvents)-1].Seq
}
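
// makeSnapshotLocked builds a synthetic resync event carrying the full
// buffered output, the current phase and tool, and the replay-buffer window.
// It reuses the last real event's seq so clients can resume from a meaningful
// Last-Event-ID.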
func (p *PatrolService) makeSnapshotLocked(reason string) PatrolStreamEvent {
start, end := p.streamBufferWindowLocked()
phase := p.streamPhase
if phase == "idle" {
phase = ""
}
tr := p.currentOutput.Truncated()
var trPtr *bool
if tr {
trPtr = &tr
}
// Snapshot is synthetic and should not advance seq; use the most recent real event seq
// so clients can resume from a meaningful Last-Event-ID.
seq := end
return PatrolStreamEvent{
Type: "snapshot",
RunID: p.streamRunID,
Seq: seq,
TsMs: time.Now().UnixMilli(),
ResyncReason: reason,
BufferStart: start,
BufferEnd: end,
ContentTruncated: trPtr,
Phase: phase,
Content: p.currentOutput.String(),
ToolName: p.streamCurrentTool,
}
}
// appendStreamContent adds content to the current output and broadcasts it
func (p *PatrolService) appendStreamContent(content string) {
p.streamMu.Lock()
p.currentOutput.WriteString(content)
p.streamMu.Unlock()
p.broadcast(PatrolStreamEvent{
Type: "content",
Content: content,
})
}
// setStreamPhase updates the current phase and broadcasts it to all subscribers.
// The frontend only updates its phase display when it receives a 'phase' event,
// so we must broadcast phase changes to keep the UI in sync.
func (p *PatrolService) setStreamPhase(phase string) {
p.streamMu.Lock()
oldPhase := p.streamPhase
p.streamPhase = phase
p.streamMu.Unlock()
// Broadcast phase change (except for idle which just clears state)
// This ensures late joiners and continuous watchers see the current phase
if phase != "idle" && phase != oldPhase {
p.broadcast(PatrolStreamEvent{
Type: "phase",
Phase: phase,
})
}
}

// GetCurrentStreamOutput returns the current buffered output and phase (for
// late joiners).
func (p *PatrolService) GetCurrentStreamOutput() (string, string) {
p.streamMu.RLock()
defer p.streamMu.RUnlock()
return p.currentOutput.String(), p.streamPhase
}

// reviewAndResolveAlertsState uses AI to review active alerts and resolve
// those whose underlying issue is fixed. This is the core of autonomous alert
// management: the AI looks at each alert, checks the current state, and
// decides whether the issue has cleared. It returns the number of alerts
// resolved.
func (p *PatrolService) reviewAndResolveAlertsState(ctx context.Context, state patrolRuntimeState, llmAllowed bool, executionID string) int {
p.mu.RLock()
resolver := p.alertResolver
aiService := p.aiService
p.mu.RUnlock()
if resolver == nil {
return 0
}
activeAlerts := resolver.GetActiveAlerts()
if len(activeAlerts) == 0 {
return 0
}
// Only review alerts that have been active for at least 10 minutes
// This avoids thrashing on transient alerts
minAge := 10 * time.Minute
var alertsToReview []AlertInfo
for _, alert := range activeAlerts {
if time.Since(alert.StartTime) >= minAge {
alertsToReview = append(alertsToReview, alert)
}
}
if len(alertsToReview) == 0 {
return 0
}
log.Info().
Int("total_active", len(activeAlerts)).
Int("to_review", len(alertsToReview)).
Msg("AI Patrol: Reviewing alerts for auto-resolution")
resolvedCount := 0
// Pass nil for aiService if LLM is not allowed (use heuristic checks only).
aiSvc := aiService
if !llmAllowed {
aiSvc = nil
}
for _, alert := range alertsToReview {
shouldResolve, reason := p.shouldResolveAlertState(ctx, alert, state, aiSvc, executionID)
if shouldResolve {
if resolver.ResolveAlert(alert.ID) {
resolvedCount++
log.Info().
Str("alertID", alert.ID).
Str("resource", alert.ResourceName).
Str("reason", reason).
Dur("age", time.Since(alert.StartTime)).
Msg("AI Patrol: Auto-resolved alert - issue no longer detected")
}
}
}
if resolvedCount > 0 {
log.Info().
Int("resolved", resolvedCount).
Msg("AI Patrol: Completed alert review")
}
return resolvedCount
}

// shouldResolveAlertState determines whether an alert should be auto-resolved
// based on the current snapshot. Returns (shouldResolve, reason).
func (p *PatrolService) shouldResolveAlertState(ctx context.Context, alert AlertInfo, snap patrolRuntimeState, aiService *Service, executionID string) (bool, string) {
// First, try smart heuristic checks based on alert type
switch alert.Type {
case "usage": // Storage usage alert
resource := lookupPatrolAlertResourceState(alert, snap)
if resource.found {
			if resource.disk < alert.Threshold*0.95 { // resolve only below 95% of the threshold, e.g. <76% for an 80% threshold
return true, fmt.Sprintf("storage usage dropped from %.1f%% to %.1f%% (threshold: %.1f%%)",
alert.Value, resource.disk, alert.Threshold)
}
return false, ""
}
// Storage not found in current snapshot - might have been removed
// Resolve after 24 hours if resource is gone
if time.Since(alert.StartTime) > 24*time.Hour {
return true, "resource no longer present in infrastructure"
}
case "cpu", "memory": // Resource utilization alerts
// Check if this is a node, VM, container, or docker container
currentValue := p.getCurrentMetricValueState(alert, snap)
		if currentValue >= 0 && currentValue < alert.Threshold*0.95 { // -1 means unavailable; same 95%-of-threshold margin
return true, fmt.Sprintf("%s dropped from %.1f%% to %.1f%% (threshold: %.1f%%)",
alert.Type, alert.Value, currentValue, alert.Threshold)
}
case "offline", "stopped", "docker-offline":
// Check if the resource is now online
if p.isResourceOnlineState(alert, snap) {
return true, "resource is now online/running"
}
}
// For complex cases or when heuristics don't apply, use AI judgment if available
if aiService != nil && aiService.IsEnabled() {
return p.askAIAboutAlertState(ctx, alert, snap, aiService, executionID)
}
return false, ""
}

// patrolAlertResourceState captures the current snapshot state of the resource
// an alert refers to; found reports whether the resource could be located.
type patrolAlertResourceState struct {
resourceType string
platform string
name string
status string
cpu float64
memory float64
disk float64
found bool
}
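
// patrolAlertNameMatches reports whether any of the candidate IDs or names
// match the alert's resource ID or resource name; blank candidates are skipped.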
func patrolAlertNameMatches(alert AlertInfo, ids ...string) bool {
for _, id := range ids {
if strings.TrimSpace(id) == "" {
continue
}
if id == alert.ResourceID || id == alert.ResourceName {
return true
}
}
return false
}
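
// patrolAlertLookupType normalizes the alert's resource type for snapshot
// lookups; legacy "usage" alerts without a concrete resource type are treated
// as storage.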
func patrolAlertLookupType(alert AlertInfo) string {
resourceType := canonicalizeAICompatibilityResourceType(alert.ResourceType)
if alert.Type == "usage" && (resourceType == "" || strings.EqualFold(resourceType, "usage")) {
return "storage"
}
return resourceType
}
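
// lookupPatrolAlertResourceState resolves the alert's resource in the given
// snapshot; the zero value (found=false) means the resource wasn't located.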
func lookupPatrolAlertResourceState(alert AlertInfo, snap patrolRuntimeState) patrolAlertResourceState {
alert.ResourceType = patrolAlertLookupType(alert)
if alert.ResourceType == "app-container" {
if resource, ok := patrolLookupAppContainerAlertResourceState(alert, snap); ok {
return resource
}
}
switch alert.ResourceType {
case "storage":
if resource, ok := patrolLookupStorageAlertResourceState(alert, snap); ok {
return resource
}
case "node":
if resource, ok := patrolLookupNodeAlertResourceState(alert, snap); ok {
return resource
}
case "vm":
if resource, ok := patrolLookupGuestAlertResourceState(alert, snap, "VM", "vm"); ok {
return resource
}
case "system-container":
if resource, ok := patrolLookupGuestAlertResourceState(alert, snap, "Container", "system-container"); ok {
return resource
}
case "agent":
if resource, ok := patrolLookupHostAlertResourceState(alert, snap); ok {
return resource
}
}
return patrolAlertResourceState{}
}
func patrolLookupAppContainerAlertResourceState(alert AlertInfo, snap patrolRuntimeState) (patrolAlertResourceState, bool) {
for _, container := range patrolAppContainerRows(snap, nil) {
if !patrolAlertNameMatches(alert, container.id, container.name) {
continue
}
return patrolAlertResourceState{
resourceType: "app-container",
name: container.name,
status: container.status,
cpu: container.cpu,
memory: container.memory,
found: true,
}, true
}
return patrolAlertResourceState{}, false
}
func patrolLookupStorageAlertResourceState(alert AlertInfo, snap patrolRuntimeState) (patrolAlertResourceState, bool) {
for _, storage := range patrolStoragePoolRows(snap, nil) {
if !patrolAlertNameMatches(alert, storage.id, storage.name) {
continue
}
return patrolAlertResourceState{
resourceType: "storage",
name: storage.name,
status: storage.status,
disk: storage.usage,
found: true,
}, true
}
return patrolAlertResourceState{}, false
}
func patrolLookupNodeAlertResourceState(alert AlertInfo, snap patrolRuntimeState) (patrolAlertResourceState, bool) {
for _, node := range patrolNodeInventoryRows(snap, nil) {
if !patrolAlertNameMatches(alert, node.id, node.name) {
continue
}
return patrolAlertResourceState{
resourceType: "node",
name: node.name,
status: node.status,
cpu: node.cpu,
memory: node.mem,
found: true,
}, true
}
return patrolAlertResourceState{}, false
}
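
// patrolLookupGuestAlertResourceState matches VMs or system containers:
// guestType selects the inventory row type ("VM" or "Container") and
// resourceType is the canonical type reported back on the result.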
func patrolLookupGuestAlertResourceState(alert AlertInfo, snap patrolRuntimeState, guestType, resourceType string) (patrolAlertResourceState, bool) {
for _, guest := range patrolGuestInventoryRows(snap, nil, nil) {
if guest.gType != guestType || !patrolAlertNameMatches(alert, guest.id, guest.name) {
continue
}
return patrolAlertResourceState{
resourceType: resourceType,
name: guest.name,
status: guest.status,
cpu: guest.cpu,
memory: guest.mem,
found: true,
}, true
}
return patrolAlertResourceState{}, false
}
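
// patrolHostAlertRow is a source-agnostic view of an agent host used for
// alert matching.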
type patrolHostAlertRow struct {
id, name, hostname, status, platform string
cpu, memory float64
}
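
// patrolHostAlertRows flattens agent hosts from whichever source the snapshot
// carries: the read-model state when present, otherwise the snapshot's Hosts.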
func patrolHostAlertRows(snap patrolRuntimeState) []patrolHostAlertRow {
if snap.readState != nil {
hosts := snap.readState.Hosts()
rows := make([]patrolHostAlertRow, 0, len(hosts))
for _, host := range hosts {
rows = append(rows, patrolHostAlertRow{
id: host.ID(),
name: host.Name(),
hostname: host.Hostname(),
status: string(host.Status()),
platform: host.Platform(),
cpu: host.CPUPercent(),
memory: host.MemoryPercent(),
})
}
return rows
}
rows := make([]patrolHostAlertRow, 0, len(snap.Hosts))
for _, host := range snap.Hosts {
rows = append(rows, patrolHostAlertRow{
id: host.ID,
name: host.DisplayName,
hostname: host.Hostname,
status: host.Status,
platform: host.Platform,
cpu: host.CPUUsage,
memory: host.Memory.Usage,
})
}
return rows
}

// patrolHostResourceType maps an agent host to its alert resource type. All
// platforms currently resolve to "agent"; the parameter is retained so
// platform-specific types can be added without changing call sites.
func patrolHostResourceType(_ string) string {
	return "agent"
}
func patrolLookupHostAlertResourceState(alert AlertInfo, snap patrolRuntimeState) (patrolAlertResourceState, bool) {
for _, host := range patrolHostAlertRows(snap) {
if !patrolAlertNameMatches(alert, host.id, host.name, host.hostname) {
continue
}
name := host.hostname
if name == "" {
name = host.name
}
return patrolAlertResourceState{
resourceType: patrolHostResourceType(host.platform),
platform: host.platform,
name: name,
status: host.status,
cpu: host.cpu,
memory: host.memory,
found: true,
}, true
}
return patrolAlertResourceState{}, false
}
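
// getCurrentMetricValueState returns the current value of the metric that
// triggered the alert, or -1 when the resource is missing or the metric type
// isn't tracked here.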
func (p *PatrolService) getCurrentMetricValueState(alert AlertInfo, snap patrolRuntimeState) float64 {
resource := lookupPatrolAlertResourceState(alert, snap)
if !resource.found {
return -1
}
switch alert.Type {
case "cpu":
return resource.cpu
case "memory":
return resource.memory
default:
if resource.resourceType == "storage" {
return resource.disk
}
}
return -1
}

// isResourceOnlineState checks whether a resource that triggered an
// offline/stopped alert is now online or running.
func (p *PatrolService) isResourceOnlineState(alert AlertInfo, snap patrolRuntimeState) bool {
resource := lookupPatrolAlertResourceState(alert, snap)
if !resource.found {
return false
}
switch resource.resourceType {
case "node", "agent":
return resource.status == "online"
case "vm", "system-container", "app-container":
return resource.status == "running" || resource.status == string(unifiedresources.StatusOnline)
default:
return false
}
}

// askAIAboutAlertState asks the AI to judge whether an alert should be
// resolved, returning (shouldResolve, reason).
func (p *PatrolService) askAIAboutAlertState(ctx context.Context, alert AlertInfo, snap patrolRuntimeState, aiService *Service, executionID string) (bool, string) {
alertType := patrolAlertLookupType(alert)
// Build a focused prompt for the AI
prompt := fmt.Sprintf(`Review this alert and determine if it should be auto-resolved based on current state.
ALERT:
- ID: %s
- Type: %s
- Resource: %s (%s)
- Message: %s
- Value when triggered: %.1f
- Threshold: %.1f
- Active for: %s
CURRENT STATE OF THIS RESOURCE:
%s
Should this alert be RESOLVED because the underlying issue is fixed?
Respond with ONLY one of:
- RESOLVE: <brief reason>
- KEEP: <brief reason>`,
alert.ID, alert.Type, alert.ResourceName, alertType,
alert.Message, alert.Value, alert.Threshold, alert.Duration,
p.getResourceCurrentStateState(alert, snap))
// Use a quick, low-cost AI call
response, err := aiService.QuickAnalysis(ctx, QuickAnalysisRequest{
Prompt: prompt,
ExecutionID: executionID,
UseCase: "patrol",
})
if err != nil {
log.Debug().Err(err).Str("alertID", alert.ID).Msg("AI Patrol: Failed to get AI judgment on alert")
return false, ""
}
response = strings.TrimSpace(response)
	if strings.HasPrefix(strings.ToUpper(response), "RESOLVE:") {
		// Slice off the prefix by length so the match stays case-insensitive
		// ("resolve:", "Resolve:") while the reason keeps its original casing.
		reason := strings.TrimSpace(response[len("RESOLVE:"):])
		if reason == "" {
			reason = "AI review judged the issue resolved"
		}
		return true, "Patrol: " + reason
	}
return false, ""
}
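
// Replies the parser above accepts, illustratively:
//
//	"RESOLVE: usage back under threshold" -> (true, "Patrol: usage back under threshold")
//	"resolve: host back online"           -> (true, "Patrol: host back online")
//	"KEEP: memory still above threshold"  -> (false, "")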

// getResourceCurrentStateState returns a human-readable description of the
// resource's current state for the AI prompt.
func (p *PatrolService) getResourceCurrentStateState(alert AlertInfo, snap patrolRuntimeState) string {
resource := lookupPatrolAlertResourceState(alert, snap)
if !resource.found {
switch patrolAlertLookupType(alert) {
case "storage":
return "Storage not found in current state (may have been removed)"
case "node":
return "Node not found in current state"
case "agent":
return "Agent host not found in current state"
case "vm":
return "VM not found in current state"
case "system-container":
return "Container not found in current state"
case "app-container":
return "Docker container not found in current state"
default:
return "Resource state unknown"
}
}
switch resource.resourceType {
case "storage":
return fmt.Sprintf("Storage '%s': %.1f%% used, status: %s", resource.name, resource.disk, resource.status)
case "node":
return fmt.Sprintf("Node '%s': CPU %.1f%%, Memory %.1f%%, Status: %s",
resource.name, resource.cpu, resource.memory, resource.status)
case "agent":
if strings.EqualFold(strings.TrimSpace(resource.platform), "truenas") {
return fmt.Sprintf("TrueNAS system '%s': CPU %.1f%%, Memory %.1f%%, Status: %s",
resource.name, resource.cpu, resource.memory, resource.status)
}
return fmt.Sprintf("Agent host '%s': CPU %.1f%%, Memory %.1f%%, Status: %s",
resource.name, resource.cpu, resource.memory, resource.status)
case "vm":
return fmt.Sprintf("VM '%s': CPU %.1f%%, Memory %.1f%%, Status: %s",
resource.name, resource.cpu, resource.memory, resource.status)
case "system-container":
return fmt.Sprintf("Container '%s': CPU %.1f%%, Memory %.1f%%, Status: %s",
resource.name, resource.cpu, resource.memory, resource.status)
case "app-container":
return fmt.Sprintf("Docker container '%s': CPU %.1f%%, Memory %.1f%%, State: %s",
resource.name, resource.cpu, resource.memory, resource.status)
default:
return "Resource state unknown"
}
}

// TriggerPatrolForAlert queues an immediate scoped patrol for a specific
// alert, preferring the trigger manager and falling back to the ad-hoc
// trigger channel when no manager is configured.
func (p *PatrolService) TriggerPatrolForAlert(alert *alerts.Alert) {
if alert == nil {
return
}
p.mu.RLock()
triggerManager := p.triggerManager
eventTriggerConfig := p.eventTriggerConfig
p.mu.RUnlock()
// Gate: skip if alert-driven scoped patrols are disabled
if !eventTriggerConfig.AlertTriggersEnabled {
log.Debug().
Str("alert_identifier", alert.ID).
Msg("alert-triggered patrol skipped: alert trigger source disabled")
return
}
resourceType := inferResourceType(alert.Type, alert.Metadata)
if triggerManager != nil {
scope := AlertTriggeredPatrolScope(alert.ID, alert.ResourceID, resourceType, alert.Type)
if triggerManager.TriggerPatrol(scope) {
log.Debug().Str("alert_identifier", alert.ID).Msg("queued alert-triggered patrol via trigger manager")
} else {
log.Warn().Str("alert_identifier", alert.ID).Msg("alert-triggered patrol rejected by trigger manager")
}
return
}
	// No trigger manager configured: fall back to a non-blocking send so a
	// full queue never stalls the caller.
select {
case p.adHocTrigger <- alert:
log.Debug().Str("alert_identifier", alert.ID).Msg("queued ad-hoc patrol trigger")
default:
log.Warn().Str("alert_identifier", alert.ID).Msg("patrol trigger queue full, dropping trigger")
}
}
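
// tryStartRun attempts to claim the single patrol-run slot, returning false
// when a run is already in progress. Runs stuck for more than 20 minutes are
// treated as dead and force-cleared so the service cannot wedge permanently.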
func (p *PatrolService) tryStartRun(kind string) bool {
p.mu.Lock()
if p.runInProgress {
// Detect stuck runs: if the current run has been going for >20 minutes,
// force-clear the flag so a new run can proceed.
if !p.runStartedAt.IsZero() && time.Since(p.runStartedAt) > 20*time.Minute {
log.Warn().
Str("kind", kind).
Time("started_at", p.runStartedAt).
Dur("elapsed", time.Since(p.runStartedAt)).
Msg("AI Patrol: Previous run appears stuck (>20min), force-clearing runInProgress")
p.runInProgress = false
// Fall through to start new run
} else {
p.mu.Unlock()
if kind == "scoped" {
GetPatrolMetrics().RecordScopedDropped()
log.Warn().Str("kind", kind).Msg("AI Patrol: Run already in progress, dropping scoped patrol")
} else {
log.Debug().Str("kind", kind).Msg("AI Patrol: Run already in progress, skipping")
}
return false
}
}
p.runInProgress = true
p.runStartedAt = time.Now()
p.mu.Unlock()
return true
}
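
// endRun releases the run slot and performs best-effort maintenance on the
// investigation store.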
func (p *PatrolService) endRun() {
p.mu.Lock()
p.runInProgress = false
orch := p.investigationOrchestrator
p.mu.Unlock()
// Periodic investigation store maintenance after each run
if maintainer, ok := orch.(InvestigationStoreMaintainer); ok {
maintainer.CleanupInvestigationStore(24*time.Hour, 1000)
}
}
// runTargetedPatrol executes a focused patrol for a specific alert
func (p *PatrolService) runTargetedPatrol(ctx context.Context, alert *alerts.Alert) {
log.Info().
Str("alert_identifier", alert.ID).
Str("resource_id", alert.ResourceID).
Msg("Running targeted AI patrol for alert")
resourceType := inferResourceType(alert.Type, alert.Metadata)
scope := AlertTriggeredPatrolScope(alert.ID, alert.ResourceID, resourceType, alert.Type)
p.TriggerScopedPatrol(ctx, scope)
}

// joinParts joins string parts with commas and "and" for the last element,
// e.g. ["a"] -> "a", ["a","b"] -> "a and b", ["a","b","c"] -> "a, b, and c".
func joinParts(parts []string) string {
if len(parts) == 0 {
return ""
}
if len(parts) == 1 {
return parts[0]
}
if len(parts) == 2 {
return parts[0] + " and " + parts[1]
}
return strings.Join(parts[:len(parts)-1], ", ") + ", and " + parts[len(parts)-1]
}
// generateFindingID creates a stable ID for a finding based on resource, category, and issue.
// All three components are included to ensure distinct issues on the same resource remain separate.
func generateFindingID(resourceID, category, issue string) string {
hash := sha256.Sum256([]byte(fmt.Sprintf("%s:%s:%s", resourceID, category, issue)))
return fmt.Sprintf("%x", hash[:8])
}
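
// For example, generateFindingID("node/pve1", "capacity", "root volume above 90%")
// produces the same 16-hex-character ID on every run (inputs here are
// illustrative), so repeat findings can be matched against prior runs.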