Pulse/internal/ai/patrol_ai.go
2026-04-03 19:45:38 +01:00

3795 lines
117 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// patrol_ai.go handles all LLM interaction for patrol: seed context building,
// system/user prompt construction, the agentic analysis loop, evaluation passes,
// stale finding reconciliation, and thinking-token cleanup for model responses.
package ai
import (
"context"
"encoding/json"
"fmt"
"regexp"
"sort"
"strconv"
"strings"
"sync"
"time"
"github.com/google/uuid"
"github.com/rcourtman/pulse-go-rewrite/internal/ai/baseline"
"github.com/rcourtman/pulse-go-rewrite/internal/ai/chat"
"github.com/rcourtman/pulse-go-rewrite/internal/ai/correlation"
"github.com/rcourtman/pulse-go-rewrite/internal/ai/cost"
"github.com/rcourtman/pulse-go-rewrite/internal/ai/memory"
"github.com/rcourtman/pulse-go-rewrite/internal/ai/providers"
"github.com/rcourtman/pulse-go-rewrite/internal/ai/tools"
"github.com/rcourtman/pulse-go-rewrite/internal/models"
"github.com/rcourtman/pulse-go-rewrite/internal/unifiedresources"
"github.com/rs/zerolog/log"
)
// AIAnalysisResult contains the results of one patrol AI analysis run: the
// model's final text, the findings collected through the finding-creator
// adapter, token usage, and bookkeeping about which finding IDs were seeded,
// reported, or resolved during the run.
type AIAnalysisResult struct {
Response string // The AI's response text (thinking tokens stripped), or a fixed message when the LLM was skipped
Findings []*Finding // Findings collected by the patrol finding creator during this run
RejectedFindings int // Findings rejected by threshold validation
TriageFlags int // Number of deterministic triage flags
TriageSkippedLLM bool // True if LLM was skipped due to quiet triage
// InputTokens and OutputTokens are the token counts of the main patrol pass
// plus, when it ran successfully, the evaluation pass.
InputTokens int
OutputTokens int
ToolCalls []ToolCallRecord // Tool invocations during this analysis
ReportedIDs []string // Finding IDs reported (created/re-reported) this run
ResolvedIDs []string // Finding IDs explicitly resolved by LLM this run
SeededFindingIDs []string // Finding IDs that were presented in seed context
}
// Turn budget for a standard-depth patrol. computePatrolMaxTurns starts at
// patrolMinTurns, adds patrolTurnsPer50Devices for every full 50 resources,
// and never exceeds patrolMaxTurnsLimit.
const (
	patrolMinTurns          = 20
	patrolMaxTurnsLimit     = 80
	patrolTurnsPer50Devices = 5
)

// Turn budget bounds used instead of the standard ones when the patrol scope
// requests quick depth.
const (
	patrolQuickMinTurns = 10
	patrolQuickMaxTurns = 30
)

// Progressively smaller seed-context budgets.
// NOTE(review): presumably token counts consumed by patrolSeedRetryBudgets
// (defined elsewhere) when a provider reports a context-window error — confirm.
const (
	patrolRetrySeedBudget1 = 16_000
	patrolRetrySeedBudget2 = 8_000
	patrolRetrySeedBudget3 = 4_000
)
// patrolContextWindowPatterns holds case-insensitive expressions that each
// capture, in group 1, a decimal number following a context-size phrase
// ("n_ctx", "available context size", "maximum context length",
// "context window", "context length"). Order is preserved from most to least
// specific; all patterns are compiled once at package initialisation.
var patrolContextWindowPatterns = func() []*regexp.Regexp {
	sources := []string{
		`(?i)"n_ctx"\s*:\s*(\d+)`,
		`(?i)available context size\s*\((\d+)\s*tokens?\)`,
		`(?i)maximum context length[^0-9]*(\d+)`,
		`(?i)context window[^0-9]*(\d+)`,
		`(?i)context length[^0-9]*(\d+)`,
	}
	compiled := make([]*regexp.Regexp, 0, len(sources))
	for _, src := range sources {
		compiled = append(compiled, regexp.MustCompile(src))
	}
	return compiled
}()
// CleanThinkingTokens removes model-specific thinking markers from AI responses.
// Different AI models use different markers for their internal reasoning:
//   - DeepSeek: <end▁of▁thinking>, optionally wrapped in fullwidth (U+FF5C) or
//     ASCII pipes
//   - DeepSeek: <DSML...> internal function call format (hallucinated tool
//     calls), with fullwidth-pipe, ASCII-pipe, or bare spellings
//   - Generic: <think>...</think>, <thought>...</thought>
//   - Reasoning: <|reasoning|>...<|/reasoning|>
//
// The cleanup runs in phases: truncate at the first DeepSeek function-call
// marker, strip whole thinking blocks (case-insensitively), drop any line that
// still carries an orphaned end marker, drop paragraphs that look like
// internal reasoning, and finally collapse runs of blank lines. The result is
// trimmed of surrounding whitespace. An empty input is returned unchanged.
//
// This function is exported so it can be used by both patrol and chat responses.
func CleanThinkingTokens(content string) string {
	if content == "" {
		return content
	}

	// Phase 0: Remove DeepSeek internal function call format leakage.
	// When DeepSeek doesn't properly use the function calling API, it may output
	// its internal markup (function_calls, invoke, ...). The DSML tag can be
	// wrapped in fullwidth pipes (U+FF5C), ASCII pipes, or nothing at all.
	// The fullwidth pipe is written as an escape so the source stays free of
	// visually ambiguous characters.
	deepseekFunctionMarkers := []string{
		"<\uFF5CDSML\uFF5C",  // Fullwidth pipe variant (opening)
		"</\uFF5CDSML\uFF5C", // Fullwidth pipe variant (closing)
		"<\uFF5C/DSML\uFF5C", // Alternative fullwidth closing
		"<|DSML|",            // ASCII pipe variant (opening)
		"</|DSML|",           // ASCII pipe variant (closing)
		"<|/DSML|",           // Alternative ASCII closing
		"<DSML",              // Bare variant (opening)
		"</DSML",             // Bare variant (closing)
	}
	for _, marker := range deepseekFunctionMarkers {
		// DeepSeek function call blocks typically appear at the end of
		// responses, so everything from the marker onwards is discarded.
		if idx := strings.Index(content, marker); idx >= 0 {
			content = strings.TrimSpace(content[:idx])
		}
	}

	// asciiLower lowercases only A-Z, byte for byte. Unlike strings.ToLower it
	// never changes the byte length (ToLower shrinks U+212A KELVIN SIGN to "k"
	// and expands invalid UTF-8 to U+FFFD), so offsets found in the folded
	// copy are valid offsets into the original string.
	asciiLower := func(s string) string {
		b := []byte(s)
		for i, c := range b {
			if 'A' <= c && c <= 'Z' {
				b[i] = c + ('a' - 'A')
			}
		}
		return string(b)
	}

	// Phase 1: Remove entire block-level tags (opening + content + closing).
	// Case-insensitive matching via an ASCII-folded copy of equal length.
	type blockTag struct {
		open  string
		close string
	}
	blockTags := []blockTag{
		{"<think>", "</think>"},
		{"<thought>", "</thought>"},
		{"<|reasoning|>", "<|/reasoning|>"},
	}
	for _, bt := range blockTags {
		lower := asciiLower(content)
		for {
			openIdx := strings.Index(lower, bt.open)
			if openIdx < 0 {
				break
			}
			bodyStart := openIdx + len(bt.open)
			closeIdx := strings.Index(lower[bodyStart:], bt.close)
			if closeIdx < 0 {
				// Unclosed block — remove from open tag to end. This was the
				// first open tag, so nothing is left to scan for this pair.
				content = content[:openIdx]
				break
			}
			end := bodyStart + closeIdx + len(bt.close)
			// Apply the identical cut to both strings so they stay aligned.
			content = content[:openIdx] + content[end:]
			lower = lower[:openIdx] + lower[end:]
		}
	}

	// Phase 2: Remove line-level end markers (DeepSeek and remaining close tags).
	thinkingMarkers := []string{
		"<\uFF5Cend\u2581of\u2581thinking\uFF5C>", // DeepSeek fullwidth pipe variant
		"<end▁of▁thinking>",                       // DeepSeek Unicode variant
		"<|end_of_thinking|>",                     // ASCII variant
		"<|end▁of▁thinking|>",                     // Mixed variant
		"</think>",                                // Generic thinking block end
		"</thought>",                              // Thought block end
		"<|/reasoning|>",                          // Reasoning block end
	}
	for _, marker := range thinkingMarkers {
		for {
			idx := strings.Index(content, marker)
			if idx < 0 {
				break
			}
			// Find start of the line containing the marker. When the marker
			// is on the first line there is no preceding newline.
			lineStart := strings.LastIndex(content[:idx], "\n")
			if lineStart < 0 {
				lineStart = 0
			}
			// Find end of the line containing the marker.
			markerEnd := idx + len(marker)
			lineEnd := len(content)
			if rel := strings.Index(content[markerEnd:], "\n"); rel >= 0 {
				lineEnd = markerEnd + rel
			}
			// Remove the entire line containing the marker.
			content = content[:lineStart] + content[lineEnd:]
		}
	}

	// Phase 3: Remove lines that look like internal reasoning.
	// A reasoning paragraph is a blank line followed by a line starting with
	// one of reasoningPrefixes; it is skipped until a line that looks like
	// real content (markdown header, finding, bold text, list item) appears.
	contentPrefixes := []string{"#", "[FINDING]", "**", "-", "1."}
	reasoningPrefixes := []string{
		"Now, ", "Let's ", "Let me ", "I should ",
		"I'll ", "I need to ", "Checking ", "Looking at ",
	}
	hasAnyPrefix := func(s string, prefixes []string) bool {
		for _, prefix := range prefixes {
			if strings.HasPrefix(s, prefix) {
				return true
			}
		}
		return false
	}

	lines := strings.Split(content, "\n")
	cleanedLines := make([]string, 0, len(lines))
	skipUntilContent := false
	for i, line := range lines {
		trimmed := strings.TrimSpace(line)
		if skipUntilContent {
			// Resume only when we hit actual content.
			if !hasAnyPrefix(trimmed, contentPrefixes) {
				continue
			}
			skipUntilContent = false
		}
		// Detect reasoning patterns (they follow an empty line); the empty
		// line itself is dropped together with the paragraph.
		if trimmed == "" && i+1 < len(lines) &&
			hasAnyPrefix(strings.TrimSpace(lines[i+1]), reasoningPrefixes) {
			skipUntilContent = true
			continue
		}
		cleanedLines = append(cleanedLines, line)
	}

	// Clean up excessive blank lines.
	content = strings.Join(cleanedLines, "\n")
	for strings.Contains(content, "\n\n\n") {
		content = strings.ReplaceAll(content, "\n\n\n", "\n\n")
	}
	return strings.TrimSpace(content)
}
// runAIAnalysisState uses the agentic tool-driven approach to analyze infrastructure.
// The LLM investigates using MCP tools and reports findings via patrol_report_finding.
// An optional scope focuses the patrol on specific resources.
//
// The run proceeds in order: budget pre-flight, guest intelligence gathering,
// deterministic triage (a quiet result returns early without calling the LLM),
// seed-context assembly, the streamed agentic loop (retried with smaller seed
// budgets on context-window errors), thinking-token cleanup, a guaranteed
// investigation tool call, deterministic signal detection with an optional
// evaluation pass, and a deterministic fallback that creates findings for any
// signals still unmatched.
//
// It returns an error when the AI service, chat service, or tool executor is
// unavailable, when the budget check fails, or when the agentic loop fails.
// NOTE(review): it returns (nil, nil) when the assembled seed context is blank —
// confirm every caller handles a nil result with a nil error.
func (p *PatrolService) runAIAnalysisState(ctx context.Context, snap patrolRuntimeState, scope *PatrolScope, executionID string) (*AIAnalysisResult, error) {
if p.aiService == nil {
return nil, fmt.Errorf("Pulse Patrol service not available")
}
// Pre-flight budget check: fail fast before building context or acquiring a chat service
if err := p.aiService.CheckBudget("patrol"); err != nil {
log.Warn().Err(err).Msg("AI Patrol: Budget exceeded, skipping analysis")
return nil, fmt.Errorf("patrol skipped: %w", err)
}
// Gather guest intelligence (discovery + reachability) before building seed context.
// The gathering step is bounded to five seconds via its own derived context.
intelCtx, intelCancel := context.WithTimeout(ctx, 5*time.Second)
guestIntel := p.gatherGuestIntelligence(intelCtx)
intelCancel()
// Phase 1: Deterministic triage
triageResult := p.runDeterministicTriageState(ctx, snap, scope, guestIntel)
log.Info().
Int("flags", len(triageResult.Flags)).
Bool("quiet", triageResult.IsQuiet).
Int("flagged_resources", len(triageResult.FlaggedIDs)).
Msg("AI Patrol: Triage complete")
metrics := GetPatrolMetrics()
metrics.RecordTriageFlags(len(triageResult.Flags))
if triageResult.IsQuiet {
metrics.RecordTriageQuiet()
}
// Quiet infrastructure: skip LLM entirely
if triageResult.IsQuiet {
log.Info().Msg("AI Patrol: Infrastructure quiet, skipping LLM analysis")
return &AIAnalysisResult{
Response: "Infrastructure healthy — deterministic triage found no issues.",
TriageFlags: 0,
TriageSkippedLLM: true,
}, nil
}
// Phase 2: Build focused seed context from triage results
seedSections, seededFindingIDs := p.buildTriageSeedSectionsState(triageResult, snap, scope, guestIntel)
seedBudget := p.calculateSeedBudget()
seedContext := p.assembleSeedWithinBudget(seedSections, seedBudget)
// Nothing to send to the model: return no result and no error.
if strings.TrimSpace(seedContext) == "" {
return nil, nil
}
log.Info().
Int("seed_context_chars", len(seedContext)).
Int("seed_context_estimated_tokens", chat.EstimateTokens(seedContext)).
Msg("AI Patrol: Triage seed context built")
log.Debug().Msg("AI Patrol: Starting agentic patrol analysis")
maxTurns := computeTriageMaxTurns(len(triageResult.Flags), scope)
// Generate an execution ID when the caller did not supply one.
if strings.TrimSpace(executionID) == "" {
executionID = uuid.NewString()
}
log.Debug().
Int("triage_flags", len(triageResult.Flags)).
Int("max_turns", maxTurns).
Msg("AI Patrol: Calculated agentic max turns")
// Determine whether to skip streaming updates (verification runs are consumed
// programmatically and must not interleave with a concurrent normal patrol's stream).
noStream := scope != nil && scope.NoStream
// Start streaming phase
if !noStream {
p.setStreamPhase("analyzing")
p.broadcast(PatrolStreamEvent{Type: "start"})
}
// Create finding creator adapter
adapter := newPatrolFindingCreatorAdapterState(p, snap)
// Get chat service and set the finding creator on the executor.
// Each failure path below resets the stream phase to "idle" before returning.
cs := p.aiService.GetChatService()
if cs == nil {
if !noStream {
p.setStreamPhase("idle")
}
return nil, fmt.Errorf("chat service not available")
}
// Type-assert to get executor access
executorAccessor, ok := cs.(chatServiceExecutorAccessor)
if !ok {
if !noStream {
p.setStreamPhase("idle")
}
return nil, fmt.Errorf("chat service does not support executor access")
}
executor := executorAccessor.GetExecutor()
if executor == nil {
if !noStream {
p.setStreamPhase("idle")
}
return nil, fmt.Errorf("tool executor not available")
}
// Set the patrol finding creator for this run
executor.SetPatrolFindingCreator(adapter)
defer executor.SetPatrolFindingCreator(nil) // Clear after run
// Execute the agentic patrol loop
var inputTokens, outputTokens int
// patrolStreamAttempt captures everything one streamed attempt produced:
// the provider response, the final text, the truncated tool call records,
// and the untruncated tool outputs in the same order as toolCalls.
type patrolStreamAttempt struct {
response *PatrolStreamResponse
finalContent string
toolCalls []ToolCallRecord
rawToolOutputs []string
}
// executePatrol runs one streamed patrol attempt with the given prompt.
// All per-attempt state is local to the closure, so a retry starts clean.
executePatrol := func(prompt string) (*patrolStreamAttempt, error) {
var contentBuffer strings.Builder
var toolCallsMu sync.Mutex
pendingToolCalls := make(map[string]ToolCallRecord)
var pendingToolOrder []string
anonToolCounter := 0
var completedToolCalls []ToolCallRecord
var rawToolOutputs []string
chatResp, chatErr := cs.ExecutePatrolStream(ctx, PatrolExecuteRequest{
Prompt: prompt,
SystemPrompt: p.getPatrolSystemPromptForTriage(),
SessionID: "patrol-main",
ExecutionID: executionID,
UseCase: "patrol",
MaxTurns: maxTurns,
}, func(event ChatStreamEvent) {
switch event.Type {
case "content":
// Accumulate model text locally and mirror it to the live stream.
var contentData struct {
Text string `json:"text"`
}
if json.Unmarshal(event.Data, &contentData) == nil && contentData.Text != "" {
contentBuffer.WriteString(contentData.Text)
if !noStream {
p.appendStreamContent(contentData.Text)
}
}
case "thinking":
// Thinking text is only broadcast; it is not added to contentBuffer.
var thinkingData struct {
Text string `json:"text"`
}
if json.Unmarshal(event.Data, &thinkingData) == nil && thinkingData.Text != "" {
if !noStream {
p.broadcast(PatrolStreamEvent{
Type: "thinking",
Content: thinkingData.Text,
})
}
}
case "tool_start":
var data struct {
ID string `json:"id"`
Name string `json:"name"`
Input string `json:"input"`
RawInput string `json:"raw_input"`
}
if json.Unmarshal(event.Data, &data) == nil {
// Synthesize an ID for tool calls that arrive without one.
if data.ID == "" {
anonToolCounter++
data.ID = fmt.Sprintf("patrol-anon-%d", anonToolCounter)
}
if !noStream {
p.broadcast(PatrolStreamEvent{
Type: "tool_start",
ToolID: data.ID,
ToolName: data.Name,
ToolInput: data.Input,
ToolRawInput: data.RawInput,
})
}
// Prefer the raw input over the formatted one when recording the call.
input := data.Input
if data.RawInput != "" {
input = data.RawInput
}
toolCallsMu.Lock()
pendingToolOrder = append(pendingToolOrder, data.ID)
pendingToolCalls[data.ID] = ToolCallRecord{
ID: data.ID,
ToolName: data.Name,
Input: truncateString(input, MaxToolInputSize),
StartTime: time.Now().UnixMilli(),
}
toolCallsMu.Unlock()
}
case "tool_end":
var data struct {
ID string `json:"id"`
Name string `json:"name"`
Input string `json:"input"`
RawInput string `json:"raw_input"`
Output string `json:"output"`
Success bool `json:"success"`
}
if json.Unmarshal(event.Data, &data) == nil {
// An end event without an ID is paired with the oldest pending start;
// an end event with an ID is removed from the pending order list.
// NOTE(review): pendingToolOrder is read and mutated here without holding
// toolCallsMu, although tool_start appends to it under the lock — confirm
// the stream callback is never invoked concurrently.
if data.ID == "" {
if len(pendingToolOrder) > 0 {
data.ID = pendingToolOrder[0]
pendingToolOrder = pendingToolOrder[1:]
} else {
anonToolCounter++
data.ID = fmt.Sprintf("patrol-anon-end-%d", anonToolCounter)
}
} else if len(pendingToolOrder) > 0 {
for i, id := range pendingToolOrder {
if id == data.ID {
pendingToolOrder = append(pendingToolOrder[:i], pendingToolOrder[i+1:]...)
break
}
}
}
if !noStream {
success := data.Success
p.broadcast(PatrolStreamEvent{
Type: "tool_end",
ToolID: data.ID,
ToolName: data.Name,
ToolInput: data.Input,
ToolRawInput: data.RawInput,
ToolOutput: data.Output,
ToolSuccess: &success,
})
}
toolCallsMu.Lock()
if pending, ok := pendingToolCalls[data.ID]; ok {
// Matching start seen: complete the pending record with timing and output.
now := time.Now().UnixMilli()
input := data.Input
if data.RawInput != "" {
input = data.RawInput
}
if input != "" {
pending.Input = truncateString(input, MaxToolInputSize)
}
pending.Output = truncateString(data.Output, MaxToolOutputSize)
pending.Success = data.Success
pending.EndTime = now
pending.Duration = now - pending.StartTime
completedToolCalls = append(completedToolCalls, pending)
rawToolOutputs = append(rawToolOutputs, data.Output)
delete(pendingToolCalls, data.ID)
} else {
// No matching start: record a zero-duration call so the output is not lost.
now := time.Now().UnixMilli()
input := data.Input
if data.RawInput != "" {
input = data.RawInput
}
completedToolCalls = append(completedToolCalls, ToolCallRecord{
ID: data.ID,
ToolName: data.Name,
Input: truncateString(input, MaxToolInputSize),
Output: truncateString(data.Output, MaxToolOutputSize),
Success: data.Success,
StartTime: now,
EndTime: now,
Duration: 0,
})
rawToolOutputs = append(rawToolOutputs, data.Output)
}
toolCallsMu.Unlock()
}
}
})
if chatErr != nil {
return nil, chatErr
}
// Fall back to the streamed text when the response carries no content.
finalContent := chatResp.Content
if finalContent == "" {
finalContent = contentBuffer.String()
}
// Return copies so the attempt does not share backing arrays with the callback state.
toolCallsMu.Lock()
collectedToolCalls := append([]ToolCallRecord(nil), completedToolCalls...)
collectedRawOutputs := append([]string(nil), rawToolOutputs...)
toolCallsMu.Unlock()
return &patrolStreamAttempt{
response: chatResp,
finalContent: finalContent,
toolCalls: collectedToolCalls,
rawToolOutputs: collectedRawOutputs,
}, nil
}
attempt, chatErr := executePatrol(seedContext)
// On a context-window error, retry with each strictly smaller seed budget
// until an attempt succeeds or fails for a different reason. Budgets that do
// not shrink the seed, or that produce an empty or identical seed, are skipped.
if chatErr != nil && isPatrolContextWindowError(chatErr) {
for _, retryBudget := range patrolSeedRetryBudgets(chatErr) {
if retryBudget >= seedBudget {
continue
}
retrySeedContext := p.assembleSeedWithinBudget(seedSections, retryBudget)
if strings.TrimSpace(retrySeedContext) == "" || retrySeedContext == seedContext {
continue
}
log.Warn().
Int("previous_seed_budget_tokens", seedBudget).
Int("retry_seed_budget_tokens", retryBudget).
Int("previous_seed_tokens", chat.EstimateTokens(seedContext)).
Int("retry_seed_tokens", chat.EstimateTokens(retrySeedContext)).
Msg("AI Patrol: Retrying patrol analysis with tighter provider-derived seed budget")
seedBudget = retryBudget
seedContext = retrySeedContext
attempt, chatErr = executePatrol(seedContext)
if chatErr == nil {
break
}
if !isPatrolContextWindowError(chatErr) {
break
}
}
}
if chatErr != nil {
if !noStream {
p.setStreamPhase("idle")
p.broadcast(PatrolStreamEvent{Type: "error", Content: chatErr.Error()})
}
return nil, fmt.Errorf("agentic patrol failed: %w", chatErr)
}
finalContent := attempt.finalContent
inputTokens = attempt.response.InputTokens
outputTokens = attempt.response.OutputTokens
p.recordPatrolUsage(attempt.response.InputTokens, attempt.response.OutputTokens)
// Clean thinking tokens
finalContent = CleanThinkingTokens(finalContent)
log.Debug().
Int("input_tokens", inputTokens).
Int("output_tokens", outputTokens).
Int("findings_created", len(adapter.getCollectedFindings())).
Int("findings_resolved", adapter.getResolvedCount()).
Msg("AI Patrol: Agentic patrol analysis complete")
// A fresh mutex guards the post-run copies, which ensureInvestigationToolCall
// may extend with a fallback health query.
var toolCallsMu sync.Mutex
completedToolCalls := append([]ToolCallRecord(nil), attempt.toolCalls...)
rawToolOutputs := append([]string(nil), attempt.rawToolOutputs...)
p.ensureInvestigationToolCall(ctx, executor, &toolCallsMu, &completedToolCalls, &rawToolOutputs, noStream)
// Broadcast completion
if !noStream {
p.broadcast(PatrolStreamEvent{
Type: "complete",
Tokens: outputTokens,
})
p.setStreamPhase("idle")
}
// Collect completed tool calls. Signal detection gets copies whose Output is
// the untruncated raw output whenever one was recorded at the same index.
toolCallsMu.Lock()
collectedToolCalls := completedToolCalls
signalToolCalls := make([]ToolCallRecord, len(collectedToolCalls))
for i, tc := range collectedToolCalls {
signalToolCalls[i] = tc
if i < len(rawToolOutputs) && rawToolOutputs[i] != "" {
signalToolCalls[i].Output = rawToolOutputs[i]
}
}
toolCallsMu.Unlock()
// --- Deterministic signal detection + evaluation pass ---
// Build signal thresholds from user config so detection aligns with alert settings
p.mu.RLock()
sigThresholds := SignalThresholdsFromPatrol(p.thresholds)
p.mu.RUnlock()
detectedSignals := DetectSignals(signalToolCalls, sigThresholds)
// Merge reachability signals from pre-patrol guest probing
reachabilitySignals := DetectReachabilitySignals(guestIntel)
detectedSignals = append(detectedSignals, reachabilitySignals...)
if len(detectedSignals) > 0 {
log.Info().
Int("detected_signals", len(detectedSignals)).
Msg("AI Patrol: Deterministic signal detection found signals")
unmatchedSignals := UnmatchedSignals(detectedSignals, adapter.getCollectedFindings())
if len(unmatchedSignals) > 0 {
log.Warn().
Int("unmatched_signals", len(unmatchedSignals)).
Msg("AI Patrol: Unmatched signals found, running evaluation pass")
// An evaluation-pass failure is logged and tolerated; the deterministic
// fallback below still runs.
evalResp, evalErr := p.runEvaluationPass(ctx, adapter, unmatchedSignals, executionID)
if evalErr != nil {
log.Warn().Err(evalErr).Msg("AI Patrol: Evaluation pass failed")
} else if evalResp != nil {
inputTokens += evalResp.InputTokens
outputTokens += evalResp.OutputTokens
log.Info().
Int("eval_input_tokens", evalResp.InputTokens).
Int("eval_output_tokens", evalResp.OutputTokens).
Int("total_findings", len(adapter.getCollectedFindings())).
Msg("AI Patrol: Evaluation pass completed")
}
// Deterministic fallback: if unmatched signals remain, create findings directly.
remaining := UnmatchedSignals(detectedSignals, adapter.getCollectedFindings())
if len(remaining) > 0 {
created := p.createFindingsFromSignals(adapter, remaining)
if created > 0 {
log.Info().
Int("created", created).
Int("remaining", len(remaining)).
Msg("AI Patrol: Created deterministic findings for unmatched signals")
}
}
} else {
log.Debug().
Int("detected_signals", len(detectedSignals)).
Msg("AI Patrol: All detected signals already matched by findings")
}
}
// Findings were already created via tool calls — collect them
adapter.findingsMu.Lock()
rejectedCount := adapter.rejectedCount
adapter.findingsMu.Unlock()
return &AIAnalysisResult{
Response: finalContent,
Findings: adapter.getCollectedFindings(),
RejectedFindings: rejectedCount,
TriageFlags: len(triageResult.Flags),
TriageSkippedLLM: false,
InputTokens: inputTokens,
OutputTokens: outputTokens,
ToolCalls: collectedToolCalls,
ReportedIDs: adapter.getReportedFindingIDs(),
ResolvedIDs: adapter.getResolvedIDs(),
SeededFindingIDs: seededFindingIDs,
}, nil
}
// computePatrolMaxTurns derives the agentic turn budget from the number of
// monitored resources. The budget starts at the depth-specific minimum, grows
// by patrolTurnsPer50Devices for each full group of 50 resources, and is
// clamped to the depth-specific [minimum, maximum] range. A quick-depth scope
// selects the smaller quick bounds; a nil scope uses the standard bounds.
func computePatrolMaxTurns(resourceCount int, scope *PatrolScope) int {
	lower, upper := patrolMinTurns, patrolMaxTurnsLimit
	if scope != nil && scope.Depth == PatrolDepthQuick {
		lower, upper = patrolQuickMinTurns, patrolQuickMaxTurns
	}

	budget := lower + (resourceCount/50)*patrolTurnsPer50Devices
	switch {
	case budget < lower:
		// Only reachable for a negative resourceCount.
		return lower
	case budget > upper:
		return upper
	default:
		return budget
	}
}
// computeTriageMaxTurns derives the agentic turn budget for a triage-seeded
// patrol: a base of 5 turns plus 3 per triage flag, clamped to [8, 40].
// A quick-depth scope additionally caps the result at 20 turns.
func computeTriageMaxTurns(flagCount int, scope *PatrolScope) int {
	const (
		triageBaseTurns    = 5
		triageTurnsPerFlag = 3
		triageMinTurns     = 8
		triageMaxTurns     = 40
		quickDepthCap      = 20
	)

	budget := triageBaseTurns + flagCount*triageTurnsPerFlag
	switch {
	case budget < triageMinTurns:
		budget = triageMinTurns
	case budget > triageMaxTurns:
		budget = triageMaxTurns
	}

	quick := scope != nil && scope.Depth == PatrolDepthQuick
	if quick && budget > quickDepthCap {
		budget = quickDepthCap
	}
	return budget
}
// ensureInvestigationToolCall guarantees that the recorded tool calls contain
// at least one investigation tool (see isInvestigationTool). When none is
// present it executes a fallback "pulse_query" health check through the
// executor, optionally broadcasts matching tool_start/tool_end stream events,
// and appends the call record and its raw output to the caller's slices.
//
// toolCallsMu guards both slices; it is held only while they are read or
// appended to, never while the tool executes. A nil executor makes this a
// no-op. Tool execution errors are recorded as an unsuccessful call whose
// output is the error text.
func (p *PatrolService) ensureInvestigationToolCall(
	ctx context.Context,
	executor *tools.PulseToolExecutor,
	toolCallsMu *sync.Mutex,
	completedToolCalls *[]ToolCallRecord,
	rawToolOutputs *[]string,
	noStream bool,
) {
	if executor == nil {
		return
	}

	toolCallsMu.Lock()
	alreadyInvestigated := false
	for i := range *completedToolCalls {
		if isInvestigationTool((*completedToolCalls)[i].ToolName) {
			alreadyInvestigated = true
			break
		}
	}
	toolCallsMu.Unlock()
	if alreadyInvestigated {
		return
	}

	const fallbackTool = "pulse_query"
	args := map[string]interface{}{"action": "health"}
	// Marshalling a fixed string-to-string map is not expected to fail, so
	// the error is deliberately ignored.
	encodedArgs, _ := json.Marshal(args)
	rawInput := string(encodedArgs)
	callID := fmt.Sprintf("patrol-fallback-%d", time.Now().UnixNano())
	startedAt := time.Now().UnixMilli()

	if !noStream {
		p.broadcast(PatrolStreamEvent{
			Type:         "tool_start",
			ToolID:       callID,
			ToolName:     fallbackTool,
			ToolInput:    rawInput,
			ToolRawInput: rawInput,
		})
	}

	var (
		output  string
		success bool
	)
	result, err := executor.ExecuteTool(ctx, fallbackTool, args)
	if err == nil {
		output = formatToolResult(result)
		success = !result.IsError
	} else {
		output = err.Error()
	}
	finishedAt := time.Now().UnixMilli()

	if !noStream {
		p.broadcast(PatrolStreamEvent{
			Type:         "tool_end",
			ToolID:       callID,
			ToolName:     fallbackTool,
			ToolInput:    rawInput,
			ToolRawInput: rawInput,
			ToolOutput:   output,
			ToolSuccess:  &success,
		})
	}

	record := ToolCallRecord{
		ID:        callID,
		ToolName:  fallbackTool,
		Input:     truncateString(rawInput, MaxToolInputSize),
		Output:    truncateString(output, MaxToolOutputSize),
		Success:   success,
		StartTime: startedAt,
		EndTime:   finishedAt,
		Duration:  finishedAt - startedAt,
	}

	toolCallsMu.Lock()
	defer toolCallsMu.Unlock()
	*completedToolCalls = append(*completedToolCalls, record)
	*rawToolOutputs = append(*rawToolOutputs, output)
}
// isInvestigationTool reports whether name is one of the read-only
// investigation tools: pulse_query, pulse_metrics, pulse_storage, or
// pulse_read. The comparison is exact and case-sensitive.
func isInvestigationTool(name string) bool {
	investigationTools := [...]string{
		"pulse_query",
		"pulse_metrics",
		"pulse_storage",
		"pulse_read",
	}
	for _, candidate := range investigationTools {
		if name == candidate {
			return true
		}
	}
	return false
}
// formatToolResult flattens a tool result into plain text. It keeps only the
// content items whose type is "text" and whose text is non-empty, and joins
// them with newlines in their original order. A result with no such items
// yields the empty string.
func formatToolResult(result tools.CallToolResult) string {
	var parts []string
	for _, item := range result.Content {
		if item.Type != "text" || item.Text == "" {
			continue
		}
		parts = append(parts, item.Text)
	}
	return strings.Join(parts, "\n")
}
// runEvaluationPass issues a short, second LLM call (at most 5 turns, session
// "patrol-eval") asking the model to judge signals that the main patrol pass
// detected but did not turn into findings. Stream events from this call are
// discarded. On success the token usage is recorded via recordPatrolUsage and
// the provider response is returned.
//
// It returns an error when no chat service is available, when the patrol
// budget check fails, or when the streamed execution itself fails.
// The adapter parameter is not used inside this function.
func (p *PatrolService) runEvaluationPass(ctx context.Context, adapter *patrolFindingCreatorAdapter, unmatchedSignals []DetectedSignal, executionID string) (*PatrolStreamResponse, error) {
	chatSvc := p.aiService.GetChatService()
	if chatSvc == nil {
		return nil, fmt.Errorf("chat service not available for evaluation pass")
	}

	if budgetErr := p.aiService.CheckBudget("patrol"); budgetErr != nil {
		log.Warn().Err(budgetErr).Msg("AI Patrol: Budget exceeded, skipping evaluation pass")
		return nil, fmt.Errorf("patrol evaluation skipped: %w", budgetErr)
	}

	request := PatrolExecuteRequest{
		Prompt:       buildEvalUserPrompt(unmatchedSignals),
		SystemPrompt: buildEvalSystemPrompt(),
		SessionID:    "patrol-eval",
		ExecutionID:  executionID,
		UseCase:      "patrol",
		MaxTurns:     5,
	}

	log.Info().
		Int("unmatched_signals", len(unmatchedSignals)).
		Msg("AI Patrol: Running evaluation pass for unmatched signals")

	// The evaluation pass is not streamed to the frontend, so the event
	// callback intentionally does nothing.
	ignoreEvents := func(ChatStreamEvent) {}

	evalResp, execErr := chatSvc.ExecutePatrolStream(ctx, request, ignoreEvents)
	if execErr != nil {
		log.Warn().Err(execErr).Msg("AI Patrol: Evaluation pass failed")
		return nil, execErr
	}

	log.Info().
		Int("input_tokens", evalResp.InputTokens).
		Int("output_tokens", evalResp.OutputTokens).
		Msg("AI Patrol: Evaluation pass complete")
	p.recordPatrolUsage(evalResp.InputTokens, evalResp.OutputTokens)
	return evalResp, nil
}
// recordPatrolUsage writes a "patrol" usage event with the given token counts
// to the AI service's cost store. It is a no-op when the receiver, the AI
// service, or the cost store is nil, or when neither count is positive.
//
// The recorded model is the configured patrol model, falling back to the chat
// model. The provider name is the lowercased text before the first ':' in the
// model string; when that is empty, the active provider's name is used.
func (p *PatrolService) recordPatrolUsage(inputTokens, outputTokens int) {
	if p == nil || p.aiService == nil {
		return
	}
	if inputTokens <= 0 && outputTokens <= 0 {
		return
	}

	// Snapshot the fields under the read lock, then work on the copies.
	svc := p.aiService
	svc.mu.RLock()
	store, cfg, provider := svc.costStore, svc.cfg, svc.provider
	svc.mu.RUnlock()
	if store == nil {
		return
	}

	var model string
	if cfg != nil {
		if model = strings.TrimSpace(cfg.GetPatrolModel()); model == "" {
			model = strings.TrimSpace(cfg.GetChatModel())
		}
	}

	var providerName string
	if sep := strings.Index(model, ":"); sep >= 0 {
		providerName = strings.TrimSpace(strings.ToLower(model[:sep]))
	}
	if providerName == "" && provider != nil {
		providerName = strings.TrimSpace(strings.ToLower(provider.Name()))
	}

	event := cost.UsageEvent{
		Timestamp:    time.Now(),
		Provider:     providerName,
		RequestModel: model,
		UseCase:      "patrol",
		InputTokens:  inputTokens,
		OutputTokens: outputTokens,
	}
	store.Record(event)
}
// buildEvalSystemPrompt returns the fixed system prompt used by the
// evaluation pass. It tells the model to review already-collected signals
// with the two patrol finding tools only, without further investigation.
func buildEvalSystemPrompt() string {
	const evalSystemPrompt = `You are a patrol evaluation agent reviewing infrastructure signals that were
detected but not reported as findings.
Tools: patrol_report_finding, patrol_get_findings
Instructions:
1. Call patrol_get_findings to check what already exists.
2. For each signal below, determine if it is a genuine issue requiring attention.
3. If yes, call patrol_report_finding with complete details.
4. If not actionable or already covered by an existing finding, skip it.
5. Do NOT investigate further — use only the evidence provided below.`
	return evalSystemPrompt
}
// buildEvalUserPrompt renders the unmatched signals as the user prompt for the
// evaluation pass: a two-line preamble followed by one markdown section per
// signal (numbered from 1) listing its resource, suggested severity,
// category, summary, and fenced evidence.
func buildEvalUserPrompt(signals []DetectedSignal) string {
	var prompt strings.Builder
	prompt.WriteString("The following infrastructure signals were detected during patrol but were not reported as findings.\n")
	prompt.WriteString("Review each one and report genuine issues using patrol_report_finding.\n\n")

	for idx, sig := range signals {
		fmt.Fprintf(&prompt, "## Signal %d: %s\n", idx+1, sig.SignalType)
		fmt.Fprintf(&prompt, "- **Resource**: %s (ID: %s, Type: %s)\n", sig.ResourceName, sig.ResourceID, sig.ResourceType)
		fmt.Fprintf(&prompt, "- **Suggested Severity**: %s\n", sig.SuggestedSeverity)
		fmt.Fprintf(&prompt, "- **Category**: %s\n", sig.Category)
		fmt.Fprintf(&prompt, "- **Summary**: %s\n", sig.Summary)
		fmt.Fprintf(&prompt, "- **Evidence**: ```\n%s\n```\n\n", sig.Evidence)
	}
	return prompt.String()
}
// createFindingsFromSignals is the deterministic fallback that turns each
// signal into a finding through the adapter, without involving the LLM.
// Missing fields are defaulted before creation: resource name falls back to
// the resource ID, resource type is inferred, category defaults to "general",
// severity to "warning", and title/description to the signal summary.
//
// It returns the number of findings the adapter accepted; signals whose
// creation fails are skipped. A nil adapter or empty signal list yields 0.
func (p *PatrolService) createFindingsFromSignals(adapter *patrolFindingCreatorAdapter, signals []DetectedSignal) int {
	if adapter == nil || len(signals) == 0 {
		return 0
	}

	accepted := 0
	for _, sig := range signals {
		finding := signalToFindingInput(sig)

		// The name must be defaulted first: type inference reads it.
		if finding.ResourceName == "" {
			finding.ResourceName = finding.ResourceID
		}
		if finding.ResourceType == "" {
			finding.ResourceType = inferFindingResourceType(finding.ResourceID, finding.ResourceName)
		}
		if finding.Category == "" {
			finding.Category = "general"
		}
		if finding.Severity == "" {
			finding.Severity = "warning"
		}
		if finding.Recommendation == "" {
			finding.Recommendation = defaultRecommendationForSignal(sig)
		}
		if finding.Title == "" {
			finding.Title = sig.Summary
		}
		if finding.Description == "" {
			finding.Description = sig.Summary
		}

		if _, _, err := adapter.CreateFinding(finding); err != nil {
			continue
		}
		accepted++
	}
	return accepted
}
// signalToFindingInput maps a detected signal onto the finding input accepted
// by the patrol finding tool. The key and title are derived from the signal
// type (signalKey, signalTitle); severity, category, resource fields,
// description (the signal summary), and evidence are copied through.
// The recommendation is left empty.
func signalToFindingInput(s DetectedSignal) tools.PatrolFindingInput {
	return tools.PatrolFindingInput{
		Key:          signalKey(s),
		Severity:     s.SuggestedSeverity,
		Category:     s.Category,
		ResourceID:   s.ResourceID,
		ResourceName: s.ResourceName,
		ResourceType: s.ResourceType,
		Title:        signalTitle(s),
		Description:  s.Summary,
		Evidence:     s.Evidence,
	}
}
// signalKey returns the stable finding key for a signal's type, or
// "deterministic-signal" for any type without a dedicated key.
func signalKey(s DetectedSignal) string {
	key := "deterministic-signal"
	switch s.SignalType {
	case SignalSMARTFailure:
		key = "smart-failure"
	case SignalHighCPU:
		key = "cpu-high"
	case SignalHighMemory:
		key = "memory-high"
	case SignalHighDisk:
		key = "disk-high"
	case SignalBackupFailed:
		key = "backup-failed"
	case SignalBackupStale:
		key = "backup-stale"
	case SignalActiveAlert:
		key = "active-alert"
	case SignalGuestUnreachable:
		key = "guest-unreachable"
	}
	return key
}
// signalTitle returns the human-readable finding title for a signal's type.
// Only the guest-unreachable title includes the resource name; unknown types
// get a generic title.
func signalTitle(s DetectedSignal) string {
	title := "Infrastructure signal detected"
	switch s.SignalType {
	case SignalSMARTFailure:
		title = "SMART health check failed"
	case SignalHighCPU:
		title = "High CPU usage detected"
	case SignalHighMemory:
		title = "High memory usage detected"
	case SignalHighDisk:
		title = "Storage usage is high"
	case SignalBackupFailed:
		title = "Backup failed"
	case SignalBackupStale:
		title = "Backup is stale"
	case SignalActiveAlert:
		title = "Active alert detected"
	case SignalGuestUnreachable:
		title = fmt.Sprintf("Guest unreachable: %s", s.ResourceName)
	}
	return title
}
// defaultRecommendationForSignal returns the canned remediation advice for a
// signal's type, used when a deterministic finding has no recommendation of
// its own. Unknown types get a generic instruction to investigate.
func defaultRecommendationForSignal(s DetectedSignal) string {
	advice := "Investigate the signal and take corrective action if needed."
	switch s.SignalType {
	case SignalSMARTFailure:
		advice = "Inspect the disk for errors and consider replacing it if SMART failures persist."
	case SignalHighCPU:
		advice = "Identify processes causing high CPU usage and optimize or scale resources."
	case SignalHighMemory:
		advice = "Identify memory-heavy processes and consider increasing memory or tuning workloads."
	case SignalHighDisk:
		advice = "Investigate disk usage growth and clean up or expand storage as needed."
	case SignalBackupFailed:
		advice = "Review backup logs and fix the underlying error, then rerun the backup."
	case SignalBackupStale:
		advice = "Ensure backups are scheduled and completing successfully; run a new backup."
	case SignalActiveAlert:
		advice = "Investigate the active alert and resolve the underlying issue."
	case SignalGuestUnreachable:
		advice = "Investigate why this guest is not responding to ping. Check network configuration, firewall rules, or whether the guest has crashed."
	}
	return advice
}
// getPatrolSystemPrompt returns the system prompt for AI patrol analysis.
// The agentic prompt instructs the LLM to use investigation tools and to
// report findings via the patrol_report_finding tool instead of text blocks.
//
// The prompt is the shared base text followed by exactly one mode section:
// "Auto-Fix Mode" when the AI config has PatrolAutoFix set, otherwise
// "Observe Only Mode". A missing AI service or a nil AI config is treated as
// auto-fix disabled, matching the nil guards used by calculateSeedBudget.
//
// getPatrolSystemPromptForTriage locates the "## Investigation Tools" heading
// inside this text, so that heading must stay in the base prompt.
func (p *PatrolService) getPatrolSystemPrompt() string {
	autoFix := false
	// Guard the service pointer as well as the config: other methods in this
	// file tolerate a PatrolService without an AI service attached.
	if p.aiService != nil {
		if cfg := p.aiService.GetAIConfig(); cfg != nil {
			autoFix = cfg.PatrolAutoFix
		}
	}
	basePrompt := `You are Pulse Patrol, an autonomous infrastructure analysis agent. Your job is to find issues that simple threshold-based alerts CANNOT catch — trends, capacity risks, misconfigurations, reliability gaps, and cross-resource correlations.
Pulse already has a real-time alerting system that fires when metrics cross thresholds (CPU, memory, disk, etc.) and when resources go down. Do NOT duplicate what alerts already handle. Your value is deeper analysis that requires looking at patterns over time and across resources.
## Investigation Tools
You have access to the following tools to investigate infrastructure:
**Infrastructure State:**
- pulse_query — Search resources, get details, list resources, check health overview
- pulse_metrics — Performance metrics, temperatures, network, disk I/O, baselines, patterns
- pulse_storage — Storage pools, config, backups, snapshots, Ceph, replication, PBS jobs, RAID, disk health
**Platform-Specific:**
- pulse_docker — Container status, updates, services, swarm
- pulse_kubernetes — Clusters, nodes, pods, deployments
- pulse_pmg — Proxmox Mail Gateway status, mail stats, queues
**Deep Investigation:**
- pulse_read — Read-only command execution, file reads, log tailing
- pulse_discovery — Infrastructure discovery details
- pulse_knowledge — User notes, incidents, event correlations
**Patrol Reporting:**
- patrol_report_finding — Report a finding (creates a structured finding with validation)
- patrol_resolve_finding — Resolve an existing finding that is no longer an issue
- patrol_get_findings — Check currently active findings (use before reporting to avoid duplicates)
## How Patrol Works
You are provided with the current state of the user's infrastructure below, including resource metrics, storage health, backup status, disk health, active alerts, baselines, and connection health. This gives you a complete point-in-time snapshot without needing to query for it.
The seed context includes service identity (from discovery) and reachability data when available. Guests marked UNREACHABLE are running according to Proxmox but did not respond to ICMP ping from their host node. This may indicate a network issue, guest crash, or firewall blocking ICMP. Use pulse_read to check guest logs or pulse_discovery for service details.
**Step 1 — Analyze the snapshot.** Scan the data for anything notable: high usage, backup gaps, disk health issues, resources above baseline, stopped resources that should be running, storage trending full, unreachable guests, etc.
**Step 2 — Investigate deeper.** For anything notable you spotted, use your tools to understand whether it's actually a problem:
- Use **pulse_metrics** with historical windows (1h, 6h, 24h) to check if a high metric is trending up or just a momentary spike. A resource at 60% and rising is more interesting than one sitting steady at 75%.
- Use **pulse_read** to check logs on resources that look unhealthy or abnormal.
- Use **pulse_storage** to check snapshot ages, replication status, or backup job details.
- Use **pulse_query** to check resource configuration for misconfigurations.
- Use **pulse_pmg** to check mail queues or spam volume if mail flow looks abnormal.
**Step 3 — Report or resolve findings.** Report findings for confirmed issues. Resolve active findings that are no longer issues based on current data.
Always call patrol_get_findings before reporting or resolving findings.
The snapshot eliminates routine data gathering, but you must still investigate to distinguish real problems from noise. Do not skip investigation — a snapshot alone cannot tell you whether a metric is stable or rapidly changing.
## Efficiency Rules
- Do NOT call the same tool with the same parameters twice in a single patrol run.
- Keep track of what you've already checked. If you've already retrieved metrics for a resource, use the data you have.
- Always call at least one investigation tool (pulse_query, pulse_metrics, pulse_storage, or pulse_read) in every patrol run, even if everything appears healthy.
## Severity & Thresholds
- **critical**: Data loss risk, unrecoverable misconfiguration, complete backup failure with no retention
- **warning**: Capacity will be exhausted within 7 days at current growth rate, backup gap >48h, replication broken, security misconfiguration
- **watch**: Capacity trending toward limits (14-30 days), minor config drift, optimization opportunity
- **info**: Almost never — only for significant findings that don't fit above
These are for Patrol-specific findings (trends, capacity, config issues). Simple metric thresholds (CPU >90%, memory >95%, etc.) are handled by the alerting system — do NOT report those.
## Noise to Avoid
- "CPU at 15% vs baseline 8%" — NORMAL variance, not an issue
- "Memory at 45% which is elevated" — FINE, lots of headroom
- "Disk at 30% is above baseline" — FINE, not actionable
- Stopped containers/VMs (unless autostart is enabled AND they crashed)
- Minor metric fluctuations compared to baseline
- Resources that are simply "busier than usual" but not near limits
- Simple threshold breaches (CPU/memory/disk above X%) — alerts handle these
- Resources that are down or stopped — alerts handle these
- Any condition that a metric-crosses-threshold alert would catch
## Before Reporting a Finding, Ask Yourself
1. Would an operator need to DO something about this?
2. Is this something the real-time alerting system would catch on its own? If yes — DO NOT report it.
3. Does this require analysis, trend detection, or correlation that a simple threshold can't provide?
If everything looks healthy, report no findings. Report findings for issues that require human planning or intervention — capacity risks, misconfigurations, reliability gaps, optimization opportunities, or emerging trends. Do NOT report simple threshold breaches (high CPU, high memory, high disk, resource down) — those are handled by the alerting system.
## Final Summary Format
After completing your investigation, write a concise summary using this structure:
### Infrastructure Status
One sentence overall health verdict (e.g., "All 3 nodes and 18 guests are operating normally." or "1 warning found across 3 nodes and 12 VMs.").
### Key Observations
- Bullet each noteworthy observation with the **resource name** bolded and the metric or finding inline
- Only include items worth mentioning — skip anything completely normal
- Group related items (e.g., all storage together, all compute together)
### Actions Taken
- List each finding you reported or resolved, with its severity badge: ` + "`" + `⚠ warning` + "`" + `, ` + "`" + `🔴 critical` + "`" + `, ` + "`" + `✅ resolved` + "`" + `
- If no findings were created or resolved, write "No findings reported — all clear."
Keep the summary factual, terse, and scannable. Do NOT repeat your investigation process or thinking. Do NOT use phrases like "Let me check..." or "I'll start by..." — only state results. Maximum 15 lines.`
	if autoFix {
		return basePrompt + `
## Auto-Fix Mode
Auto-fix is enabled. You may use pulse_control and pulse_read tools to attempt automatic remediation.
Safe operations you can perform autonomously:
- Restart services (systemctl restart)
- Clear caches and temp files
- Rotate/compress logs
- Trigger garbage collection
Always:
1. Run a verification command after any fix to confirm success
2. Report findings for issues you attempted to fix (include fix outcome in evidence)
3. Stop and report if the fix doesn't resolve the issue`
	}
	return basePrompt + `
## Observe Only Mode
You are in observation mode. Use read-only tools to gather diagnostic information but DO NOT modify anything. Report findings with clear recommendations for the user to review and action manually.`
}
// triageSystemPreamble is the opening of the triage-mode system prompt.
// getPatrolSystemPromptForTriage places it in front of the full patrol prompt
// (from the "## Investigation Tools" heading onward when that heading is
// present). It tells the model that deterministic triage has already flagged
// the items worth investigating and that healthy resources should not be
// re-scanned.
const triageSystemPreamble = `You are Pulse Patrol, an autonomous infrastructure analysis agent.
Deterministic triage has already scanned all resources against thresholds, baselines, backup schedules, disk health, and connectivity. The flagged items are listed in your seed context under "Deterministic Triage Results".
Your job is to investigate each flagged item deeper using tools:
- Use pulse_metrics with historical windows to check if an elevated metric is trending up or stable
- Use pulse_read to check logs on flagged resources
- Use pulse_storage to verify backup/replication/RAID details
- Use pulse_query to check resource configuration
After investigation, report confirmed issues via patrol_report_finding and resolve any active findings that are no longer problems.
Do NOT re-scan healthy resources. Triage already verified they are within normal parameters. Focus your turns exclusively on the flagged items.`
// getPatrolSystemPromptForTriage builds the system prompt used when
// deterministic triage has already run. It replaces everything in the regular
// patrol prompt that precedes the "## Investigation Tools" heading with
// triageSystemPreamble. If the heading cannot be found, the preamble is
// prepended to the complete regular prompt instead.
func (p *PatrolService) getPatrolSystemPromptForTriage() string {
	const sectionHeading = "## Investigation Tools"

	remainder := p.getPatrolSystemPrompt()
	if at := strings.Index(remainder, sectionHeading); at >= 0 {
		remainder = remainder[at:]
	}
	return triageSystemPreamble + "\n\n" + remainder
}
// seedIntelligence holds pre-computed intelligence data used by multiple seed context sections.
// It is filled in by seedPrecomputeIntelligenceState.
type seedIntelligence struct {
// anomalies are the baseline anomaly reports gathered for scoped nodes, guests, and storage.
anomalies []baseline.AnomalyReport
// forecasts are capacity trends projected to fill within 30 days.
forecasts []seedForecast
// predictions are the pattern-detector failure predictions that fall inside the scope.
predictions []FailurePrediction
// recentChanges are in-scope changes from the last 24 hours.
recentChanges []memory.Change
// correlations holds at most 10 correlations that touch an in-scope resource.
correlations []*Correlation
// isQuiet is true when there are no anomalies, forecasts, predictions,
// recent changes, or active alerts.
isQuiet bool
// hasBaselineStore records whether a baseline store was available for anomaly checks.
hasBaselineStore bool
}
// seedSection is one candidate chunk of seed context handed to
// assembleSeedWithinBudget, which orders sections by priority and decides
// whether each is emitted in full, summarized, or dropped.
type seedSection struct {
// priority orders sections (lower first). Priorities <= 1 are always
// emitted; priority 2 may fall back to summary; higher priorities are
// dropped when they do not fit the budget.
priority int
// name identifies the section in the assembly debug log.
name string
// content is the full section text; sections whose content is blank are skipped.
content string
// summary is an optional shorter replacement, consulted only for priority-2 sections.
summary string
}
// seedForecast represents a capacity forecast for seed context.
// Instances are created by seedPrecomputeIntelligenceState from
// baseline.CalculateTrend results.
type seedForecast struct {
// name and resourceID identify the resource; metric is the forecast metric
// ("memory", "disk", or "usage"); severity is copied from the trend result.
name, resourceID, metric, severity string
// daysToFull is the trend's projected days until the metric is full.
daysToFull int
// dailyChange is the trend's per-day change; current is the latest metric
// value that was passed to the trend calculation.
dailyChange, current float64
}
// buildTriageSeedContextState renders the seed context for a triage-driven
// patrol run. It builds the triage seed sections, fits them into the token
// budget from calculateSeedBudget, and returns the assembled text together
// with the IDs of the findings that were presented in the seed.
func (p *PatrolService) buildTriageSeedContextState(
	triage *TriageResult,
	snap patrolRuntimeState,
	scope *PatrolScope,
	guestIntel map[string]*GuestIntelligence,
) (string, []string) {
	seedSections, seededIDs := p.buildTriageSeedSectionsState(triage, snap, scope, guestIntel)
	tokenBudget := p.calculateSeedBudget()
	seedText := p.assembleSeedWithinBudget(seedSections, tokenBudget)
	return seedText, seededIDs
}
// buildTriageSeedSectionsState produces the prioritized seed sections for a
// triage-driven patrol run, plus the IDs of the findings included in the seed.
//
// A nil triage result is treated as an empty one, and a nil flagged-ID set as
// an empty set. The flagged set is what scopes the health/alerts, scope, and
// inventory sections.
func (p *PatrolService) buildTriageSeedSectionsState(
	triage *TriageResult,
	snap patrolRuntimeState,
	scope *PatrolScope,
	guestIntel map[string]*GuestIntelligence,
) ([]seedSection, []string) {
	// Copy the config under the read lock so the rest runs lock-free.
	p.mu.RLock()
	cfg := p.config
	p.mu.RUnlock()

	if triage == nil {
		triage = &TriageResult{}
	}
	flaggedSet := triage.FlaggedIDs
	if flaggedSet == nil {
		flaggedSet = map[string]bool{}
	}

	findingsCtx, seededFindingIDs := p.seedFindingsAndContextState(scope, snap)
	now := time.Now()

	sections := make([]seedSection, 0, 7)
	add := func(priority int, name, content, summary string) {
		sections = append(sections, seedSection{
			priority: priority,
			name:     name,
			content:  content,
			summary:  summary,
		})
	}

	// P0 — always include.
	add(0, "triage_overview", formatTriageOverviewSection(triage), "")
	add(0, "findings", findingsCtx, "")
	add(0, "health_alerts", p.seedHealthAndAlertsState(snap, flaggedSet, cfg, now), "")
	add(0, "scope", buildScopeSection(scope, sortedScopedIDs(flaggedSet)), "")

	// P2 — triage already preserves the flagged set, so these sections can
	// summarize under tighter provider-derived retry budgets.
	add(2, "triage_flags", formatTriageFlagsSection(triage), formatTriageFlagsSummary(triage))
	add(2, "flagged_inventory",
		p.seedResourceInventoryState(snap, flaggedSet, cfg, now, false, guestIntel),
		p.seedResourceInventorySummaryState(snap, flaggedSet, cfg, now, guestIntel))

	// P3 — healthy rollup is useful context but lowest-value on retries.
	add(3, "triage_healthy", formatTriageHealthySummarySection(triage), "")

	return sections, seededFindingIDs
}
// buildSeedContextState produces the infrastructure state context for the
// agentic patrol loop. It gathers the seed sections for the given runtime
// state and scope, fits them into the token budget from calculateSeedBudget,
// and returns the assembled text along with the IDs of the findings that were
// presented in the seed. Tools remain available to the model for targeted
// deep-dives beyond this pre-assembled context.
func (p *PatrolService) buildSeedContextState(snap patrolRuntimeState, scope *PatrolScope, guestIntel map[string]*GuestIntelligence) (string, []string) {
	seedSections, seededIDs := p.buildSeedSectionsState(snap, scope, guestIntel)
	tokenBudget := p.calculateSeedBudget()
	seedText := p.assembleSeedWithinBudget(seedSections, tokenBudget)
	return seedText, seededIDs
}
// buildSeedSectionsState produces the prioritized seed sections for a regular
// (non-triage) patrol run, plus the IDs of the findings included in the seed.
// The effective scope set is derived from the scope and runtime state and is
// shared by every scoped section.
func (p *PatrolService) buildSeedSectionsState(snap patrolRuntimeState, scope *PatrolScope, guestIntel map[string]*GuestIntelligence) ([]seedSection, []string) {
	// Copy the config under the read lock so the rest runs lock-free.
	p.mu.RLock()
	cfg := p.config
	p.mu.RUnlock()

	now := time.Now()
	scopedSet := p.buildScopedSetForRuntime(scope, snap)
	intel := p.seedPrecomputeIntelligenceState(snap, scopedSet, now)
	findingsCtx, seededFindingIDs := p.seedFindingsAndContextState(scope, snap)
	effectiveScopeIDs := sortedScopedIDs(scopedSet)

	sections := make([]seedSection, 0, 8)
	add := func(priority int, name, content, summary string) {
		sections = append(sections, seedSection{
			priority: priority,
			name:     name,
			content:  content,
			summary:  summary,
		})
	}

	// P0 — always include.
	add(0, "findings", findingsCtx, "")
	add(0, "health_alerts", p.seedHealthAndAlertsState(snap, scopedSet, cfg, now), "")
	add(0, "scope", buildScopeSection(scope, effectiveScopeIDs), "")

	// P1 — always include (typically compact).
	add(1, "previous_run", p.seedPreviousRun(now), "")

	// P2 — summarize when needed.
	add(2, "resource_inventory",
		p.seedResourceInventoryState(snap, scopedSet, cfg, now, false, guestIntel),
		p.seedResourceInventorySummaryState(snap, scopedSet, cfg, now, guestIntel))

	// P3 — droppable if budget is tight.
	add(3, "intelligence", p.seedIntelligenceContext(intel, now), "")
	add(3, "backup_analysis", p.seedBackupAnalysisState(snap, scopedSet, now), "")

	// P4 — least critical, dropped first.
	add(4, "pmg_snapshot", p.seedPMGSnapshotStringState(snap, scopedSet, cfg, intel.isQuiet), "")

	return sections, seededFindingIDs
}
// calculateSeedBudget returns the token budget for the seed context of the
// configured patrol model. It looks up the model's context window via
// providers.ContextWindowTokens, falls back to 128,000 tokens when the lookup
// yields a non-positive value (including when no AI service or config is
// available), and converts the window into a seed budget. The inputs and the
// result are emitted at debug level.
func (p *PatrolService) calculateSeedBudget() int {
	const fallbackWindowTokens = 128_000

	var patrolModel string
	if p.aiService != nil {
		if aiCfg := p.aiService.GetAIConfig(); aiCfg != nil {
			patrolModel = strings.TrimSpace(aiCfg.GetPatrolModel())
		}
	}

	windowTokens := providers.ContextWindowTokens(patrolModel)
	if windowTokens <= 0 {
		windowTokens = fallbackWindowTokens
	}
	seedBudget := calculateSeedBudgetForContextWindow(windowTokens)

	log.Debug().
		Str("model", patrolModel).
		Int("context_window_tokens", windowTokens).
		Int("seed_budget_tokens", seedBudget).
		Msg("AI Patrol: Calculated seed context token budget")
	return seedBudget
}
// calculateSeedBudgetForContextWindow converts a model context window (in
// tokens) into the token budget available for seed context.
//
// The budget is the window minus fixed reserves for the system prompt, tool
// definitions, model output, and conversation history. It never drops below a
// floor of 16,000 tokens, or half the window when that is smaller, so that
// small-context models are not pushed beyond practical capacity. A
// non-positive window is treated as 128,000 tokens.
func calculateSeedBudgetForContextWindow(contextWindow int) int {
	const (
		fallbackWindow      = 128_000
		systemPromptReserve = 4_000
		toolReserve         = 8_000
		outputReserve       = 8_000
		historyReserve      = 16_000
		preferredFloor      = 16_000
	)

	window := contextWindow
	if window <= 0 {
		window = fallbackWindow
	}

	// The floor shrinks to half the window for small-context models.
	lowerBound := preferredFloor
	if window/2 < lowerBound {
		lowerBound = window / 2
	}

	remaining := window - systemPromptReserve - toolReserve - outputReserve - historyReserve
	if remaining >= lowerBound {
		return remaining
	}
	return lowerBound
}
// isPatrolContextWindowError reports whether err looks like a provider
// rejection caused by the request exceeding the model's context window. The
// check is a case-insensitive substring match against a set of known error
// fragments; a nil error is never a context-window error.
func isPatrolContextWindowError(err error) bool {
	if err == nil {
		return false
	}
	lowered := strings.ToLower(err.Error())
	for _, fragment := range []string{
		"exceed_context_size_error",
		"exceeds the available context size",
		"maximum context length",
		"context window",
		"context length",
		"n_ctx",
	} {
		if strings.Contains(lowered, fragment) {
			return true
		}
	}
	return false
}
func patrolSeedRetryBudgets(err error) []int {
if err == nil {
return []int{patrolRetrySeedBudget1, patrolRetrySeedBudget2, patrolRetrySeedBudget3}
}
contextWindow := extractPatrolContextWindow(err)
if contextWindow <= 0 {
return []int{patrolRetrySeedBudget1, patrolRetrySeedBudget2, patrolRetrySeedBudget3}
}
safeBudget := calculateSeedBudgetForContextWindow(contextWindow)
return uniquePositiveInts(
safeBudget,
maxInt(1_000, safeBudget/2),
maxInt(1_000, safeBudget/4),
)
}
// extractPatrolContextWindow pulls a context window size out of a provider
// error message. Each pattern in patrolContextWindowPatterns is tried in
// order; the first one whose first capture group parses as a positive integer
// wins. It returns 0 when err is nil or no pattern yields a usable number.
func extractPatrolContextWindow(err error) int {
	if err == nil {
		return 0
	}
	text := err.Error()
	for _, re := range patrolContextWindowPatterns {
		groups := re.FindStringSubmatch(text)
		if len(groups) >= 2 {
			if size, parseErr := strconv.Atoi(groups[1]); parseErr == nil && size > 0 {
				return size
			}
		}
	}
	return 0
}
// uniquePositiveInts returns the strictly positive values in first-seen order
// with later duplicates removed. The result is always a non-nil slice.
func uniquePositiveInts(values ...int) []int {
	kept := make([]int, 0, len(values))
	alreadyKept := make(map[int]struct{}, len(values))
	for _, v := range values {
		if v > 0 {
			if _, dup := alreadyKept[v]; !dup {
				alreadyKept[v] = struct{}{}
				kept = append(kept, v)
			}
		}
	}
	return kept
}
// maxInt returns the larger of a and b.
func maxInt(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
// minInt returns the smaller of a and b.
func minInt(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
// assembleSeedWithinBudget concatenates seed sections in ascending priority
// order (stable for equal priorities) while tracking an estimated token count
// against budgetTokens.
//
// Sections with blank content are skipped. Priority 0 and 1 sections are
// always emitted, even if that exceeds the budget. Any other section is
// emitted in full when it fits. A priority-2 section that does not fit falls
// back to its summary when one is present and fits; everything else that does
// not fit is dropped. The input slice is not reordered. The outcome is logged
// at debug level.
func (p *PatrolService) assembleSeedWithinBudget(sections []seedSection, budgetTokens int) string {
	if len(sections) == 0 {
		return ""
	}

	// Sort a copy so the caller's slice keeps its order.
	byPriority := make([]seedSection, len(sections))
	copy(byPriority, sections)
	sort.SliceStable(byPriority, func(a, b int) bool {
		return byPriority[a].priority < byPriority[b].priority
	})

	var (
		out        strings.Builder
		spent      int
		included   = make([]string, 0, len(byPriority))
		summarized = make([]string, 0, len(byPriority))
		dropped    = make([]string, 0, len(byPriority))
	)
	emit := func(name, text string, cost int) {
		out.WriteString(text)
		spent += cost
		included = append(included, name)
	}
	fits := func(cost int) bool {
		return spent+cost <= budgetTokens
	}

	for _, sec := range byPriority {
		if strings.TrimSpace(sec.content) == "" {
			continue
		}

		fullCost := chat.EstimateTokens(sec.content)
		if sec.priority <= 1 || fits(fullCost) {
			emit(sec.name, sec.content, fullCost)
			continue
		}

		// Over budget: only priority-2 sections get a summary fallback.
		if sec.priority == 2 && strings.TrimSpace(sec.summary) != "" {
			if summaryCost := chat.EstimateTokens(sec.summary); fits(summaryCost) {
				emit(sec.name, sec.summary, summaryCost)
				summarized = append(summarized, sec.name)
				continue
			}
		}
		dropped = append(dropped, sec.name)
	}

	log.Debug().
		Int("budget_tokens", budgetTokens).
		Int("used_tokens", spent).
		Bool("over_budget", spent > budgetTokens).
		Strs("included", included).
		Strs("summarized", summarized).
		Strs("dropped", dropped).
		Msg("AI Patrol: Assembled seed context within budget")
	return out.String()
}
// buildScopeSection renders the "# Patrol Scope" seed section describing why
// and where this patrol run is focused. It returns "" for a nil scope.
// Optional lines (trigger, context, requested resources and types, effective
// scope, alert identifier, finding ID) appear only when the corresponding
// value is set; the depth line and its guidance sentence are always written.
func buildScopeSection(scope *PatrolScope, effectiveScopeIDs []string) string {
	if scope == nil {
		return ""
	}

	var out strings.Builder
	out.WriteString("# Patrol Scope\n")
	if scope.Reason != "" {
		fmt.Fprintf(&out, "Trigger: %s\n", scope.Reason)
	}
	if scope.Context != "" {
		fmt.Fprintf(&out, "Context: %s\n", scope.Context)
	}
	if len(scope.ResourceIDs) > 0 {
		fmt.Fprintf(&out, "Requested resources: %s\n", strings.Join(scope.ResourceIDs, ", "))
	}
	if len(scope.ResourceTypes) > 0 {
		fmt.Fprintf(&out, "Requested resource types: %s\n", strings.Join(scope.ResourceTypes, ", "))
	}
	if count := len(effectiveScopeIDs); count > 0 {
		fmt.Fprintf(&out, "Effective scope: %d %s (%s)\n",
			count,
			seedCountLabel(count, "resource", "resources"),
			seedTruncateOutlierList(effectiveScopeIDs, 8))
	}
	if scope.AlertIdentifier != "" {
		fmt.Fprintf(&out, "Alert Identifier: %s\n", scope.AlertIdentifier)
	}
	if scope.FindingID != "" {
		fmt.Fprintf(&out, "Finding ID: %s\n", scope.FindingID)
	}
	fmt.Fprintf(&out, "Depth: %s\n", scope.Depth.String())

	guidance := "\nPerform thorough investigation including trends, baselines, logs, and correlations.\n"
	if scope.Depth == PatrolDepthQuick {
		guidance = "\nThis is a quick check — focus on the scoped resources, limit investigation depth.\n"
	}
	out.WriteString(guidance)
	out.WriteString("\n")
	return out.String()
}
// sortedScopedIDs returns the keys of scopedSet in ascending order, omitting
// keys that are empty or whitespace-only. Keys are included regardless of
// their boolean value. An empty or nil set yields nil.
func sortedScopedIDs(scopedSet map[string]bool) []string {
	if len(scopedSet) == 0 {
		return nil
	}
	keys := make([]string, 0, len(scopedSet))
	for key := range scopedSet {
		if strings.TrimSpace(key) != "" {
			keys = append(keys, key)
		}
	}
	sort.Strings(keys)
	return keys
}
// buildScopedSet constructs the set of resource IDs in scope from the scope's
// explicit resource IDs, expanded with correlated resources. It returns nil
// when the scope is nil or lists no resource IDs. It is preserved for direct
// ID-scoped callers/tests.
func (p *PatrolService) buildScopedSet(scope *PatrolScope) map[string]bool {
	if scope != nil && len(scope.ResourceIDs) > 0 {
		return p.buildScopedSetWithCorrelations(scope.ResourceIDs)
	}
	return nil
}
// buildScopedSetWithCorrelations returns a set containing every given
// resource ID plus, when a correlation detector is configured, the source and
// target IDs of each correlation the detector reports for those resources.
// It returns nil for an empty input.
func (p *PatrolService) buildScopedSetWithCorrelations(resourceIDs []string) map[string]bool {
	if len(resourceIDs) == 0 {
		return nil
	}

	// Read the detector under the lock, then use it lock-free.
	p.mu.RLock()
	detector := p.correlationDetector
	p.mu.RUnlock()

	inScope := make(map[string]bool, len(resourceIDs))
	for _, resourceID := range resourceIDs {
		inScope[resourceID] = true
	}
	if detector == nil {
		return inScope
	}
	for _, resourceID := range resourceIDs {
		for _, link := range detector.GetCorrelationsForResource(resourceID) {
			inScope[link.SourceID] = true
			inScope[link.TargetID] = true
		}
	}
	return inScope
}
// buildScopedSetForRuntime constructs the effective scope set for a patrol
// runtime state.
//
// A nil scope yields nil. A scope with explicit resource IDs keeps the
// correlation-expansion semantics of buildScopedSet. Otherwise, if the scope
// carries resource types, an alert identifier, a finding ID, or a non-blank
// context, the set is derived from the resource IDs in the already-filtered
// runtime state (again expanded with correlations). A scope with none of
// those selectors, or a runtime state with no resources, yields nil.
func (p *PatrolService) buildScopedSetForRuntime(scope *PatrolScope, snap patrolRuntimeState) map[string]bool {
	switch {
	case scope == nil:
		return nil
	case len(scope.ResourceIDs) > 0:
		return p.buildScopedSet(scope)
	}

	hasSelector := len(scope.ResourceTypes) > 0 ||
		scope.AlertIdentifier != "" ||
		scope.FindingID != "" ||
		strings.TrimSpace(scope.Context) != ""
	if !hasSelector {
		return nil
	}

	runtimeIDs := patrolRuntimeResourceIDs(snap)
	if len(runtimeIDs) == 0 {
		return nil
	}
	return p.buildScopedSetWithCorrelations(runtimeIDs)
}
// seedPreviousRun returns the "# Previous Patrol Run" seed section summarizing
// the most recent run in the run history store: when it ran, how long it took,
// its status, finding counts, optional findings summary, and trigger. A run
// without a trigger reason is labelled "scheduled". It returns "" when there
// is no history store or no recorded run.
func (p *PatrolService) seedPreviousRun(now time.Time) string {
	if p.runHistoryStore == nil {
		return ""
	}
	runs := p.runHistoryStore.GetRecent(1)
	if len(runs) == 0 {
		return ""
	}
	prev := runs[0]

	var out strings.Builder
	out.WriteString("# Previous Patrol Run\n")
	fmt.Fprintf(&out, "- Ran: %s (duration: %s)\n", seedFormatTimeAgo(now, prev.StartedAt), seedFormatDuration(prev.Duration))
	fmt.Fprintf(&out, "- Status: %s\n", prev.Status)
	fmt.Fprintf(&out, "- Findings: %d new, %d existing, %d resolved, %d rejected\n",
		prev.NewFindings, prev.ExistingFindings, prev.ResolvedFindings, prev.RejectedFindings)
	if prev.FindingsSummary != "" {
		fmt.Fprintf(&out, "- Summary: %s\n", prev.FindingsSummary)
	}

	triggerLabel := prev.TriggerReason
	if triggerLabel == "" {
		triggerLabel = "scheduled"
	}
	fmt.Fprintf(&out, "- Trigger: %s\n", triggerLabel)
	out.WriteString("\n")
	return out.String()
}
// seedPrecomputeIntelligenceState pre-computes the anomalies, capacity
// forecasts, failure predictions, recent changes, and correlations that are
// used by multiple seed context sections, and derives the isQuiet flag.
//
// Each part is optional and depends on what is configured:
//   - anomalies need a baseline store; stopped guests and templates are skipped;
//   - forecasts need metrics history, at least 5 points in a 48h window, and
//     a trend projected to fill within 30 days;
//   - predictions come from the pattern detector, filtered to the scope;
//   - recent changes (last 24h, up to 20) come from the canonical resource
//     timeline when it yields results, otherwise from the change detector;
//   - correlations (at most 10) must touch at least one in-scope resource.
//
// isQuiet is true only when there are no anomalies, forecasts, predictions,
// recent changes, or active alerts.
func (p *PatrolService) seedPrecomputeIntelligenceState(snap patrolRuntimeState, scopedSet map[string]bool, now time.Time) seedIntelligence {
	const (
		forecastLookback  = 48 * time.Hour
		changeLookback    = 24 * time.Hour
		minForecastPoints = 5
		maxForecastDays   = 30
		maxRecentChanges  = 20
		maxCorrelations   = 10
	)

	// Snapshot the collaborators under the read lock; they are used lock-free below.
	p.mu.RLock()
	bs := p.baselineStore
	mh := p.metricsHistory
	pd := p.patternDetector
	cd := p.changeDetector
	p.mu.RUnlock()

	var intel seedIntelligence
	intel.hasBaselineStore = bs != nil

	nodeSources := patrolPrecomputeNodeSources(snap, scopedSet)
	guestSources := patrolPrecomputeGuestSources(snap, scopedSet)
	storageSources := patrolPrecomputeStorageSources(snap, scopedSet)

	// Anomalies
	if bs != nil {
		// collectAnomalies checks one resource and back-fills the resource
		// name on reports that came back without one.
		collectAnomalies := func(resourceID, resourceName string, metrics map[string]float64) {
			anomalies := bs.CheckResourceAnomaliesReadOnly(resourceID, metrics)
			for i := range anomalies {
				if anomalies[i].ResourceName == "" {
					anomalies[i].ResourceName = resourceName
				}
			}
			intel.anomalies = append(intel.anomalies, anomalies...)
		}

		for _, n := range nodeSources {
			collectAnomalies(n.id, n.name, map[string]float64{"cpu": n.cpuFraction, "memory": n.memPercent})
		}
		for _, g := range guestSources {
			if g.template || g.status != "running" {
				continue
			}
			metrics := map[string]float64{"memory": g.memPercent, "disk": g.diskPercent}
			// Guest CPU is only checked when a non-zero reading is present.
			if g.cpuFraction > 0 {
				metrics["cpu"] = g.cpuFraction
			}
			collectAnomalies(g.id, g.name, metrics)
		}
		for _, s := range storageSources {
			collectAnomalies(s.id, s.name, map[string]float64{"usage": s.usagePercent})
		}
	}

	// Capacity forecasts
	if mh != nil {
		// addForecast records a forecast when there are enough samples and the
		// trend projects the metric to fill within the forecast horizon.
		addForecast := func(resourceID, resourceName, metricName string, points []MetricPoint, currentValue float64) {
			if len(points) < minForecastPoints {
				return
			}
			samples := make([]float64, len(points))
			for i, pt := range points {
				samples[i] = pt.Value
			}
			trend := baseline.CalculateTrend(samples, currentValue)
			if trend == nil || trend.DaysToFull <= 0 || trend.DaysToFull > maxForecastDays {
				return
			}
			intel.forecasts = append(intel.forecasts, seedForecast{
				name:        resourceName,
				resourceID:  resourceID,
				metric:      metricName,
				severity:    trend.Severity,
				daysToFull:  trend.DaysToFull,
				dailyChange: trend.DailyChange,
				current:     currentValue,
			})
		}

		for _, n := range nodeSources {
			addForecast(n.id, n.name, "memory", mh.GetNodeMetrics(n.id, "memory", forecastLookback), n.memPercent)
		}
		for _, g := range guestSources {
			if g.template || g.status != "running" {
				continue
			}
			addForecast(g.id, g.name, "memory", mh.GetGuestMetrics(g.id, "memory", forecastLookback), g.memPercent)
			addForecast(g.id, g.name, "disk", mh.GetGuestMetrics(g.id, "disk", forecastLookback), g.diskPercent)
		}
		for _, s := range storageSources {
			// A missing "usage" series yields an empty slice, which addForecast ignores.
			addForecast(s.id, s.name, "usage", mh.GetAllStorageMetrics(s.id, forecastLookback)["usage"], s.usagePercent)
		}
	}

	// Failure predictions
	if pd != nil {
		for _, pred := range pd.GetPredictions() {
			if seedIsInScope(scopedSet, pred.ResourceID) {
				intel.predictions = append(intel.predictions, pred)
			}
		}
	}

	// Recent changes: prefer the canonical timeline, fall back to the change detector.
	changesSince := now.Add(-changeLookback)
	if canonicalChanges, ok := p.loadCanonicalRecentChanges(scopedSet, changesSince, maxRecentChanges); ok {
		intel.recentChanges = append(intel.recentChanges, canonicalChanges...)
	} else if cd != nil {
		for _, c := range cd.GetRecentChanges(maxRecentChanges, changesSince) {
			if seedIsInScope(scopedSet, c.ResourceID) {
				intel.recentChanges = append(intel.recentChanges, c)
			}
		}
	}

	// Correlations
	if intelFacade := p.GetIntelligence(); intelFacade != nil && intelFacade.HasCorrelationsSource() {
		for _, c := range intelFacade.GetCorrelations("") {
			if !seedIsInScope(scopedSet, c.SourceID) && !seedIsInScope(scopedSet, c.TargetID) {
				continue
			}
			intel.correlations = append(intel.correlations, c)
			if len(intel.correlations) >= maxCorrelations {
				break
			}
		}
	}

	// Determine if infrastructure is quiet. Every stored forecast is already
	// within the forecast horizon, so any forecast at all counts as a warning.
	intel.isQuiet = len(intel.anomalies) == 0 &&
		len(intel.forecasts) == 0 &&
		len(intel.predictions) == 0 &&
		len(intel.recentChanges) == 0 &&
		len(snap.ActiveAlerts) == 0
	return intel
}
// loadCanonicalRecentChanges reads recent resource changes from the AI
// service's resource export store and returns those inside scopedSet,
// converted to memory.Change, capped at limit entries.
//
// The boolean is false — telling the caller to use another source — when the
// receiver or AI service is nil, no store is configured, the store is bound
// to a different organization than the service, the store query fails (logged
// as a warning), or no in-scope change remains after filtering.
func (p *PatrolService) loadCanonicalRecentChanges(scopedSet map[string]bool, since time.Time, limit int) ([]memory.Change, bool) {
	if p == nil || p.aiService == nil {
		return nil, false
	}

	// Read the store and organization IDs under the service's read lock.
	svc := p.aiService
	svc.mu.RLock()
	exportStore := svc.resourceExportStore
	activeOrg := strings.TrimSpace(svc.orgID)
	boundOrg := strings.TrimSpace(svc.resourceExportStoreOrgID)
	svc.mu.RUnlock()

	switch {
	case exportStore == nil:
		return nil, false
	case boundOrg != "" && boundOrg != activeOrg:
		// The store belongs to another organization; do not read from it.
		return nil, false
	}

	raw, err := exportStore.GetRecentChanges("", since, limit)
	if err != nil {
		log.Warn().
			Err(err).
			Msg("failed to load canonical patrol resource timeline")
		return nil, false
	}
	if len(raw) == 0 {
		return nil, false
	}

	scoped := make([]memory.Change, 0, len(raw))
	for _, entry := range raw {
		if seedIsInScope(scopedSet, entry.ResourceID) {
			scoped = append(scoped, memory.ChangeFromUnifiedResourceChange(entry))
			if len(scoped) >= limit {
				break
			}
		}
	}
	if len(scoped) == 0 {
		return nil, false
	}
	return scoped, true
}
// patrolNodeInventoryRow is the flattened per-node record produced by
// patrolNodeInventoryRows.
// NOTE(review): this declaration previously carried a comment describing
// seedResourceInventory ("builds the node, guest, docker, storage, ceph, and
// PBS sections"); that function is not declared next to this type — confirm
// where that description belongs.
type patrolNodeInventoryRow struct {
id, name, status string
// cpu is a percentage (the legacy snapshot path multiplies the node's CPU
// value by 100); mem and disk are taken from the node's usage figures.
cpu, mem, disk float64
load []float64
uptimeSeconds int64
pendingUpdates int
}
// patrolGuestInventoryRow is the flattened per-guest (VM or container) record
// produced by patrolGuestInventoryRows.
type patrolGuestInventoryRow struct {
id string
// gType is "VM" or "Container".
name, gType, node, status string
cpu, mem, disk float64
vmid int
// ip is the result of patrolFirstIP over the guest's IP addresses.
ip string
lastBackup time.Time
// service and reachable are pre-formatted strings derived from the guest's
// GuestIntelligence entry (formatService / formatReachable).
service string
reachable string
}
// patrolPBSDatastoreRow describes one PBS datastore: the owning instance, the
// datastore name, and its usage figures.
// NOTE(review): units of usage/used/total are not established here — presumably
// percent and bytes; confirm against the code that fills this row.
type patrolPBSDatastoreRow struct {
instance, name string
usage float64
used, total int64
}
// patrolDockerHostRow summarizes the containers on one Docker host: total,
// running, and stopped counts plus a list of unhealthy containers.
// NOTE(review): unhealthyContainers presumably holds container names — confirm
// against the code that fills this row.
type patrolDockerHostRow struct {
host string
containerCount int
runningCount int
stoppedCount int
unhealthyContainers []string
}
// patrolAppContainerRow is the per-container record (identity, status, and
// CPU/memory readings) for an application container.
// NOTE(review): units of cpu/memory are not established here — confirm against
// the code that fills this row.
type patrolAppContainerRow struct {
id, name, status string
cpu, memory float64
}
// patrolStoragePoolRow is the per-storage-pool record: identity, type, node,
// status, capacity figures, and ZFS error counters.
type patrolStoragePoolRow struct {
id, name, stype, node, status string
used, total int64
// hasBytes presumably marks whether used/total are populated — TODO confirm.
hasBytes bool
usage float64
// zfsRead, zfsWrite, and zfsCksum are ZFS error counters; hasZFSErrors
// presumably flags that at least one is non-zero — TODO confirm.
zfsRead, zfsWrite, zfsCksum int64
hasZFSErrors bool
}
// patrolPhysicalDiskRow is the per-physical-disk record: identity, location,
// size, device details, and health indicators.
// NOTE(review): units and sentinel values of wearout/temperature are not
// established here — confirm against the code that fills this row.
type patrolPhysicalDiskRow struct {
id, name, node, diskType string
sizeBytes int64
devPath, model string
health, status string
wearout, temperature int
}
// patrolPrecomputeNodeSource is the per-node input consumed by
// seedPrecomputeIntelligenceState for anomaly checks and memory forecasts.
type patrolPrecomputeNodeSource struct {
id, name string
// cpuFraction is submitted as the "cpu" metric and memPercent as the
// "memory" metric of the anomaly check.
cpuFraction float64
memPercent float64
}
// patrolPrecomputeGuestSource is the per-guest input consumed by
// seedPrecomputeIntelligenceState for anomaly checks and memory/disk forecasts.
type patrolPrecomputeGuestSource struct {
id, name string
// Guests that are templates or whose status is not "running" are skipped
// by the precompute step.
template bool
status string
// cpuFraction is only submitted to the anomaly check when it is > 0.
cpuFraction float64
memPercent float64
diskPercent float64
}
// patrolPrecomputeStorageSource is the per-storage input consumed by
// seedPrecomputeIntelligenceState; usagePercent is submitted as the "usage"
// metric for both anomaly checks and capacity forecasts.
type patrolPrecomputeStorageSource struct {
id, name string
usagePercent float64
}
// patrolConnectionHealthEntry pairs a resource ID with a healthy flag.
// NOTE(review): what "healthy" measures is decided by the code that creates
// these entries — confirm there.
type patrolConnectionHealthEntry struct {
resourceID string
healthy bool
}
// patrolNodeInventoryRows flattens the nodes of a runtime state into
// inventory rows, keeping only nodes that seedIsInScope accepts for scopedSet.
//
// When the runtime state carries a read state, rows are built from its node
// views. Otherwise they are built from the snapshot's Nodes slice, where the
// CPU value is multiplied by 100 to express it as a percentage. The result is
// always a non-nil slice.
func patrolNodeInventoryRows(snap patrolRuntimeState, scopedSet map[string]bool) []patrolNodeInventoryRow {
	if rs := snap.readState; rs != nil {
		// Fetch the node views once so sizing and iteration see the same list.
		nodeViews := rs.Nodes()
		rows := make([]patrolNodeInventoryRow, 0, len(nodeViews))
		for _, nv := range nodeViews {
			nodeID := nv.ID()
			if !seedIsInScope(scopedSet, nodeID) {
				continue
			}
			rows = append(rows, patrolNodeInventoryRow{
				id:             nodeID,
				name:           nv.Name(),
				status:         string(nv.Status()),
				cpu:            nv.CPUPercent(),
				mem:            nv.MemoryPercent(),
				disk:           nv.DiskPercent(),
				load:           nv.LoadAverage(),
				uptimeSeconds:  nv.Uptime(),
				pendingUpdates: nv.PendingUpdates(),
			})
		}
		return rows
	}

	rows := make([]patrolNodeInventoryRow, 0, len(snap.Nodes))
	for _, n := range snap.Nodes {
		if !seedIsInScope(scopedSet, n.ID) {
			continue
		}
		rows = append(rows, patrolNodeInventoryRow{
			id:             n.ID,
			name:           n.Name,
			status:         n.Status,
			cpu:            n.CPU * 100, // snapshot CPU is a fraction; rows carry percent
			mem:            n.Memory.Usage,
			disk:           n.Disk.Usage,
			load:           n.LoadAverage,
			uptimeSeconds:  n.Uptime,
			pendingUpdates: n.PendingUpdates,
		})
	}
	return rows
}
// patrolGuestInventoryRows returns one row per non-template VM and container
// that passes seedIsInScope for scopedSet, VMs first and containers after.
//
// When snap.readState is set, rows are built from its VM and container views.
// Otherwise the function falls back to snap.VMs and snap.Containers,
// converting the CPU value to a percentage by multiplying it by 100. The
// service and reachable columns are derived from guestIntel, looked up by
// guest ID; a nil guestIntel map is accepted.
func patrolGuestInventoryRows(snap patrolRuntimeState, scopedSet map[string]bool, guestIntel map[string]*GuestIntelligence) []patrolGuestInventoryRow {
	// withIntel fills the two columns that come from guest intelligence.
	withIntel := func(row patrolGuestInventoryRow) patrolGuestInventoryRow {
		intel := guestIntel[row.id]
		row.service = formatService(intel)
		row.reachable = formatReachable(reachableFromIntel(intel))
		return row
	}

	if readState := snap.readState; readState != nil {
		vmViews, ctViews := readState.VMs(), readState.Containers()
		rows := make([]patrolGuestInventoryRow, 0, len(vmViews)+len(ctViews))
		for _, view := range vmViews {
			if view.Template() || !seedIsInScope(scopedSet, view.ID()) {
				continue
			}
			rows = append(rows, withIntel(patrolGuestInventoryRow{
				id:         view.ID(),
				name:       view.Name(),
				gType:      "VM",
				node:       view.Node(),
				status:     string(view.Status()),
				cpu:        view.CPUPercent(),
				mem:        view.MemoryPercent(),
				disk:       view.DiskPercent(),
				vmid:       view.VMID(),
				ip:         patrolFirstIP(view.IPAddresses()),
				lastBackup: view.LastBackup(),
			}))
		}
		for _, view := range ctViews {
			if view.Template() || !seedIsInScope(scopedSet, view.ID()) {
				continue
			}
			rows = append(rows, withIntel(patrolGuestInventoryRow{
				id:         view.ID(),
				name:       view.Name(),
				gType:      "Container",
				node:       view.Node(),
				status:     string(view.Status()),
				cpu:        view.CPUPercent(),
				mem:        view.MemoryPercent(),
				disk:       view.DiskPercent(),
				vmid:       view.VMID(),
				ip:         patrolFirstIP(view.IPAddresses()),
				lastBackup: view.LastBackup(),
			}))
		}
		return rows
	}

	// Fallback: raw snapshot guests.
	rows := make([]patrolGuestInventoryRow, 0, len(snap.VMs)+len(snap.Containers))
	for _, guest := range snap.VMs {
		if guest.Template || !seedIsInScope(scopedSet, guest.ID) {
			continue
		}
		rows = append(rows, withIntel(patrolGuestInventoryRow{
			id:         guest.ID,
			name:       guest.Name,
			gType:      "VM",
			node:       guest.Node,
			status:     guest.Status,
			cpu:        guest.CPU * 100,
			mem:        guest.Memory.Usage,
			disk:       guest.Disk.Usage,
			vmid:       guest.VMID,
			ip:         patrolFirstIP(guest.IPAddresses),
			lastBackup: guest.LastBackup,
		}))
	}
	for _, guest := range snap.Containers {
		if guest.Template || !seedIsInScope(scopedSet, guest.ID) {
			continue
		}
		rows = append(rows, withIntel(patrolGuestInventoryRow{
			id:         guest.ID,
			name:       guest.Name,
			gType:      "Container",
			node:       guest.Node,
			status:     guest.Status,
			cpu:        guest.CPU * 100,
			mem:        guest.Memory.Usage,
			disk:       guest.Disk.Usage,
			vmid:       guest.VMID,
			ip:         patrolFirstIP(guest.IPAddresses),
			lastBackup: guest.LastBackup,
		}))
	}
	return rows
}
// patrolPBSDatastoreRows returns one row per datastore of every PBS instance
// that passes seedIsInScope for scopedSet.
//
// When snap.readState is set, rows are built from its PBS instance views;
// otherwise the function falls back to snap.PBSInstances. In both paths the
// instance label is the trimmed instance name, falling back to the trimmed
// instance ID when the name is blank, so a datastore is never rendered with an
// empty instance prefix. The result is always non-nil.
func patrolPBSDatastoreRows(snap patrolRuntimeState, scopedSet map[string]bool) []patrolPBSDatastoreRow {
	rows := make([]patrolPBSDatastoreRow, 0)

	if rs := snap.readState; rs != nil {
		for _, pbs := range rs.PBSInstances() {
			if !seedIsInScope(scopedSet, pbs.ID()) {
				continue
			}
			instanceName := strings.TrimSpace(pbs.Name())
			if instanceName == "" {
				instanceName = strings.TrimSpace(pbs.ID())
			}
			for _, ds := range pbs.Datastores() {
				rows = append(rows, patrolPBSDatastoreRow{
					instance: instanceName,
					name:     ds.Name,
					usage:    ds.UsagePercent,
					used:     ds.Used,
					total:    ds.Total,
				})
			}
		}
		return rows
	}

	// Fallback: raw snapshot instances.
	for _, pbs := range snap.PBSInstances {
		if !seedIsInScope(scopedSet, pbs.ID) {
			continue
		}
		// Same label rule as the readState path above.
		instanceName := strings.TrimSpace(pbs.Name)
		if instanceName == "" {
			instanceName = strings.TrimSpace(pbs.ID)
		}
		for _, ds := range pbs.Datastores {
			rows = append(rows, patrolPBSDatastoreRow{
				instance: instanceName,
				name:     ds.Name,
				usage:    ds.Usage,
				used:     ds.Used,
				total:    ds.Total,
			})
		}
	}
	return rows
}
// patrolDockerHostRows returns one row per Docker host that passes
// seedIsInScope for scopedSet, with running/stopped counts and notes for
// running containers whose health is set and is not "healthy".
//
// When snap.readState is set, containers are grouped by their trimmed parent
// ID (falling back to the host source ID) and matched to hosts by host ID; the
// host's ChildCount is used as the container count only when no containers
// were matched. Otherwise the function falls back to snap.DockerHosts.
func patrolDockerHostRows(snap patrolRuntimeState, scopedSet map[string]bool) []patrolDockerHostRow {
	if readState := snap.readState; readState != nil {
		// Index containers by the host they belong to.
		byHost := make(map[string][]*unifiedresources.DockerContainerView)
		for _, container := range readState.DockerContainers() {
			owner := strings.TrimSpace(container.ParentID())
			if owner == "" {
				owner = strings.TrimSpace(container.HostSourceID())
			}
			if owner != "" {
				byHost[owner] = append(byHost[owner], container)
			}
		}

		hostViews := readState.DockerHosts()
		out := make([]patrolDockerHostRow, 0, len(hostViews))
		for _, hostView := range hostViews {
			if !seedIsInScope(scopedSet, hostView.ID()) {
				continue
			}
			label := strings.TrimSpace(hostView.Hostname())
			if label == "" {
				label = strings.TrimSpace(hostView.Name())
			}
			children := byHost[hostView.ID()]
			entry := patrolDockerHostRow{host: label, containerCount: hostView.ChildCount()}
			if len(children) > 0 {
				entry.containerCount = len(children)
			}
			for _, child := range children {
				state := strings.TrimSpace(child.ContainerState())
				health := strings.TrimSpace(child.Health())
				if state != "running" {
					entry.stoppedCount++
					continue
				}
				entry.runningCount++
				if health != "" && health != "healthy" {
					entry.unhealthyContainers = append(entry.unhealthyContainers,
						fmt.Sprintf("%s/%s: health=%s", label, child.Name(), health))
				}
			}
			out = append(out, entry)
		}
		return out
	}

	// Fallback: raw snapshot hosts with embedded containers.
	out := make([]patrolDockerHostRow, 0, len(snap.DockerHosts))
	for _, dockerHost := range snap.DockerHosts {
		if !seedIsInScope(scopedSet, dockerHost.ID) {
			continue
		}
		entry := patrolDockerHostRow{
			host:           dockerHost.Hostname,
			containerCount: len(dockerHost.Containers),
		}
		for _, child := range dockerHost.Containers {
			if child.State != "running" {
				entry.stoppedCount++
				continue
			}
			entry.runningCount++
			if child.Health != "" && child.Health != "healthy" {
				entry.unhealthyContainers = append(entry.unhealthyContainers,
					fmt.Sprintf("%s/%s: health=%s", dockerHost.Hostname, child.Name, child.Health))
			}
		}
		out = append(out, entry)
	}
	return out
}
// patrolAppContainerRows returns one row per Docker container that passes
// seedIsInScope for scopedSet.
//
// When snap.readState is set, rows are built from its Docker container views
// (state trimmed). Otherwise the containers embedded in snap.DockerHosts are
// flattened in host order.
func patrolAppContainerRows(snap patrolRuntimeState, scopedSet map[string]bool) []patrolAppContainerRow {
	if readState := snap.readState; readState != nil {
		views := readState.DockerContainers()
		out := make([]patrolAppContainerRow, 0, len(views))
		for _, view := range views {
			if seedIsInScope(scopedSet, view.ID()) {
				out = append(out, patrolAppContainerRow{
					id:     view.ID(),
					name:   view.Name(),
					status: strings.TrimSpace(view.ContainerState()),
					cpu:    view.CPUPercent(),
					memory: view.MemoryPercent(),
				})
			}
		}
		return out
	}

	// Fallback: size the result for every container across all hosts.
	capacity := 0
	for _, dockerHost := range snap.DockerHosts {
		capacity += len(dockerHost.Containers)
	}
	out := make([]patrolAppContainerRow, 0, capacity)
	for _, dockerHost := range snap.DockerHosts {
		for _, c := range dockerHost.Containers {
			if seedIsInScope(scopedSet, c.ID) {
				out = append(out, patrolAppContainerRow{
					id:     c.ID,
					name:   c.Name,
					status: c.State,
					cpu:    c.CPUPercent,
					memory: c.MemoryPercent,
				})
			}
		}
	}
	return out
}
// patrolStoragePoolRows returns one row per storage pool that passes
// seedIsInScope for scopedSet. Three sources are tried in order:
//
//  1. snap.readState storage pool views,
//  2. snap.unifiedResourceProvider resources of type storage (entries without
//     a Storage payload are skipped),
//  3. the raw snap.Storage slice (no ZFS details).
//
// In every path a blank name falls back to the ID, a blank type becomes "-",
// and a blank node becomes "shared" for shared pools. In the first two paths
// the status defaults to "active", is mapped from the offline/unknown/warning
// resource statuses, and is overridden by a non-blank ZFS pool state on ZFS
// pools.
func patrolStoragePoolRows(snap patrolRuntimeState, scopedSet map[string]bool) []patrolStoragePoolRow {
	if readState := snap.readState; readState != nil {
		pools := readState.StoragePools()
		out := make([]patrolStoragePoolRow, 0, len(pools))
		for _, pool := range pools {
			if !seedIsInScope(scopedSet, pool.ID()) {
				continue
			}
			entry := patrolStoragePoolRow{
				id:       pool.ID(),
				name:     strings.TrimSpace(pool.Name()),
				stype:    strings.TrimSpace(pool.StorageType()),
				node:     strings.TrimSpace(pool.Node()),
				status:   "active",
				used:     pool.DiskUsed(),
				total:    pool.DiskTotal(),
				usage:    pool.DiskPercent(),
				zfsRead:  pool.ZFSReadErrors(),
				zfsWrite: pool.ZFSWriteErrors(),
				zfsCksum: pool.ZFSChecksumErrors(),
			}
			if entry.name == "" {
				entry.name = strings.TrimSpace(pool.ID())
			}
			if entry.stype == "" {
				entry.stype = "-"
			}
			if entry.node == "" && pool.Shared() {
				entry.node = "shared"
			}
			switch pool.Status() {
			case unifiedresources.StatusOffline:
				entry.status = "inactive"
			case unifiedresources.StatusUnknown:
				entry.status = "unknown"
			case unifiedresources.StatusWarning:
				entry.status = "warning"
			}
			// A reported ZFS pool state wins over the generic status.
			if pool.IsZFS() {
				if state := strings.TrimSpace(pool.ZFSPoolState()); state != "" {
					entry.status = state
				}
			}
			entry.hasBytes = entry.total > 0
			entry.hasZFSErrors = pool.IsZFS() && (entry.zfsRead > 0 || entry.zfsWrite > 0 || entry.zfsCksum > 0)
			out = append(out, entry)
		}
		return out
	}

	if provider := snap.unifiedResourceProvider; provider != nil {
		resources := provider.GetByType(unifiedresources.ResourceTypeStorage)
		out := make([]patrolStoragePoolRow, 0, len(resources))
		for _, res := range resources {
			if !seedIsInScope(scopedSet, res.ID) || res.Storage == nil {
				continue
			}
			entry := patrolStoragePoolRow{
				id:       res.ID,
				name:     strings.TrimSpace(res.Name),
				stype:    strings.TrimSpace(res.Storage.Type),
				status:   "active",
				zfsRead:  res.Storage.ZFSReadErrors,
				zfsWrite: res.Storage.ZFSWriteErrors,
				zfsCksum: res.Storage.ZFSChecksumErrors,
			}
			if entry.name == "" {
				entry.name = strings.TrimSpace(res.ID)
			}
			if entry.stype == "" {
				entry.stype = "-"
			}
			if res.Proxmox != nil {
				entry.node = strings.TrimSpace(res.Proxmox.NodeName)
			}
			if entry.node == "" && res.Storage.Shared {
				entry.node = "shared"
			}
			if res.Metrics != nil && res.Metrics.Disk != nil {
				disk := res.Metrics.Disk
				if disk.Used != nil && disk.Total != nil {
					entry.used, entry.total = *disk.Used, *disk.Total
					entry.hasBytes = true
				}
				// Prefer the reported percentage; derive it from bytes otherwise.
				switch {
				case disk.Percent > 0:
					entry.usage = disk.Percent
				case entry.hasBytes && entry.total > 0:
					entry.usage = (float64(entry.used) / float64(entry.total)) * 100
				}
			}
			switch res.Status {
			case unifiedresources.StatusOffline:
				entry.status = "inactive"
			case unifiedresources.StatusUnknown:
				entry.status = "unknown"
			case unifiedresources.StatusWarning:
				entry.status = "warning"
			}
			if res.Storage.IsZFS {
				if state := strings.TrimSpace(res.Storage.ZFSPoolState); state != "" {
					entry.status = state
				}
			}
			entry.hasZFSErrors = res.Storage.IsZFS && (entry.zfsRead > 0 || entry.zfsWrite > 0 || entry.zfsCksum > 0)
			out = append(out, entry)
		}
		return out
	}

	// Last resort: raw snapshot storage entries.
	out := make([]patrolStoragePoolRow, 0, len(snap.Storage))
	for _, store := range snap.Storage {
		if !seedIsInScope(scopedSet, store.ID) {
			continue
		}
		entry := patrolStoragePoolRow{
			id:       store.ID,
			name:     strings.TrimSpace(store.Name),
			stype:    strings.TrimSpace(store.Type),
			node:     strings.TrimSpace(store.Node),
			status:   strings.TrimSpace(store.Status),
			used:     store.Used,
			total:    store.Total,
			hasBytes: store.Total > 0,
			usage:    store.Usage,
		}
		if entry.name == "" {
			entry.name = strings.TrimSpace(store.ID)
		}
		if entry.node == "" && store.Shared {
			entry.node = "shared"
		}
		if entry.stype == "" {
			entry.stype = "-"
		}
		out = append(out, entry)
	}
	return out
}
// patrolPhysicalDiskRows returns one row per physical disk that passes
// seedIsInScope for scopedSet.
//
// When snap.unifiedResourceProvider is set, rows come from its physical-disk
// resources (entries without a PhysicalDisk payload are skipped); a blank
// status becomes "unknown". Otherwise the function falls back to
// snap.PhysicalDisks, where the row name is the model (or device path) and the
// status is derived from the lower-cased health string: "passed"/"ok" map to
// "online", "failed" to "inactive", blank to "unknown", and anything else is
// kept as is. In both paths a blank health is reported as "UNKNOWN".
func patrolPhysicalDiskRows(snap patrolRuntimeState, scopedSet map[string]bool) []patrolPhysicalDiskRow {
	if provider := snap.unifiedResourceProvider; provider != nil {
		resources := provider.GetByType(unifiedresources.ResourceTypePhysicalDisk)
		out := make([]patrolPhysicalDiskRow, 0, len(resources))
		for _, res := range resources {
			if !seedIsInScope(scopedSet, res.ID) || res.PhysicalDisk == nil {
				continue
			}
			disk := res.PhysicalDisk
			entry := patrolPhysicalDiskRow{
				id:          res.ID,
				name:        strings.TrimSpace(res.Name),
				node:        strings.TrimSpace(res.ParentName),
				diskType:    strings.TrimSpace(disk.DiskType),
				sizeBytes:   disk.SizeBytes,
				devPath:     strings.TrimSpace(disk.DevPath),
				model:       strings.TrimSpace(disk.Model),
				health:      strings.TrimSpace(disk.Health),
				status:      strings.TrimSpace(string(res.Status)),
				wearout:     disk.Wearout,
				temperature: disk.Temperature,
			}
			if entry.name == "" {
				entry.name = strings.TrimSpace(res.ID)
			}
			if entry.status == "" {
				entry.status = "unknown"
			}
			if entry.health == "" {
				entry.health = "UNKNOWN"
			}
			out = append(out, entry)
		}
		return out
	}

	// Fallback: raw snapshot disks, with status inferred from health.
	out := make([]patrolPhysicalDiskRow, 0, len(snap.PhysicalDisks))
	for _, disk := range snap.PhysicalDisks {
		if !seedIsInScope(scopedSet, disk.ID) {
			continue
		}
		entry := patrolPhysicalDiskRow{
			id:          disk.ID,
			name:        strings.TrimSpace(disk.Model),
			node:        strings.TrimSpace(disk.Node),
			diskType:    strings.TrimSpace(disk.Type),
			sizeBytes:   disk.Size,
			devPath:     strings.TrimSpace(disk.DevPath),
			model:       strings.TrimSpace(disk.Model),
			health:      strings.TrimSpace(disk.Health),
			wearout:     disk.Wearout,
			temperature: disk.Temperature,
		}
		if entry.name == "" {
			entry.name = strings.TrimSpace(disk.DevPath)
		}
		switch lowered := strings.ToLower(entry.health); lowered {
		case "passed", "ok":
			entry.status = "online"
		case "failed":
			entry.status = "inactive"
		case "":
			entry.status = "unknown"
		default:
			entry.status = lowered
		}
		if entry.health == "" {
			entry.health = "UNKNOWN"
		}
		out = append(out, entry)
	}
	return out
}
// patrolPrecomputeNodeSources converts the in-scope node inventory rows into
// precompute sources, turning the CPU percentage back into a fraction.
func patrolPrecomputeNodeSources(snap patrolRuntimeState, scopedSet map[string]bool) []patrolPrecomputeNodeSource {
	inventory := patrolNodeInventoryRows(snap, scopedSet)
	sources := make([]patrolPrecomputeNodeSource, 0, len(inventory))
	for i := range inventory {
		node := inventory[i]
		var src patrolPrecomputeNodeSource
		src.id = node.id
		src.name = node.name
		src.cpuFraction = node.cpu / 100
		src.memPercent = node.mem
		sources = append(sources, src)
	}
	return sources
}
// patrolPrecomputeGuestSources converts the in-scope guest inventory rows
// into precompute sources, turning the CPU percentage back into a fraction.
// Guest intelligence is not consulted (nil is passed to the inventory
// builder), and template is always false because the inventory builder
// already drops templates.
func patrolPrecomputeGuestSources(snap patrolRuntimeState, scopedSet map[string]bool) []patrolPrecomputeGuestSource {
	inventory := patrolGuestInventoryRows(snap, scopedSet, nil)
	sources := make([]patrolPrecomputeGuestSource, 0, len(inventory))
	for i := range inventory {
		g := inventory[i]
		var src patrolPrecomputeGuestSource
		src.id = g.id
		src.name = g.name
		src.template = false
		src.status = g.status
		src.cpuFraction = g.cpu / 100
		src.memPercent = g.mem
		src.diskPercent = g.disk
		sources = append(sources, src)
	}
	return sources
}
// patrolPrecomputeStorageSources converts the in-scope storage pool rows into
// precompute sources carrying only identity and usage percentage.
func patrolPrecomputeStorageSources(snap patrolRuntimeState, scopedSet map[string]bool) []patrolPrecomputeStorageSource {
	pools := patrolStoragePoolRows(snap, scopedSet)
	sources := make([]patrolPrecomputeStorageSource, 0, len(pools))
	for i := range pools {
		var src patrolPrecomputeStorageSource
		src.id = pools[i].id
		src.name = pools[i].name
		src.usagePercent = pools[i].usage
		sources = append(sources, src)
	}
	return sources
}
// patrolActiveAlertsInScope returns the active alerts whose resource passes
// seedIsInScope for scopedSet. With a nil scopedSet the snapshot's own slice
// is returned as is (not copied).
func patrolActiveAlertsInScope(snap patrolRuntimeState, scopedSet map[string]bool) []models.Alert {
	all := snap.ActiveAlerts
	if scopedSet == nil {
		return all
	}
	scoped := make([]models.Alert, 0, len(all))
	for _, candidate := range all {
		if !seedIsInScope(scopedSet, candidate.ResourceID) {
			continue
		}
		scoped = append(scoped, candidate)
	}
	return scoped
}
// patrolResolvedAlertsInScope returns the recently resolved alerts whose
// resource passes seedIsInScope for scopedSet. With a nil scopedSet the
// snapshot's own slice is returned as is (not copied).
func patrolResolvedAlertsInScope(snap patrolRuntimeState, scopedSet map[string]bool) []models.ResolvedAlert {
	all := snap.RecentlyResolved
	if scopedSet == nil {
		return all
	}
	scoped := make([]models.ResolvedAlert, 0, len(all))
	for _, candidate := range all {
		if !seedIsInScope(scopedSet, candidate.Alert.ResourceID) {
			continue
		}
		scoped = append(scoped, candidate)
	}
	return scoped
}
// patrolConnectionHealthEntries flattens snap.ConnectionHealth into entries
// for the resources that pass seedIsInScope for scopedSet.
//
// It returns nil when the snapshot has no connection health data, and an
// empty non-nil slice when data exists but nothing is in scope. Entries are
// sorted by resource ID: map iteration order is random in Go, and without the
// sort the result would differ from call to call for identical input.
func patrolConnectionHealthEntries(snap patrolRuntimeState, scopedSet map[string]bool) []patrolConnectionHealthEntry {
	if len(snap.ConnectionHealth) == 0 {
		return nil
	}
	entries := make([]patrolConnectionHealthEntry, 0, len(snap.ConnectionHealth))
	for resourceID, healthy := range snap.ConnectionHealth {
		if !seedIsInScope(scopedSet, resourceID) {
			continue
		}
		entries = append(entries, patrolConnectionHealthEntry{
			resourceID: resourceID,
			healthy:    healthy,
		})
	}
	// Map keys are unique, so ordering by resource ID is a total order.
	sort.Slice(entries, func(i, j int) bool {
		return entries[i].resourceID < entries[j].resourceID
	})
	return entries
}
// seedResourceInventoryState renders the resource inventory part of the
// patrol seed context as markdown: nodes, guests, Docker hosts, storage pools
// and physical disks, Ceph clusters, and PBS datastores, in that order.
//
// Nodes, guests, Docker, storage, and PBS are each gated by the matching
// cfg.Analyze* flag and filtered through scopedSet by the row builders. When
// isQuiet is true and scopedSet is nil, the node, guest, and storage sections
// collapse to a one-line summary instead of a full table. now is used to
// render "time ago" values for guest backups; guestIntel supplies the service
// and reachability columns.
//
// The Ceph section is only written when at least one Ceph resource carries a
// Ceph payload, so an empty "# Ceph" header is never emitted.
//
// Returns the rendered markdown, or "" when no section produced output.
func (p *PatrolService) seedResourceInventoryState(snap patrolRuntimeState, scopedSet map[string]bool, cfg PatrolConfig, now time.Time, isQuiet bool, guestIntel map[string]*GuestIntelligence) string {
	var sb strings.Builder
	// Sections condense only for quiet, unscoped runs.
	condensed := isQuiet && scopedSet == nil

	// --- Node Metrics ---
	if cfg.AnalyzeNodes {
		scopedNodes := patrolNodeInventoryRows(snap, scopedSet)
		if len(scopedNodes) > 0 {
			if condensed {
				minCPU, maxCPU := 100.0, 0.0
				minMem, maxMem := 100.0, 0.0
				allHealthy := true
				for _, n := range scopedNodes {
					if n.cpu < minCPU {
						minCPU = n.cpu
					}
					if n.cpu > maxCPU {
						maxCPU = n.cpu
					}
					if n.mem < minMem {
						minMem = n.mem
					}
					if n.mem > maxMem {
						maxMem = n.mem
					}
					if n.status != "online" {
						allHealthy = false
					}
				}
				status := "healthy"
				if !allHealthy {
					status = "mixed"
				}
				fmt.Fprintf(&sb, "# Nodes: All %d %s (CPU %.0f-%.0f%%, Mem %.0f-%.0f%%)\n\n",
					len(scopedNodes), status, minCPU, maxCPU, minMem, maxMem)
			} else {
				sb.WriteString("# Node Metrics\n")
				sb.WriteString("| Node | Status | CPU | Mem | Disk | Load (1/5/15) | Uptime | Updates |\n")
				sb.WriteString("|------|--------|-----|-----|------|---------------|--------|---------|\n")
				for _, n := range scopedNodes {
					load := "—"
					if len(n.load) >= 3 {
						load = fmt.Sprintf("%.1f/%.1f/%.1f", n.load[0], n.load[1], n.load[2])
					}
					uptime := seedFormatDuration(time.Duration(n.uptimeSeconds) * time.Second)
					updates := "—"
					if n.pendingUpdates > 0 {
						updates = fmt.Sprintf("%d", n.pendingUpdates)
					}
					fmt.Fprintf(&sb, "| %s | %s | %.0f%% | %.0f%% | %.0f%% | %s | %s | %s |\n",
						n.name, n.status, n.cpu, n.mem, n.disk, load, uptime, updates)
				}
				sb.WriteString("\n")
			}
		}
	}

	// --- Guest Metrics (VMs + Containers in one table) ---
	var guests []patrolGuestInventoryRow
	if cfg.AnalyzeGuests {
		guests = patrolGuestInventoryRows(snap, scopedSet, guestIntel)
	}
	if len(guests) > 0 {
		if condensed {
			running, stopped := 0, 0
			var unreachableNames []string
			for _, g := range guests {
				if g.status == "running" {
					running++
				} else {
					stopped++
				}
				if g.reachable == "NO" {
					unreachableNames = append(unreachableNames, g.name)
				}
			}
			if len(unreachableNames) > 0 {
				fmt.Fprintf(&sb, "# Guests: %d running, %d stopped. %d UNREACHABLE: %s\n\n",
					running, stopped, len(unreachableNames), strings.Join(unreachableNames, ", "))
			} else {
				// Only claim "All reachable" when at least one guest has
				// reachability data ("-" marks a guest without it).
				hasReachabilityData := false
				for _, g := range guests {
					if g.reachable != "-" {
						hasReachabilityData = true
						break
					}
				}
				if hasReachabilityData {
					fmt.Fprintf(&sb, "# Guests: %d running, %d stopped, no issues detected. All reachable.\n\n", running, stopped)
				} else {
					fmt.Fprintf(&sb, "# Guests: %d running, %d stopped, no issues detected.\n\n", running, stopped)
				}
			}
		} else {
			sb.WriteString("# Guest Metrics\n")
			sb.WriteString("| Name | Type | Node | Service | CPU | Mem | Disk | Status | Reachable | Last Backup |\n")
			sb.WriteString("|------|------|------|---------|-----|-----|------|--------|-----------|-------------|\n")
			for _, g := range guests {
				backup := "never"
				if !g.lastBackup.IsZero() {
					backup = seedFormatTimeAgo(now, g.lastBackup)
				}
				fmt.Fprintf(&sb, "| %s | %s | %s | %s | %.0f%% | %.0f%% | %.0f%% | %s | %s | %s |\n",
					g.name, g.gType, g.node, g.service, g.cpu, g.mem, g.disk, g.status, g.reachable, backup)
			}
			sb.WriteString("\n")

			// Add service health issues section for unreachable running guests.
			var issues []serviceHealthIssue
			for _, g := range guests {
				if g.status == "running" && g.reachable == "NO" {
					svc := g.service
					if svc == "-" {
						svc = g.gType // "VM" or "Container"
					}
					issues = append(issues, serviceHealthIssue{
						name:    g.name,
						service: svc,
						node:    g.node,
					})
				}
			}
			if section := buildServiceHealthIssues(issues); section != "" {
				sb.WriteString(section)
			}
		}
	}

	// --- Docker ---
	if cfg.AnalyzeDocker {
		rows := patrolDockerHostRows(snap, scopedSet)
		if len(rows) > 0 {
			sb.WriteString("# Docker\n")
			sb.WriteString("| Host | Containers | Running | Stopped |\n")
			sb.WriteString("|------|------------|---------|--------|\n")
			for _, row := range rows {
				fmt.Fprintf(&sb, "| %s | %d | %d | %d |\n",
					row.host, row.containerCount, row.runningCount, row.stoppedCount)
			}
			// Health notes follow the table as a bullet list.
			for _, row := range rows {
				for _, issue := range row.unhealthyContainers {
					sb.WriteString("- " + issue + "\n")
				}
			}
			sb.WriteString("\n")
		}
	}

	// --- Storage Pools ---
	if cfg.AnalyzeStorage {
		poolRows := patrolStoragePoolRows(snap, scopedSet)
		diskRows := patrolPhysicalDiskRows(snap, scopedSet)
		// Sort for stable output: pools by name, disks by node/device/name.
		if len(poolRows) > 0 {
			sort.Slice(poolRows, func(i, j int) bool { return poolRows[i].name < poolRows[j].name })
		}
		if len(diskRows) > 0 {
			sort.Slice(diskRows, func(i, j int) bool {
				if diskRows[i].node != diskRows[j].node {
					return diskRows[i].node < diskRows[j].node
				}
				if diskRows[i].devPath != diskRows[j].devPath {
					return diskRows[i].devPath < diskRows[j].devPath
				}
				return diskRows[i].name < diskRows[j].name
			})
		}
		if len(poolRows) > 0 || len(diskRows) > 0 {
			if condensed {
				parts := make([]string, 0, 2)
				if len(poolRows) > 0 {
					minUsage, maxUsage := 100.0, 0.0
					for _, row := range poolRows {
						if row.usage < minUsage {
							minUsage = row.usage
						}
						if row.usage > maxUsage {
							maxUsage = row.usage
						}
					}
					parts = append(parts, fmt.Sprintf("%d %s (%.0f-%.0f%% used)", len(poolRows), seedCountLabel(len(poolRows), "pool", "pools"), minUsage, maxUsage))
				}
				if len(diskRows) > 0 {
					diskIssues := 0
					for _, row := range diskRows {
						// A disk has an issue when its health is set to something
						// other than PASSED/UNKNOWN/OK, or its status is not "online".
						if (!strings.EqualFold(row.health, "PASSED") && !strings.EqualFold(row.health, "UNKNOWN") && !strings.EqualFold(row.health, "OK") && row.health != "") || row.status != "online" {
							diskIssues++
						}
					}
					diskSummary := fmt.Sprintf("%d %s healthy", len(diskRows), seedCountLabel(len(diskRows), "disk", "disks"))
					if diskIssues > 0 {
						diskSummary = fmt.Sprintf("%d %s, %d with issues", len(diskRows), seedCountLabel(len(diskRows), "disk", "disks"), diskIssues)
					}
					parts = append(parts, diskSummary)
				}
				fmt.Fprintf(&sb, "# Storage: %s.\n\n", strings.Join(parts, "; "))
			} else {
				sb.WriteString("# Storage\n")
				if len(poolRows) > 0 {
					sb.WriteString("## Pools\n")
					sb.WriteString("| Pool | Type | Node | Usage | Used | Total | Status |\n")
					sb.WriteString("|------|------|------|-------|------|-------|--------|\n")
					for _, row := range poolRows {
						usedStr, totalStr := "—", "—"
						if row.hasBytes {
							usedStr, totalStr = seedFormatBytes(row.used), seedFormatBytes(row.total)
						}
						node := row.node
						if node == "" {
							node = "—"
						}
						fmt.Fprintf(&sb, "| %s | %s | %s | %.0f%% | %s | %s | %s |\n",
							row.name, row.stype, node, row.usage, usedStr, totalStr, row.status)
					}
					for _, row := range poolRows {
						if row.hasZFSErrors {
							fmt.Fprintf(&sb, "- %s ZFS errors: read=%d write=%d checksum=%d\n",
								row.name, row.zfsRead, row.zfsWrite, row.zfsCksum)
						}
					}
				}
				if len(diskRows) > 0 {
					if len(poolRows) > 0 {
						sb.WriteString("\n")
					}
					sb.WriteString("## Physical Disks\n")
					sb.WriteString("| Disk | Node | Type | Size | Health | Wear | Temp | Status |\n")
					sb.WriteString("|------|------|------|------|--------|------|------|--------|\n")
					for _, row := range diskRows {
						diskName := row.devPath
						if diskName == "" {
							diskName = row.name
						}
						if row.model != "" {
							diskName = fmt.Sprintf("%s (%s)", diskName, row.model)
						}
						node := row.node
						if node == "" {
							node = "—"
						}
						diskType := row.diskType
						if diskType == "" {
							diskType = "—"
						}
						size := "—"
						if row.sizeBytes > 0 {
							size = seedFormatBytes(row.sizeBytes)
						}
						wear := "—"
						if row.wearout >= 0 {
							wear = fmt.Sprintf("%d%%", row.wearout)
						}
						temp := "—"
						if row.temperature > 0 {
							temp = fmt.Sprintf("%dC", row.temperature)
						}
						fmt.Fprintf(&sb, "| %s | %s | %s | %s | %s | %s | %s | %s |\n",
							diskName, node, diskType, size, row.health, wear, temp, row.status)
					}
				}
				sb.WriteString("\n\n")
			}
		}
	}

	// --- Ceph Clusters ---
	// NOTE(review): unlike the other sections, Ceph is neither gated by a
	// cfg.Analyze* flag nor filtered through scopedSet — confirm this is
	// intentional.
	if urp := snap.unifiedResourceProvider; urp != nil {
		// Buffer the entries so the header is only written when there is
		// something to list (resources without a Ceph payload are skipped).
		var ceph strings.Builder
		for _, r := range urp.GetByType(unifiedresources.ResourceTypeCeph) {
			if r.Ceph == nil {
				continue
			}
			c := r.Ceph
			usedBytes, totalBytes := seedCephBytes(r)
			usagePercent := 0.0
			if totalBytes > 0 {
				usagePercent = float64(usedBytes) / float64(totalBytes) * 100
			}
			fmt.Fprintf(&ceph, "- %s: %s — %.0f%% used (%s / %s), %d OSDs (%d up, %d in)\n",
				r.Name, c.HealthStatus, usagePercent,
				seedFormatBytes(usedBytes), seedFormatBytes(totalBytes),
				c.NumOSDs, c.NumOSDsUp, c.NumOSDsIn)
			if c.HealthMessage != "" && c.HealthStatus != "HEALTH_OK" {
				fmt.Fprintf(&ceph, "  Message: %s\n", c.HealthMessage)
			}
		}
		if ceph.Len() > 0 {
			sb.WriteString("# Ceph\n")
			sb.WriteString(ceph.String())
			sb.WriteString("\n")
		}
	}

	// --- PBS Instances ---
	if cfg.AnalyzePBS {
		rows := patrolPBSDatastoreRows(snap, scopedSet)
		if len(rows) > 0 {
			sb.WriteString("# PBS Datastores\n")
			for _, row := range rows {
				fmt.Fprintf(&sb, "- %s/%s: %.0f%% used (%s / %s)\n",
					row.instance, row.name, row.usage,
					seedFormatBytes(row.used), seedFormatBytes(row.total))
			}
			sb.WriteString("\n")
		}
	}
	return sb.String()
}
// seedResourceInventorySummaryState builds a compact, always-condensed
// inventory snapshot. Unlike the quiet mode of seedResourceInventoryState,
// which only condenses unscoped runs, this summary condenses even when
// scopedSet is non-nil.
//
// One line is produced per enabled category (nodes, guests, storage) that has
// at least one in-scope resource. Returns "" when no line was produced. The
// now parameter is accepted but not used.
func (p *PatrolService) seedResourceInventorySummaryState(snap patrolRuntimeState, scopedSet map[string]bool, cfg PatrolConfig, now time.Time, guestIntel map[string]*GuestIntelligence) string {
	_ = now
	var summary []string

	// --- Nodes ---
	if cfg.AnalyzeNodes {
		if nodeRows := patrolNodeInventoryRows(snap, scopedSet); len(nodeRows) > 0 {
			byStatus := map[string]int{}
			// Seed the ranges from the first node so they reflect real values.
			cpuLow, cpuHigh := nodeRows[0].cpu, nodeRows[0].cpu
			memLow, memHigh := nodeRows[0].mem, nodeRows[0].mem
			var hot []string
			for _, node := range nodeRows {
				byStatus[node.status]++
				if node.cpu < cpuLow {
					cpuLow = node.cpu
				}
				if node.cpu > cpuHigh {
					cpuHigh = node.cpu
				}
				if node.mem < memLow {
					memLow = node.mem
				}
				if node.mem > memHigh {
					memHigh = node.mem
				}
				if label, flagged := seedOutlierLabel(node.name, node.cpu, node.mem, node.disk); flagged {
					hot = append(hot, label)
				}
			}
			text := fmt.Sprintf("Nodes: %d (%s), CPU %.0f-%.0f%%, Mem %.0f-%.0f%%",
				len(nodeRows),
				seedFormatStatusBreakdown(byStatus, []string{"online", "offline", "unknown"}),
				cpuLow, cpuHigh, memLow, memHigh)
			if len(hot) > 0 {
				text += fmt.Sprintf(". High usage: %s", seedTruncateOutlierList(hot, 5))
			}
			summary = append(summary, text)
		}
	}

	// --- Guests ---
	if cfg.AnalyzeGuests {
		if guestRows := patrolGuestInventoryRows(snap, scopedSet, guestIntel); len(guestRows) > 0 {
			byStatus := map[string]int{}
			var hot []string
			for _, guest := range guestRows {
				byStatus[guest.status]++
				if label, flagged := seedOutlierLabel(guest.name, guest.cpu, guest.mem, guest.disk); flagged {
					hot = append(hot, label)
				}
			}
			text := fmt.Sprintf("Guests: %d (%s)",
				len(guestRows),
				seedFormatStatusBreakdown(byStatus, []string{"running", "stopped", "paused"}))
			if len(hot) > 0 {
				text += fmt.Sprintf(". High usage: %s", seedTruncateOutlierList(hot, 5))
			}
			// Count in-scope intel entries explicitly marked unreachable. This
			// walks guestIntel itself rather than the guest rows.
			unreachable := 0
			for guestID, intel := range guestIntel {
				if intel != nil && intel.Reachable != nil && !*intel.Reachable && seedIsInScope(scopedSet, guestID) {
					unreachable++
				}
			}
			if unreachable > 0 {
				text += fmt.Sprintf(". Unreachable: %d", unreachable)
			}
			summary = append(summary, text)
		}
	}

	// --- Storage ---
	if cfg.AnalyzeStorage {
		poolRows := patrolStoragePoolRows(snap, scopedSet)
		diskRows := patrolPhysicalDiskRows(snap, scopedSet)
		if len(poolRows)+len(diskRows) > 0 {
			byStatus := map[string]int{}
			var fullPools []string
			for _, pool := range poolRows {
				byStatus[strings.ToLower(strings.TrimSpace(pool.status))]++
				if pool.usage > 80 {
					fullPools = append(fullPools, fmt.Sprintf("%s (%.0f%%)", pool.name, pool.usage))
				}
			}
			var badDisks []string
			for _, disk := range diskRows {
				byStatus[strings.ToLower(strings.TrimSpace(disk.status))]++
				// Healthy means: health blank or PASSED/UNKNOWN/OK, and status "online".
				healthOK := disk.health == "" ||
					strings.EqualFold(disk.health, "PASSED") ||
					strings.EqualFold(disk.health, "UNKNOWN") ||
					strings.EqualFold(disk.health, "OK")
				if healthOK && disk.status == "online" {
					continue
				}
				label := disk.devPath
				if label == "" {
					label = disk.name
				}
				badDisks = append(badDisks, fmt.Sprintf("%s (%s)", label, disk.health))
			}
			text := fmt.Sprintf("Storage: %d resources (%d %s, %d %s; %s)",
				len(poolRows)+len(diskRows),
				len(poolRows),
				seedCountLabel(len(poolRows), "pool", "pools"),
				len(diskRows),
				seedCountLabel(len(diskRows), "disk", "disks"),
				seedFormatStatusBreakdown(byStatus, []string{"active", "online", "warning", "degraded", "inactive", "unknown"}))
			if len(fullPools) > 0 {
				text += fmt.Sprintf(". High usage: %s", seedTruncateOutlierList(fullPools, 5))
			}
			if len(badDisks) > 0 {
				text += fmt.Sprintf(". Disk issues: %s", seedTruncateOutlierList(badDisks, 5))
			}
			summary = append(summary, text)
		}
	}

	if len(summary) == 0 {
		return ""
	}
	var sb strings.Builder
	sb.WriteString("# Infrastructure Summary (condensed)\n")
	for _, text := range summary {
		sb.WriteString(text + "\n")
	}
	sb.WriteString("\n")
	return sb.String()
}
// seedPMGSnapshotStringState is a convenience wrapper around
// seedPMGSnapshotState that renders the PMG section into a fresh builder and
// returns it as a string.
func (p *PatrolService) seedPMGSnapshotStringState(snap patrolRuntimeState, scopedSet map[string]bool, cfg PatrolConfig, isQuiet bool) string {
	out := new(strings.Builder)
	p.seedPMGSnapshotState(out, snap, scopedSet, cfg, isQuiet)
	return out.String()
}
// seedPMGSnapshotState appends Proxmox Mail Gateway status to sb.
//
// Nothing is written when cfg.AnalyzePMG is false, snap.readState is nil, or
// no PMG instance passes seedIsInScope for scopedSet. For quiet, unscoped runs
// where every gateway is "online" and none has an average mail processing
// time above 5000 ms, a single summary line is written; otherwise a full table
// with one row per gateway is written.
func (p *PatrolService) seedPMGSnapshotState(sb *strings.Builder, snap patrolRuntimeState, scopedSet map[string]bool, cfg PatrolConfig, isQuiet bool) {
	if !cfg.AnalyzePMG || snap.readState == nil {
		return
	}
	allViews := snap.readState.PMGInstances()
	if len(allViews) == 0 {
		return
	}
	var gateways []*unifiedresources.PMGInstanceView
	for _, view := range allViews {
		if !seedIsInScope(scopedSet, view.ID()) {
			continue
		}
		gateways = append(gateways, view)
	}
	if len(gateways) == 0 {
		return
	}

	if isQuiet && scopedSet == nil {
		degraded := false
		for _, gw := range gateways {
			if string(gw.Status()) != "online" {
				degraded = true
				break
			}
			if stats := gw.MailStats(); stats != nil && stats.AverageProcessTimeMs > 5000 {
				degraded = true
				break
			}
		}
		if !degraded {
			fmt.Fprintf(sb, "# PMG: %d gateways, all healthy and processing mail normally.\n\n", len(gateways))
			return
		}
		// Something looks off: fall through to the full table.
	}

	sb.WriteString("# Proxmox Mail Gateway (PMG)\n")
	sb.WriteString("| Instance | Status | Version | In/Out | Spam/Virus | Avg Time | Queue (Active/Deferred/Hold) |\n")
	sb.WriteString("|----------|--------|---------|--------|------------|----------|------------------------------|\n")
	for _, gw := range gateways {
		version := strings.TrimSpace(gw.Version())
		if version == "" {
			version = "—"
		}
		// Mail statistics columns stay as dashes when no stats are available.
		traffic, spamVirus, avgTime := "—", "—", "—"
		if stats := gw.MailStats(); stats != nil {
			traffic = fmt.Sprintf("%.0f/%.0f", stats.CountIn, stats.CountOut)
			spamVirus = fmt.Sprintf("%.0f/%.0f", stats.SpamIn+stats.SpamOut, stats.VirusIn+stats.VirusOut)
			avgTime = fmt.Sprintf("%.0fms", stats.AverageProcessTimeMs)
		}
		queue := fmt.Sprintf("%d/%d/%d", gw.QueueActive(), gw.QueueDeferred(), gw.QueueHold())
		fmt.Fprintf(sb, "| %s | %s | %s | %s | %s | %s | %s |\n",
			gw.Name(), string(gw.Status()), version, traffic, spamVirus, avgTime, queue)
	}
	sb.WriteString("\n")
}
// seedBackupAnalysisState builds the "# Backup Status" section.
//
// The most recent backup per guest name is collected from four sources: PVE
// backup tasks with status "OK", PVE storage backups, PBS backups, and the
// last-backup time carried by each guest row. PVE and PBS entries are matched
// to guests by VMID; in scoped runs (scopedSet non-nil), entries that do not
// resolve to an in-scope guest name are ignored. Guests with no backup, or
// whose latest backup is older than 48 hours before now, are listed as stale.
//
// Returns "" when there are no in-scope guests.
//
// NOTE(review): backups are keyed by guest name while the total is the number
// of guest rows, so guests sharing a name are counted once in the recent/stale
// tally but individually in the total — confirm whether that is acceptable.
func (p *PatrolService) seedBackupAnalysisState(snap patrolRuntimeState, scopedSet map[string]bool, now time.Time) string {
	type backupInfo struct {
		lastBackup time.Time
		source     string
	}

	guestRows := patrolGuestInventoryRows(snap, scopedSet, nil)
	if snap.readState == nil {
		log.Warn().Msg("seedBackupAnalysis: ReadState not wired, backup analysis will be incomplete")
	}
	totalGuests := len(guestRows)
	if totalGuests == 0 {
		return ""
	}
	scoped := scopedSet != nil

	latest := make(map[string]*backupInfo)
	// remember keeps the newest backup time seen for a guest name.
	remember := func(guestName string, at time.Time, source string) {
		if prev, seen := latest[guestName]; !seen || at.After(prev.lastBackup) {
			latest[guestName] = &backupInfo{lastBackup: at, source: source}
		}
	}

	nameByVMID := make(map[int]string)
	for _, guest := range guestRows {
		if guest.vmid > 0 {
			nameByVMID[guest.vmid] = guest.name
		}
	}

	for _, task := range snap.PVEBackups.BackupTasks {
		if task.Status != "OK" {
			continue
		}
		guestName := nameByVMID[task.VMID]
		if guestName == "" {
			if scoped {
				continue
			}
			guestName = fmt.Sprintf("vmid-%d", task.VMID)
		}
		remember(guestName, task.EndTime, "pve")
	}

	for _, stored := range snap.PVEBackups.StorageBackups {
		guestName := nameByVMID[stored.VMID]
		if guestName == "" {
			if scoped {
				continue
			}
			guestName = fmt.Sprintf("vmid-%d", stored.VMID)
		}
		remember(guestName, stored.Time, "pve")
	}

	for _, pbsBackup := range snap.PBSBackups {
		guestName := pbsBackup.VMID
		if vmid, err := strconv.Atoi(pbsBackup.VMID); err == nil {
			if known := nameByVMID[vmid]; known != "" {
				guestName = known
			}
		}
		// In scoped runs, skip entries whose name is still the raw VMID string.
		if scoped && guestName == pbsBackup.VMID {
			continue
		}
		remember(guestName, pbsBackup.BackupTime, "pbs")
	}

	for _, guest := range guestRows {
		if !guest.lastBackup.IsZero() {
			remember(guest.name, guest.lastBackup, "pve")
		}
	}

	// Classify each distinct guest name as stale or recently backed up.
	distinctNames := make(map[string]bool, totalGuests)
	for _, guest := range guestRows {
		distinctNames[guest.name] = true
	}
	cutoff := now.Add(-48 * time.Hour)
	recentCount := 0
	var staleGuests []string
	for guestName := range distinctNames {
		info, found := latest[guestName]
		switch {
		case !found:
			staleGuests = append(staleGuests, fmt.Sprintf("%s (never)", guestName))
		case info.lastBackup.Before(cutoff):
			staleGuests = append(staleGuests, fmt.Sprintf("%s (last: %s)", guestName, seedFormatTimeAgo(now, info.lastBackup)))
		default:
			recentCount++
		}
	}
	// Map iteration order is random; sort so the output is stable.
	sort.Strings(staleGuests)

	var sb strings.Builder
	sb.WriteString("# Backup Status\n")
	if len(staleGuests) > 0 {
		fmt.Fprintf(&sb, "Guests with no backup in >48h: %s\n", strings.Join(staleGuests, ", "))
	}
	fmt.Fprintf(&sb, "Guests with recent backups: %d/%d\n", recentCount, totalGuests)
	sb.WriteString("\n")
	return sb.String()
}
// seedHealthAndAlertsState builds the disk health, active alerts, recently
// resolved alerts, connection health, Kubernetes, and hosts sections of the
// seed context.
//
// The Kubernetes and hosts sections are produced only when enabled in cfg and
// snap carries a ReadState; a warning is logged when one of them is enabled but
// the ReadState is missing. Each of those two sections is written only when at
// least one entry passes the scopedSet filter, so a scoped run never emits a
// header with nothing under it.
func (p *PatrolService) seedHealthAndAlertsState(snap patrolRuntimeState, scopedSet map[string]bool, cfg PatrolConfig, now time.Time) string {
	var sb strings.Builder
	rs := snap.readState
	if rs == nil && (cfg.AnalyzeKubernetes || cfg.AnalyzeHosts) {
		log.Warn().Msg("seedHealthAndAlerts: ReadState not wired, Kubernetes/Hosts sections will be omitted")
	}

	// --- Disk Health ---
	diskURP := snap.unifiedResourceProvider
	if diskURP != nil {
		diskResources := diskURP.GetByType(unifiedresources.ResourceTypePhysicalDisk)
		if len(diskResources) > 0 {
			// A disk counts as problematic when SMART health is a non-passing
			// value, wearout is reported and below 20, or temperature exceeds 55.
			hasIssues := false
			for _, r := range diskResources {
				if r.PhysicalDisk == nil {
					continue
				}
				d := r.PhysicalDisk
				if (d.Health != "PASSED" && d.Health != "UNKNOWN" && d.Health != "OK" && d.Health != "") || (d.Wearout > 0 && d.Wearout < 20) || d.Temperature > 55 {
					hasIssues = true
					break
				}
			}
			sb.WriteString("# Disk Health\n")
			if !hasIssues {
				fmt.Fprintf(&sb, "All %d disks healthy (SMART PASSED).\n", len(diskResources))
			} else {
				// Once any disk has an issue, list every disk so the model
				// sees the full picture.
				sb.WriteString("| Node | Device | Model | Health | Wearout | Temp |\n")
				sb.WriteString("|------|--------|-------|--------|---------|------|\n")
				for _, r := range diskResources {
					if r.PhysicalDisk == nil {
						continue
					}
					d := r.PhysicalDisk
					node := r.ParentName
					if node == "" && len(r.Identity.Hostnames) > 0 {
						node = r.Identity.Hostnames[0]
					}
					wearout := "—"
					if d.Wearout >= 0 {
						wearout = fmt.Sprintf("%d%%", d.Wearout)
					}
					temp := "—"
					if d.Temperature > 0 {
						temp = fmt.Sprintf("%d°C", d.Temperature)
					}
					fmt.Fprintf(&sb, "| %s | %s | %s | %s | %s | %s |\n",
						node, d.DevPath, d.Model, d.Health, wearout, temp)
				}
			}
			sb.WriteString("\n")
		}
	}

	// --- Active Alerts ---
	if alerts := patrolActiveAlertsInScope(snap, scopedSet); len(alerts) > 0 {
		sb.WriteString("# Active Alerts\n")
		for _, a := range alerts {
			since := seedFormatTimeAgo(now, a.StartTime)
			fmt.Fprintf(&sb, "- [%s] %s — since %s\n", a.Level, a.Message, since)
		}
		sb.WriteString("\n")
	}

	// --- Recently Resolved Alerts ---
	if alerts := patrolResolvedAlertsInScope(snap, scopedSet); len(alerts) > 0 {
		sb.WriteString("# Recently Resolved Alerts\n")
		for _, r := range alerts {
			ago := seedFormatTimeAgo(now, r.ResolvedTime)
			fmt.Fprintf(&sb, "- %s — resolved %s\n", r.Alert.Message, ago)
		}
		sb.WriteString("\n")
	}

	// --- Connection Health ---
	if entries := patrolConnectionHealthEntries(snap, scopedSet); len(entries) > 0 {
		var disconnected []string
		for _, entry := range entries {
			if !entry.healthy {
				disconnected = append(disconnected, entry.resourceID)
			}
		}
		sb.WriteString("# Connections\n")
		if len(disconnected) == 0 {
			fmt.Fprintf(&sb, "All %d instances connected.\n", len(entries))
		} else {
			sort.Strings(disconnected)
			fmt.Fprintf(&sb, "Disconnected: %s\n", strings.Join(disconnected, ", "))
			fmt.Fprintf(&sb, "Connected: %d/%d\n",
				len(entries)-len(disconnected), len(entries))
		}
		sb.WriteString("\n")
	}

	// --- Kubernetes ---
	// Uses canonical ReadState view surface — legacy state fallbacks removed.
	if cfg.AnalyzeKubernetes && rs != nil {
		// Collect entries first so the header is skipped when scope filtering
		// removes every cluster.
		var section strings.Builder
		for _, kv := range rs.K8sClusters() {
			if !seedIsInScope(scopedSet, kv.ID()) {
				continue
			}
			fmt.Fprintf(&section, "- %s (Nodes: %d)\n", kv.Name(), kv.ChildCount())
		}
		if section.Len() > 0 {
			sb.WriteString("# Kubernetes Clusters\n")
			sb.WriteString(section.String())
			sb.WriteString("\n")
		}
	}

	// --- Hosts ---
	// Uses canonical ReadState view surface — legacy state fallbacks removed.
	if cfg.AnalyzeHosts && rs != nil {
		// Same header-on-demand approach as the Kubernetes section.
		var section strings.Builder
		for _, hv := range rs.Hosts() {
			if !seedIsInScope(scopedSet, hv.ID()) {
				continue
			}
			name := hv.Hostname()
			if strings.TrimSpace(name) == "" {
				name = hv.Name()
			}
			fmt.Fprintf(&section, "- %s (ID: %s)\n", name, hv.ID())
		}
		if section.Len() > 0 {
			sb.WriteString("# Hosts\n")
			sb.WriteString(section.String())
			sb.WriteString("\n")
		}
	}

	return sb.String()
}
// seedIntelligenceContext renders the pre-computed intelligence data into seed
// context sections: anomalies, capacity forecasts, failure predictions, recent
// infrastructure changes, and known resource correlations. The anomalies
// section is written whenever a baseline store is present (even with zero
// anomalies); every other section is written only when it has entries.
func (p *PatrolService) seedIntelligenceContext(intel seedIntelligence, now time.Time) string {
	var out strings.Builder

	// --- Anomalies ---
	if intel.hasBaselineStore {
		out.WriteString("# Anomalies\n")
		if len(intel.anomalies) == 0 {
			out.WriteString("No anomalies detected. All resources within learned baseline ranges.\n")
		}
		for _, anomaly := range intel.anomalies {
			label := anomaly.ResourceName
			if label == "" {
				label = anomaly.ResourceID
			}
			current, mean := anomaly.CurrentValue, anomaly.BaselineMean
			// CPU values are scaled by 100 before being printed as percentages.
			if anomaly.Metric == "cpu" {
				current *= 100
				mean *= 100
			}
			fmt.Fprintf(&out, "- [%s] %s %s: %.1fσ above baseline (current: %.0f%%, baseline: %.0f%%)\n",
				anomaly.Severity, label, anomaly.Metric, anomaly.ZScore, current, mean)
		}
		out.WriteString("\n")
	}

	// --- Capacity Forecasts ---
	if len(intel.forecasts) > 0 {
		out.WriteString("# Capacity Forecasts\n")
		for _, forecast := range intel.forecasts {
			fmt.Fprintf(&out, "- [%s] %s %s: full in ~%d days (current: %.0f%%, growing +%.1f%%/day)\n",
				forecast.severity, forecast.name, forecast.metric, forecast.daysToFull, forecast.current, forecast.dailyChange)
		}
		out.WriteString("\n")
	}

	// --- Failure Predictions ---
	if len(intel.predictions) > 0 {
		out.WriteString("# Failure Predictions\n")
		out.WriteString("Based on historical patterns of recurring events:\n")
		for _, prediction := range intel.predictions {
			fmt.Fprintf(&out, "- %s: %s predicted in %.0f days (confidence: %.0f%%) — %s\n",
				prediction.ResourceID, string(prediction.EventType), prediction.DaysUntil, prediction.Confidence*100, prediction.Basis)
		}
		out.WriteString("\n")
	}

	// --- Recent Infrastructure Changes ---
	if len(intel.recentChanges) > 0 {
		out.WriteString("# Recent Infrastructure Changes (last 24h)\n")
		for _, change := range intel.recentChanges {
			label := change.ResourceName
			if label == "" {
				label = change.ResourceID
			}
			when := seedFormatTimeAgo(now, change.DetectedAt)
			// The resource type is shown in parentheses only when it is set.
			if strings.TrimSpace(change.ResourceType) == "" {
				fmt.Fprintf(&out, "- %s: %s (%s)\n", label, change.Description, when)
				continue
			}
			fmt.Fprintf(&out, "- %s (%s): %s (%s)\n", label, change.ResourceType, change.Description, when)
		}
		out.WriteString("\n")
	}

	// --- Known Resource Correlations ---
	if len(intel.correlations) > 0 {
		out.WriteString("# Known Resource Correlations\n")
		for _, corr := range intel.correlations {
			summary := correlation.FormatCorrelationSummary(corr)
			if summary == "" {
				continue
			}
			out.WriteString("- " + summary + "\n")
		}
		out.WriteString("\n")
	}

	return out.String()
}
// seedFindingsAndContextState builds the alert thresholds, active findings,
// dismissed/snoozed findings, and user notes sections of the seed context.
//
// It returns the rendered text together with the IDs of the active findings
// that were written into the "Active Findings to Re-check" section.
//
// Side effect: an active finding whose resource is absent from the full current
// runtime state is resolved with the reason "Resource no longer exists in
// infrastructure" and the unified finding resolver (if set) is notified.
// Findings that are merely outside the scoped snapshot are skipped, not
// resolved. The active-findings header is written only when at least one
// finding is actually listed.
func (p *PatrolService) seedFindingsAndContextState(scope *PatrolScope, snap patrolRuntimeState) (string, []string) {
	var sb strings.Builder

	// --- Alert Thresholds ---
	p.mu.RLock()
	thresholds := p.thresholds
	p.mu.RUnlock()
	sb.WriteString("# Alert Thresholds\n")
	fmt.Fprintf(&sb, "- Node CPU warning: %.0f%%\n", thresholds.NodeCPUWarning)
	fmt.Fprintf(&sb, "- Node Memory warning: %.0f%%\n", thresholds.NodeMemWarning)
	fmt.Fprintf(&sb, "- Guest Memory warning: %.0f%%\n", thresholds.GuestMemWarning)
	fmt.Fprintf(&sb, "- Guest Disk warning: %.0f%%, critical: %.0f%%\n", thresholds.GuestDiskWarn, thresholds.GuestDiskCrit)
	fmt.Fprintf(&sb, "- Storage warning: %.0f%%, critical: %.0f%%\n", thresholds.StorageWarning, thresholds.StorageCritical)
	sb.WriteString("Note: The real-time alerting system monitors these thresholds continuously. Do NOT report findings for threshold breaches — focus on trends, capacity planning, and issues alerts cannot detect.\n\n")

	// Resources known to the (possibly scoped) snapshot versus the full
	// current runtime state.
	scopedResources := patrolRuntimeKnownResources(snap)
	stateHasScopedResources := len(scopedResources) > 0
	current := p.currentPatrolRuntimeState()
	globalResources := patrolRuntimeKnownResources(current)
	stateHasGlobalResources := len(globalResources) > 0

	// --- Active Findings to Re-check ---
	activeFindings := p.findings.GetActive(FindingSeverityInfo)
	var seededFindingIDs []string
	// Entries are buffered so the section header can be skipped when every
	// active finding is auto-resolved or out of scope.
	var findingLines strings.Builder
	for _, f := range activeFindings {
		usesSyntheticRuntimeResource := patrolFindingUsesSyntheticRuntimeResource(f)
		inScopedState := usesSyntheticRuntimeResource || !stateHasScopedResources || scopedResources[f.ResourceID] || scopedResources[f.ResourceName]
		inGlobalState := usesSyntheticRuntimeResource || !stateHasGlobalResources || globalResources[f.ResourceID] || globalResources[f.ResourceName]

		// Auto-resolve findings only when the resource is gone from the full current state.
		// Scoped patrols should skip out-of-scope findings, not resolve them as deleted.
		if stateHasGlobalResources && !inGlobalState {
			if ok := p.findings.ResolveWithReason(f.ID, "Resource no longer exists in infrastructure"); ok {
				// Notify unified store
				p.mu.RLock()
				resolveUnified := p.unifiedFindingResolver
				p.mu.RUnlock()
				if resolveUnified != nil {
					resolveUnified(f.ID)
				}
				log.Info().
					Str("finding_id", f.ID).
					Str("resource_id", f.ResourceID).
					Str("resource_name", f.ResourceName).
					Msg("AI Patrol: Auto-resolved finding for deleted resource")
			}
			continue
		}
		if !inScopedState {
			continue
		}

		fmt.Fprintf(&findingLines, "- [%s] %s on %s (ID: %s, Severity: %s, Detected: %s)\n",
			f.ID, f.Title, f.ResourceName, f.ResourceID, f.Severity, f.DetectedAt.Format("2006-01-02 15:04"))
		if f.UserNote != "" {
			fmt.Fprintf(&findingLines, "  User note: %q\n", f.UserNote)
		}
		seededFindingIDs = append(seededFindingIDs, f.ID)
	}
	if len(seededFindingIDs) > 0 {
		sb.WriteString("# Active Findings to Re-check\n")
		sb.WriteString("Verify whether these findings are still valid. Resolve any that are no longer issues.\n\n")
		sb.WriteString(findingLines.String())
		sb.WriteString("\n")
	}

	// --- Dismissed/Snoozed Findings ---
	feedbackContext := p.findings.GetDismissedForContextForResources(scopedResources)
	if feedbackContext != "" {
		sb.WriteString("# User Feedback on Previous Findings\n")
		sb.WriteString("Do NOT re-raise findings the user has dismissed or snoozed.\n\n")
		sb.WriteString(feedbackContext)
		sb.WriteString("\n\n")
	}

	// --- User Notes from Knowledge Store ---
	p.mu.RLock()
	knowledgeStore := p.knowledgeStore
	p.mu.RUnlock()
	if knowledgeStore != nil {
		var knowledgeContext string
		// Prefer resource IDs from the snapshot; fall back to the explicit
		// scope's IDs, and to all notes only for unscoped runs.
		scopedKnowledgeIDs := patrolRuntimeResourceIDs(snap)
		if len(scopedKnowledgeIDs) == 0 && scope != nil {
			scopedKnowledgeIDs = append(scopedKnowledgeIDs, scope.ResourceIDs...)
		}
		if len(scopedKnowledgeIDs) > 0 {
			knowledgeContext = knowledgeStore.FormatForContextForResources(scopedKnowledgeIDs)
		} else if scope == nil {
			knowledgeContext = knowledgeStore.FormatAllForContext()
		}
		if knowledgeContext != "" {
			sb.WriteString("# User Notes\n")
			sb.WriteString(knowledgeContext)
			sb.WriteString("\n\n")
		}
	}

	return sb.String(), seededFindingIDs
}
// seedOutlierLabel returns a label such as "vm1 (CPU 93%)" for the highest of
// the cpu, mem, and disk values that is strictly above 80. On a tie the earlier
// metric in CPU, Mem, Disk order wins. The boolean is false (and the label
// empty) when no value exceeds 80.
func seedOutlierLabel(name string, cpu, mem, disk float64) (string, bool) {
	const outlierThreshold = 80.0

	labels := [...]string{"CPU", "Mem", "Disk"}
	values := [...]float64{cpu, mem, disk}

	winner := -1
	for i, v := range values {
		if v > outlierThreshold && (winner < 0 || v > values[winner]) {
			winner = i
		}
	}
	if winner < 0 {
		return "", false
	}
	return fmt.Sprintf("%s (%s %.0f%%)", name, labels[winner], values[winner]), true
}
// seedFormatStatusBreakdown renders status counts as "status: N, status: N".
// Statuses named in preferredOrder come first, in that order; any remaining
// statuses follow alphabetically. Only statuses with a positive count are
// shown, and each status appears at most once even if preferredOrder repeats
// it. It returns "none" when there is nothing to show, including when counts
// is non-empty but holds no positive values.
func seedFormatStatusBreakdown(counts map[string]int, preferredOrder []string) string {
	parts := make([]string, 0, len(counts))
	seen := make(map[string]bool, len(counts))

	for _, status := range preferredOrder {
		if seen[status] {
			continue
		}
		if count := counts[status]; count > 0 {
			parts = append(parts, fmt.Sprintf("%s: %d", status, count))
			seen[status] = true
		}
	}

	var remaining []string
	for status, count := range counts {
		if count > 0 && !seen[status] {
			remaining = append(remaining, status)
		}
	}
	// Map iteration order is random; sort for deterministic output.
	sort.Strings(remaining)
	for _, status := range remaining {
		parts = append(parts, fmt.Sprintf("%s: %d", status, counts[status]))
	}

	if len(parts) == 0 {
		return "none"
	}
	return strings.Join(parts, ", ")
}
// seedTruncateOutlierList joins items with ", ". When max is positive and
// there are more than max items, only the first max are shown, followed by
// " (+N more)" for the remainder. An empty list yields "".
func seedTruncateOutlierList(items []string, max int) string {
	total := len(items)
	switch {
	case total == 0:
		return ""
	case max <= 0, total <= max:
		return strings.Join(items, ", ")
	}
	shown := strings.Join(items[:max], ", ")
	hidden := total - max
	return fmt.Sprintf("%s (+%d more)", shown, hidden)
}
// seedIsInScope reports whether resourceID should be included: always for an
// unscoped run (nil scopedSet), otherwise only when the set maps the ID to true.
func seedIsInScope(scopedSet map[string]bool, resourceID string) bool {
	return scopedSet == nil || scopedSet[resourceID]
}
// seedCephBytes extracts used/total bytes from a unified Ceph resource.
// Disk metrics on the resource take precedence when both Used and Total are
// set. Otherwise the values are summed over r.Ceph.Pools, counting each pool's
// total as StoredBytes + AvailableBytes. Both results are zero when neither
// source is present.
func seedCephBytes(r unifiedresources.Resource) (usedBytes, totalBytes int64) {
	if metrics := r.Metrics; metrics != nil && metrics.Disk != nil {
		if used, total := metrics.Disk.Used, metrics.Disk.Total; used != nil && total != nil {
			return *used, *total
		}
	}
	if r.Ceph == nil {
		return 0, 0
	}
	for _, pool := range r.Ceph.Pools {
		usedBytes += pool.StoredBytes
		totalBytes += pool.StoredBytes + pool.AvailableBytes
	}
	return usedBytes, totalBytes
}
// seedCountLabel picks the noun form matching count: singular when count is
// exactly 1, plural for every other value (including zero and negatives).
func seedCountLabel(count int, singular, plural string) string {
	label := plural
	if count == 1 {
		label = singular
	}
	return label
}
// seedFormatBytes formats bytes into a human-readable string (e.g. "1.5 GB")
// using 1024-based units. TB and GB are shown with one decimal place, MB and KB
// with none; anything below 1 KB (including negative values) is printed as a
// plain byte count.
func seedFormatBytes(b int64) string {
	units := [...]struct {
		size     int64
		decimals int
		suffix   string
	}{
		{size: 1 << 40, decimals: 1, suffix: "TB"},
		{size: 1 << 30, decimals: 1, suffix: "GB"},
		{size: 1 << 20, decimals: 0, suffix: "MB"},
		{size: 1 << 10, decimals: 0, suffix: "KB"},
	}
	// Units are ordered largest first, so the first match is the right one.
	for _, u := range units {
		if b >= u.size {
			return fmt.Sprintf("%.*f %s", u.decimals, float64(b)/float64(u.size), u.suffix)
		}
	}
	return fmt.Sprintf("%d B", b)
}
// seedFormatDuration formats a duration into a compact human-readable string
// using the largest unit that fits (e.g. "30s", "12m", "3h", "45d"). The value
// is truncated, not rounded.
func seedFormatDuration(d time.Duration) string {
	switch {
	case d < time.Minute:
		return fmt.Sprintf("%ds", int(d.Seconds()))
	case d < time.Hour:
		return fmt.Sprintf("%dm", int(d.Minutes()))
	case d < 24*time.Hour:
		return fmt.Sprintf("%dh", int(d.Hours()))
	default:
		wholeDays := int(d.Hours() / 24)
		return fmt.Sprintf("%dd", wholeDays)
	}
}
// seedFormatTimeAgo formats the distance from t to now as a short "ago" string:
// "just now" for anything under a minute (including timestamps after now),
// then whole minutes, hours, or days (e.g. "5m ago", "3h ago", "2d ago").
func seedFormatTimeAgo(now, t time.Time) string {
	elapsed := now.Sub(t)
	switch {
	case elapsed < time.Minute:
		return "just now"
	case elapsed < time.Hour:
		return fmt.Sprintf("%dm ago", int(elapsed.Minutes()))
	case elapsed < 24*time.Hour:
		return fmt.Sprintf("%dh ago", int(elapsed.Hours()))
	}
	// The generic format already yields "1d ago" for a single day.
	return fmt.Sprintf("%dd ago", int(elapsed.Hours()/24))
}
// reconcileStaleFindings auto-resolves active findings that were presented to the LLM
// in seed context but were neither re-reported nor explicitly resolved during the run.
// This handles the case where the LLM doesn't reliably use patrol_resolve_finding.
//
// Safety: only called after successful full patrols (not scoped), and only for findings
// that were in the seed context (the LLM had the opportunity to re-report them).
//
// It returns the number of findings resolved. Nothing is resolved when
// runHadErrors is true or seededFindingIDs is empty. For each resolved finding
// the unified finding resolver, if set, is notified.
func (p *PatrolService) reconcileStaleFindings(reportedIDs, resolvedIDs, seededFindingIDs []string, runHadErrors bool) int {
	if runHadErrors || len(seededFindingIDs) == 0 {
		return 0
	}

	// A finding is "handled" when the LLM either re-reported or resolved it.
	handled := make(map[string]bool, len(reportedIDs)+len(resolvedIDs))
	for _, id := range reportedIDs {
		handled[id] = true
	}
	for _, id := range resolvedIDs {
		handled[id] = true
	}

	count := 0
	for _, id := range seededFindingIDs {
		if handled[id] {
			continue
		}
		// Only resolve if still active
		if ok := p.findings.ResolveWithReason(id, "No longer detected by patrol"); !ok {
			continue
		}
		count++

		// Notify unified store
		p.mu.RLock()
		resolveUnified := p.unifiedFindingResolver
		p.mu.RUnlock()
		if resolveUnified != nil {
			resolveUnified(id)
		}
		log.Info().
			Str("finding_id", id).
			Msg("AI Patrol: Auto-resolved stale finding (not re-reported by patrol)")
	}
	return count
}