Pulse/internal/ai/patrol_findings.go
2026-03-29 13:38:06 +01:00

1948 lines
63 KiB
Go

// patrol_findings.go manages the finding lifecycle: creation, resolution, dismissal,
// remediation plan generation, investigation triggering and verification,
// and the adapter types that bridge patrol findings to the investigation subsystem.
package ai
import (
"context"
"encoding/json"
"fmt"
"sort"
"strings"
"sync"
"time"
"github.com/rcourtman/pulse-go-rewrite/internal/ai/baseline"
"github.com/rcourtman/pulse-go-rewrite/internal/ai/safety"
"github.com/rcourtman/pulse-go-rewrite/internal/ai/tools"
"github.com/rcourtman/pulse-go-rewrite/internal/relay"
"github.com/rcourtman/pulse-go-rewrite/pkg/aicontracts"
"github.com/rs/zerolog/log"
)
const (
	// patrolRuntimeFindingKey is the stable dedup key for the synthetic finding
	// Patrol raises when the AI provider/runtime itself errors.
	patrolRuntimeFindingKey = "ai-patrol-error"
	// patrolRuntimeResourceID is the pseudo resource those runtime findings attach
	// to; it does not correspond to real monitored infrastructure.
	patrolRuntimeResourceID = "ai-service"
)
// patrolFindingUsesSyntheticRuntimeResource reports whether f is one of Patrol's
// own synthetic runtime findings (keyed on the ai-service pseudo-resource).
// A nil finding never matches.
func patrolFindingUsesSyntheticRuntimeResource(f *Finding) bool {
	if f == nil {
		return false
	}
	return f.Key == patrolRuntimeFindingKey || f.ResourceID == patrolRuntimeResourceID
}
// patrolRuntimeFindingManualActionError builds the rejection error returned when a
// manual lifecycle action (e.g. "dismissed") targets a Patrol-owned runtime finding.
// The action verb is interpolated into the message.
func patrolRuntimeFindingManualActionError(action string) error {
	const format = "Patrol runtime findings cannot be %s manually; update AI settings and rerun Patrol"
	return fmt.Errorf(format, action)
}
// recordFinding stores a finding, syncs it to the unified store, and triggers follow-up actions.
//
// Returns true only when the finding is new to the store. For new findings it also
// generates a remediation plan (skipped for Patrol's synthetic runtime findings) and
// fires the push-notification callback for critical/warning severities. Regardless of
// newness, the stored copy is re-synced to the unified store and offered to the
// autonomous-investigation trigger.
func (p *PatrolService) recordFinding(f *Finding) bool {
	// Nil-safe: the service may be partially constructed or shutting down.
	if p == nil || p.findings == nil || f == nil {
		return false
	}
	isNew := p.findings.Add(f)
	// Re-read the canonical copy: the store may have merged f into an existing entry.
	stored := p.findings.Get(f.ID)
	if stored == nil {
		return false
	}
	if isNew {
		log.Info().
			Str("finding_id", stored.ID).
			Str("severity", string(stored.Severity)).
			Str("resource", stored.ResourceName).
			Str("title", stored.Title).
			Msg("AI Patrol: New finding")
		// Generate remediation plan for actionable findings
		// Skip internal error findings (not actionable by users)
		if !patrolFindingUsesSyntheticRuntimeResource(stored) {
			p.generateRemediationPlan(stored)
		}
		// Send push notification for new critical/warning findings
		if stored.Severity == FindingSeverityCritical || stored.Severity == FindingSeverityWarning {
			// Snapshot the callback under the read lock; invoke it outside the lock.
			p.mu.RLock()
			pushCb := p.pushNotifyCallback
			p.mu.RUnlock()
			if pushCb != nil {
				pushCb(relay.NewPatrolFindingNotification(
					stored.ID,
					string(stored.Severity),
					string(stored.Category),
					stored.Title,
				))
			}
		}
	}
	// Keep unified store in sync even when findings transition to snoozed/dismissed/resolved.
	// The unified UI can filter by status; losing updates here makes the patrol loop look broken.
	// NOTE(review): unifiedFindingCallback is read without p.mu here, unlike
	// pushNotifyCallback above — confirm it is only assigned during construction.
	if p.unifiedFindingCallback != nil {
		p.unifiedFindingCallback(stored)
	}
	// Trigger autonomous investigation if enabled and finding warrants it
	p.MaybeInvestigateFinding(stored)
	return isNew
}
// RejectManualActionForRuntimeFinding fails closed when a manual lifecycle action targets
// a Patrol-owned runtime finding such as the synthetic ai-service provider/runtime error.
// It returns a not-found error for unknown findings, the runtime-finding rejection error
// for synthetic findings, and nil when the action is permitted.
func (p *PatrolService) RejectManualActionForRuntimeFinding(findingID string, action string) error {
	if p == nil || p.findings == nil {
		return fmt.Errorf("finding not found: %s", findingID)
	}
	f := p.findings.Get(findingID)
	switch {
	case f == nil:
		return fmt.Errorf("finding not found: %s", findingID)
	case patrolFindingUsesSyntheticRuntimeResource(f):
		return patrolRuntimeFindingManualActionError(action)
	default:
		return nil
	}
}
// setBlockedReason records why the most recent patrol run was blocked, along with
// when it happened. An empty reason is a no-op so callers can call unconditionally.
func (p *PatrolService) setBlockedReason(reason string) {
	if reason == "" {
		return
	}
	now := time.Now()
	p.mu.Lock()
	defer p.mu.Unlock()
	p.lastBlockedReason = reason
	p.lastBlockedAt = now
}
// clearBlockedReason resets the blocked-reason bookkeeping to its zero state.
func (p *PatrolService) clearBlockedReason() {
	p.mu.Lock()
	defer p.mu.Unlock()
	p.lastBlockedReason = ""
	p.lastBlockedAt = time.Time{}
}
// generateRemediationPlan creates a remediation plan for a finding if appropriate.
// Only generates plans for critical/warning findings when a remediation engine is configured.
//
// Risk level is derived from severity (warning -> medium, critical -> high), then
// raised for reliability findings whose title mentions restart/reboot/offline.
// All auto-generated plans use CategoryGuided and therefore require user approval.
// Plan-creation failure is non-fatal: the finding itself is already recorded.
func (p *PatrolService) generateRemediationPlan(finding *Finding) {
	// Snapshot the engine pointer under the read lock; it may be swapped at runtime.
	p.mu.RLock()
	engine := p.remediationEngine
	p.mu.RUnlock()
	if engine == nil {
		return
	}
	// Only generate plans for actionable findings
	if finding.Severity != FindingSeverityCritical && finding.Severity != FindingSeverityWarning {
		return
	}
	// Generate remediation steps based on finding category and resource type
	steps := p.generateRemediationSteps(finding)
	if len(steps) == 0 {
		return
	}
	// Determine risk level based on finding severity and category
	riskLevel := aicontracts.RiskLow
	if finding.Severity == FindingSeverityWarning {
		riskLevel = aicontracts.RiskMedium
	}
	if finding.Severity == FindingSeverityCritical {
		riskLevel = aicontracts.RiskHigh
	}
	// Reliability issues involving restarts/reboots are higher risk
	if finding.Category == FindingCategoryReliability {
		title := strings.ToLower(finding.Title)
		if strings.Contains(title, "restart") || strings.Contains(title, "reboot") || strings.Contains(title, "offline") {
			// Escalate only; never lower an already-high risk.
			if riskLevel < aicontracts.RiskHigh {
				riskLevel = aicontracts.RiskHigh
			}
		} else if riskLevel < aicontracts.RiskMedium {
			riskLevel = aicontracts.RiskMedium
		}
	}
	// Create the remediation plan
	plan := &aicontracts.RemediationPlan{
		FindingID:   finding.ID,
		ResourceID:  finding.ResourceID,
		Title:       fmt.Sprintf("Fix: %s", finding.Title),
		Description: finding.Description,
		Category:    aicontracts.CategoryGuided, // All auto-generated plans require user approval
		RiskLevel:   riskLevel,
		Steps:       steps,
		Rationale:   finding.Recommendation,
	}
	// Add warnings based on risk level
	if riskLevel == aicontracts.RiskHigh {
		plan.Warnings = append(plan.Warnings, "High risk: This action may cause service disruption. Review carefully and consider scheduling during maintenance window.")
	} else if riskLevel == aicontracts.RiskMedium {
		plan.Warnings = append(plan.Warnings, "Review steps carefully before execution")
	}
	if err := engine.CreatePlan(plan); err != nil {
		// Debug level: failure to create a plan is expected in some configurations.
		log.Debug().
			Err(err).
			Str("finding_id", finding.ID).
			Str("resource", finding.ResourceName).
			Msg("AI Patrol: Failed to create remediation plan")
		return
	}
	log.Info().
		Str("plan_id", plan.ID).
		Str("finding_id", finding.ID).
		Str("resource", finding.ResourceName).
		Int("steps", len(steps)).
		Msg("AI Patrol: Remediation plan generated")
}
// generateRemediationPlanFromInvestigation persists a remediation plan artifact when
// an investigation proposes a concrete fix command. This is intentionally separate
// from the "approval" execution pipeline; it's a durable summary users can act on
// later (often via Pulse Assistant).
//
// The plan consists of a leading review step, one step per non-empty proposed
// command (blocked commands are stripped of their Command field and described
// instead), and a trailing verification step. If the engine rejects the plan, a
// command-free informational fallback plan is attempted once.
func (p *PatrolService) generateRemediationPlanFromInvestigation(findingID string) {
	// Snapshot dependencies under the read lock; both may be reconfigured at runtime.
	p.mu.RLock()
	engine := p.remediationEngine
	orchestrator := p.investigationOrchestrator
	p.mu.RUnlock()
	if engine == nil || orchestrator == nil || p.findings == nil || findingID == "" {
		return
	}
	finding := p.findings.Get(findingID)
	if finding == nil {
		return
	}
	// Nothing to persist unless the investigation produced concrete commands.
	inv := orchestrator.GetInvestigationByFinding(findingID)
	if inv == nil || inv.ProposedFix == nil || len(inv.ProposedFix.Commands) == 0 {
		return
	}
	fix := inv.ProposedFix
	targetHost := strings.TrimSpace(fix.TargetHost)
	if targetHost == "" {
		targetHost = "local"
	}
	// Map investigation risk strings into remediation risk levels.
	riskLevel := aicontracts.RiskMedium // default for unrecognized strings
	switch strings.ToLower(strings.TrimSpace(fix.RiskLevel)) {
	case "low":
		riskLevel = aicontracts.RiskLow
	case "medium":
		riskLevel = aicontracts.RiskMedium
	case "high":
		riskLevel = aicontracts.RiskHigh
	case "critical":
		// Remediation has no "critical" tier; clamp to high.
		riskLevel = aicontracts.RiskHigh
	}
	steps := make([]aicontracts.RemediationStep, 0, 2+len(fix.Commands))
	steps = append(steps, aicontracts.RemediationStep{
		Order:       1,
		Description: "Review the finding context and confirm the proposed fix is appropriate",
	})
	blockedCount := 0
	order := 2
	for _, raw := range fix.Commands {
		cmd := strings.TrimSpace(raw)
		if cmd == "" {
			continue
		}
		stepCommand := cmd
		stepDesc := fmt.Sprintf("Run the proposed fix on %s", targetHost)
		if safety.IsBlockedCommand(cmd) {
			// Don't store blocked commands in the remediation engine; keep the plan as an
			// artifact for users to review and apply manually (typically via Assistant).
			stepCommand = ""
			stepDesc = fmt.Sprintf("Blocked command proposed by investigation (review and apply manually): %s", cmd)
			blockedCount++
		}
		steps = append(steps, aicontracts.RemediationStep{
			Order:       order,
			Description: stepDesc,
			Command:     stepCommand,
			Target:      targetHost,
		})
		order++
	}
	steps = append(steps, aicontracts.RemediationStep{
		Order:       order,
		Description: "Verify the issue is resolved (re-check metrics/logs, confirm service health)",
	})
	// Plan body: prefer the fix rationale, then the investigation summary, then the
	// finding description.
	description := strings.TrimSpace(fix.Rationale)
	if description == "" {
		description = strings.TrimSpace(inv.Summary)
	}
	if description == "" {
		description = finding.Description
	}
	plan := &aicontracts.RemediationPlan{
		FindingID:   finding.ID,
		ResourceID:  finding.ResourceID,
		Title:       fmt.Sprintf("Investigation Fix: %s", finding.Title),
		Description: description,
		Category:    aicontracts.CategoryGuided,
		RiskLevel:   riskLevel,
		Steps:       steps,
		Rationale:   fix.Description,
	}
	// Patrol findings are often reviewed hours/days later; keep investigation-derived
	// plans around longer than the default ephemeral remediation TTL.
	expires := time.Now().Add(7 * 24 * time.Hour)
	plan.ExpiresAt = &expires
	if blockedCount > 0 {
		plan.Warnings = append(plan.Warnings, "Investigation suggested one or more commands that are blocked by safety policy. Review carefully and apply manually (prefer Pulse Assistant).")
	}
	if err := engine.CreatePlan(plan); err != nil {
		// As a fallback, keep the plan as purely informational so it can still be
		// surfaced to the user without enabling remediation engine execution.
		for i := range plan.Steps {
			if plan.Steps[i].Command == "" {
				continue
			}
			// Fold the command text into the description before clearing it.
			plan.Steps[i].Description = fmt.Sprintf("%s: %s", plan.Steps[i].Description, plan.Steps[i].Command)
			plan.Steps[i].Command = ""
		}
		plan.RiskLevel = aicontracts.RiskMedium
		plan.Warnings = append(plan.Warnings, fmt.Sprintf("Failed to store command steps for automated remediation: %v", err))
		if createErr := engine.CreatePlan(plan); createErr != nil {
			log.Warn().Err(createErr).Str("findingID", finding.ID).Msg("failed to create fallback remediation plan")
		}
	}
}
// generateRemediationSteps dispatches to a category-specific step generator and
// falls back to generic investigation steps for unrecognized categories.
func (p *PatrolService) generateRemediationSteps(finding *Finding) []aicontracts.RemediationStep {
	switch finding.Category {
	case FindingCategoryPerformance:
		return p.generatePerformanceSteps(finding)
	case FindingCategoryCapacity:
		return p.generateCapacitySteps(finding)
	case FindingCategoryReliability:
		return p.generateAvailabilitySteps(finding)
	case FindingCategoryBackup:
		return p.generateBackupSteps(finding)
	case FindingCategorySecurity:
		return p.generateSecuritySteps(finding)
	case FindingCategoryGeneral:
		return p.generateConfigurationSteps(finding)
	}
	// Unknown category: generic investigation guidance.
	return []aicontracts.RemediationStep{
		{Order: 1, Description: "Investigate the issue by reviewing current resource state"},
		{Order: 2, Description: "Review recent changes that may have caused this issue"},
		{Order: 3, Description: "Take appropriate corrective action based on findings"},
	}
}
// generatePerformanceSteps creates steps for performance issues, keyed off
// keywords in the finding title (cpu, memory/ram, io/disk) with a generic fallback.
func (p *PatrolService) generatePerformanceSteps(finding *Finding) []aicontracts.RemediationStep {
	title := strings.ToLower(finding.Title)
	switch {
	case strings.Contains(title, "cpu"):
		return []aicontracts.RemediationStep{
			{Order: 1, Description: "Identify processes consuming excessive CPU", Target: finding.ResourceID},
			{Order: 2, Description: "Check if resource needs more CPU cores allocated"},
			{Order: 3, Description: "Consider migrating to a less loaded host if VM/container"},
			{Order: 4, Description: "Optimize or restart resource-hungry applications"},
		}
	case strings.Contains(title, "memory"), strings.Contains(title, "ram"):
		return []aicontracts.RemediationStep{
			{Order: 1, Description: "Identify processes consuming excessive memory", Target: finding.ResourceID},
			{Order: 2, Description: "Check for memory leaks in running applications"},
			{Order: 3, Description: "Consider increasing allocated memory"},
			{Order: 4, Description: "Restart affected services to reclaim memory"},
		}
	case strings.Contains(title, "io"), strings.Contains(title, "disk"):
		return []aicontracts.RemediationStep{
			{Order: 1, Description: "Identify processes causing high disk I/O", Target: finding.ResourceID},
			{Order: 2, Description: "Check for runaway log files or heavy writes"},
			{Order: 3, Description: "Consider migrating to faster storage"},
		}
	default:
		// No recognized keyword: generic performance guidance.
		return []aicontracts.RemediationStep{
			{Order: 1, Description: "Review current resource utilization metrics", Target: finding.ResourceID},
			{Order: 2, Description: "Identify performance bottlenecks"},
			{Order: 3, Description: "Optimize resource allocation or application configuration"},
		}
	}
}
// generateCapacitySteps creates steps for capacity issues, keyed off keywords in
// the finding title (disk/storage, memory) with a generic fallback.
func (p *PatrolService) generateCapacitySteps(finding *Finding) []aicontracts.RemediationStep {
	title := strings.ToLower(finding.Title)
	switch {
	case strings.Contains(title, "disk"), strings.Contains(title, "storage"):
		return []aicontracts.RemediationStep{
			{Order: 1, Description: "Identify largest files and directories consuming space", Target: finding.ResourceID},
			{Order: 2, Description: "Clean up temporary files, logs, and caches"},
			{Order: 3, Description: "Remove unused packages and old kernels"},
			{Order: 4, Description: "Consider expanding disk or adding additional storage"},
		}
	case strings.Contains(title, "memory"):
		return []aicontracts.RemediationStep{
			{Order: 1, Description: "Review memory allocation across workloads", Target: finding.ResourceID},
			{Order: 2, Description: "Reduce memory allocation on over-provisioned VMs"},
			{Order: 3, Description: "Add more physical memory to the host"},
		}
	default:
		// No recognized keyword: generic capacity guidance.
		return []aicontracts.RemediationStep{
			{Order: 1, Description: "Review current capacity utilization", Target: finding.ResourceID},
			{Order: 2, Description: "Identify growth trends and plan for expansion"},
			{Order: 3, Description: "Clean up unused resources to free capacity"},
		}
	}
}
// generateAvailabilitySteps creates steps for availability issues, keyed off
// keywords in the finding title (offline/down, restart/reboot) with a fallback.
func (p *PatrolService) generateAvailabilitySteps(finding *Finding) []aicontracts.RemediationStep {
	title := strings.ToLower(finding.Title)
	switch {
	case strings.Contains(title, "offline"), strings.Contains(title, "down"):
		return []aicontracts.RemediationStep{
			{Order: 1, Description: "Verify network connectivity to the resource", Target: finding.ResourceID},
			{Order: 2, Description: "Check host status if this is a VM/container"},
			{Order: 3, Description: "Review system logs for crash or shutdown reasons"},
			{Order: 4, Description: "Attempt to start or restart the resource"},
		}
	case strings.Contains(title, "restart"), strings.Contains(title, "reboot"):
		return []aicontracts.RemediationStep{
			{Order: 1, Description: "Review system logs for cause of restarts", Target: finding.ResourceID},
			{Order: 2, Description: "Check for OOM kills or kernel panics"},
			{Order: 3, Description: "Investigate application crashes"},
			{Order: 4, Description: "Consider enabling watchdog or health checks"},
		}
	default:
		// No recognized keyword: generic availability guidance.
		return []aicontracts.RemediationStep{
			{Order: 1, Description: "Verify resource health and connectivity", Target: finding.ResourceID},
			{Order: 2, Description: "Review recent events and logs"},
			{Order: 3, Description: "Take corrective action to restore availability"},
		}
	}
}
// generateBackupSteps creates steps for backup-related issues, keyed off keywords
// in the finding title (missing, failed/error, old/stale/outdated) with a fallback.
func (p *PatrolService) generateBackupSteps(finding *Finding) []aicontracts.RemediationStep {
	title := strings.ToLower(finding.Title)
	switch {
	case strings.Contains(title, "missing"), strings.Contains(title, "no backup"):
		return []aicontracts.RemediationStep{
			{Order: 1, Description: "Verify backup job configuration exists", Target: finding.ResourceID},
			{Order: 2, Description: "Check backup storage availability and capacity"},
			{Order: 3, Description: "Create or enable backup schedule"},
			{Order: 4, Description: "Run initial backup job"},
		}
	case strings.Contains(title, "failed"), strings.Contains(title, "error"):
		return []aicontracts.RemediationStep{
			{Order: 1, Description: "Review backup job logs for error details", Target: finding.ResourceID},
			{Order: 2, Description: "Check backup storage connectivity and space"},
			{Order: 3, Description: "Verify backup credentials and permissions"},
			{Order: 4, Description: "Retry backup job after fixing issues"},
		}
	case strings.Contains(title, "old"), strings.Contains(title, "stale"), strings.Contains(title, "outdated"):
		return []aicontracts.RemediationStep{
			{Order: 1, Description: "Check why scheduled backups are not running", Target: finding.ResourceID},
			{Order: 2, Description: "Review backup retention policy"},
			{Order: 3, Description: "Trigger a new backup immediately"},
		}
	default:
		// No recognized keyword: generic backup guidance.
		return []aicontracts.RemediationStep{
			{Order: 1, Description: "Review backup configuration and schedule", Target: finding.ResourceID},
			{Order: 2, Description: "Verify backup storage health"},
			{Order: 3, Description: "Ensure backup jobs are running successfully"},
		}
	}
}
// generateConfigurationSteps creates the fixed review/compare/apply/verify step
// sequence used for configuration issues.
func (p *PatrolService) generateConfigurationSteps(finding *Finding) []aicontracts.RemediationStep {
	steps := make([]aicontracts.RemediationStep, 0, 4)
	steps = append(steps,
		aicontracts.RemediationStep{Order: 1, Description: "Review current configuration settings", Target: finding.ResourceID},
		aicontracts.RemediationStep{Order: 2, Description: "Compare against recommended best practices"},
		aicontracts.RemediationStep{Order: 3, Description: "Apply configuration changes as needed"},
		aicontracts.RemediationStep{Order: 4, Description: "Verify changes don't impact dependent services"},
	)
	return steps
}
// generateSecuritySteps creates the fixed assess/review/patch/verify step
// sequence used for security issues.
func (p *PatrolService) generateSecuritySteps(finding *Finding) []aicontracts.RemediationStep {
	steps := make([]aicontracts.RemediationStep, 0, 4)
	steps = append(steps,
		aicontracts.RemediationStep{Order: 1, Description: "Assess the security impact and urgency", Target: finding.ResourceID},
		aicontracts.RemediationStep{Order: 2, Description: "Review access logs for suspicious activity"},
		aicontracts.RemediationStep{Order: 3, Description: "Apply security patches or configuration fixes"},
		aicontracts.RemediationStep{Order: 4, Description: "Verify remediation and update security policies"},
	)
	return steps
}
// GetFindingsForResource returns active findings for a specific resource, with
// resource types normalized for display.
//
// It is nil-safe (returns nil for a nil service or findings store), matching the
// defensive guards used by recordFinding and RejectManualActionForRuntimeFinding.
func (p *PatrolService) GetFindingsForResource(resourceID string) []*Finding {
	if p == nil || p.findings == nil {
		return nil
	}
	findings := p.findings.GetByResource(resourceID)
	normalizeFindingResourceTypes(findings)
	return findings
}
// GetFindingsSummary returns a summary of all findings.
//
// It is nil-safe (returns a zero summary for a nil service or findings store),
// matching the defensive guards used elsewhere on PatrolService.
func (p *PatrolService) GetFindingsSummary() FindingsSummary {
	if p == nil || p.findings == nil {
		return FindingsSummary{}
	}
	return p.findings.GetSummary()
}
// ResolveFinding marks a finding as resolved with a resolution note
// This is called when the AI successfully fixes an issue
//
// The note is stored on the finding's UserNote field, the store is told to resolve
// it (autoResolved=false since a user/AI explicitly initiated it), and the
// unified-store resolver callback is invoked so both views stay in sync.
func (p *PatrolService) ResolveFinding(findingID string, resolutionNote string) error {
	if findingID == "" {
		return fmt.Errorf("finding ID is required")
	}
	// Get the finding first to update its resolution note
	finding := p.findings.Get(findingID)
	if finding == nil {
		return fmt.Errorf("finding not found: %s", findingID)
	}
	// Update the user note with the resolution
	// NOTE(review): this mutates the value returned by Get directly — confirm the
	// store returns either a copy or a pointer that is safe to mutate concurrently.
	finding.UserNote = resolutionNote
	// Mark as resolved (not auto-resolved since user/AI initiated it)
	if !p.findings.Resolve(findingID, false) {
		return fmt.Errorf("failed to resolve finding: %s", findingID)
	}
	// Snapshot the unified resolver callback under the read lock before calling it.
	p.mu.RLock()
	resolveUnified := p.unifiedFindingResolver
	p.mu.RUnlock()
	if resolveUnified != nil {
		resolveUnified(findingID)
	}
	log.Info().
		Str("finding_id", findingID).
		Str("resolution_note", resolutionNote).
		Msg("AI resolved finding")
	return nil
}
// DismissFinding dismisses a finding with a reason and note
// This is called when the AI determines the finding is not actually an issue
// For reasons "expected_behavior" or "not_an_issue", a suppression rule is automatically created
func (p *PatrolService) DismissFinding(findingID string, reason string, note string) error {
	if findingID == "" {
		return fmt.Errorf("finding ID is required")
	}
	// Validate reason
	switch reason {
	case "not_an_issue", "expected_behavior", "will_fix_later":
		// recognized dismissal reasons
	default:
		return fmt.Errorf("invalid reason: %s", reason)
	}
	// Check that the finding exists
	finding := p.findings.Get(findingID)
	if finding == nil {
		return fmt.Errorf("finding not found: %s", findingID)
	}
	// Patrol-owned runtime findings cannot be dismissed manually.
	if patrolFindingUsesSyntheticRuntimeResource(finding) {
		return patrolRuntimeFindingManualActionError("dismissed")
	}
	// Dismiss the finding:
	// - "not_an_issue" creates permanent suppression (true false positive)
	// - "expected_behavior" and "will_fix_later" just acknowledge (stays visible but marked)
	if !p.findings.Dismiss(findingID, reason, note) {
		return fmt.Errorf("failed to dismiss finding: %s", findingID)
	}
	log.Info().
		Str("finding_id", findingID).
		Str("reason", reason).
		Str("note", note).
		Bool("permanently_suppressed", reason == "not_an_issue").
		Msg("AI dismissed finding")
	return nil
}
// GetRunHistory returns the history of patrol runs.
// A positive limit caps the number of records; any other value returns everything.
func (p *PatrolService) GetRunHistory(limit int) []PatrolRunRecord {
	if limit > 0 {
		return p.runHistoryStore.GetRecent(limit)
	}
	return p.runHistoryStore.GetAll()
}
// GetRunByID returns a single patrol run from history.
// Blank (or whitespace-only) IDs short-circuit to a not-found result.
func (p *PatrolService) GetRunByID(id string) (PatrolRunRecord, bool) {
	if strings.TrimSpace(id) != "" {
		return p.runHistoryStore.GetByID(id)
	}
	return PatrolRunRecord{}, false
}
// GetAllFindings returns all active findings sorted by severity
// Only returns critical and warning findings - watch/info are filtered out as noise
func (p *PatrolService) GetAllFindings() []*Finding {
	findings := p.findings.GetActive(FindingSeverityWarning)
	normalizeFindingResourceTypes(findings)
	// Rank severities: critical sorts first; ties break on newest detection time.
	severityRank := map[FindingSeverity]int{
		FindingSeverityCritical: 0,
		FindingSeverityWarning:  1,
		FindingSeverityWatch:    2,
		FindingSeverityInfo:     3,
	}
	sort.Slice(findings, func(i, j int) bool {
		a, b := findings[i], findings[j]
		if severityRank[a.Severity] == severityRank[b.Severity] {
			return a.DetectedAt.After(b.DetectedAt)
		}
		return severityRank[a.Severity] < severityRank[b.Severity]
	})
	return findings
}
// normalizeFindingResourceTypes rewrites each finding's ResourceType in place:
// a recognized type is canonicalized; anything blank or unrecognized is inferred
// from the resource ID and name. Nil entries are skipped.
func normalizeFindingResourceTypes(findings []*Finding) {
	for _, f := range findings {
		if f == nil {
			continue
		}
		if strings.TrimSpace(f.ResourceType) != "" {
			if canonical := canonicalFindingResourceType(f.ResourceType); canonical != "" {
				f.ResourceType = canonical
				continue
			}
		}
		f.ResourceType = inferFindingResourceType(f.ResourceID, f.ResourceName)
	}
}
// GetFindingsHistory returns all findings including resolved ones for history display
// Optionally filter by startTime
func (p *PatrolService) GetFindingsHistory(startTime *time.Time) []*Finding {
	history := p.findings.GetAll(startTime)
	normalizeFindingResourceTypes(history)
	// Newest first: j detected before i means i sorts earlier.
	sort.Slice(history, func(i, j int) bool {
		return history[j].DetectedAt.Before(history[i].DetectedAt)
	})
	return history
}
// ForcePatrol triggers an immediate patrol run.
// Uses context.Background() since this runs async after the HTTP response.
func (p *PatrolService) ForcePatrol(ctx context.Context) {
	var runCtx context.Context
	if ctx == nil {
		runCtx = context.Background()
	} else {
		// Detach from the caller's cancellation while keeping its values.
		runCtx = context.WithoutCancel(ctx)
	}
	go p.runPatrolWithTrigger(runCtx, TriggerReasonManual, nil)
}
// chatServiceExecutorAccessor is satisfied by *chat.Service, allowing patrol to
// access the executor without adding GetExecutor to the ChatServiceProvider interface.
// Callers type-assert their ChatServiceProvider against this interface.
type chatServiceExecutorAccessor interface {
	// GetExecutor returns the tool executor backing the chat service.
	GetExecutor() *tools.PulseToolExecutor
}
// patrolFindingCreatorAdapter implements tools.PatrolFindingCreator by wrapping
// the PatrolService's existing FindingsStore and recordFinding method.
// One adapter instance tracks the findings created/resolved during a single
// patrol run so the run's stats can be reported afterwards.
type patrolFindingCreatorAdapter struct {
	patrol *PatrolService     // backing service; findings are recorded/resolved through it
	snap   patrolRuntimeState // snapshot of runtime state (scope and metrics) for this run

	findingsMu      sync.Mutex // guards the run-tracking fields below
	findings        []*Finding // findings created during this patrol run
	resolvedIDs     []string   // finding IDs resolved during this patrol run
	rejectedCount   int        // findings rejected by threshold validation
	checkedFindings bool       // set once GetActiveFindings has been called this run
}
// newPatrolFindingCreatorAdapterState builds an adapter bound to the given patrol
// service and runtime snapshot; the run-tracking fields start at their zero values.
func newPatrolFindingCreatorAdapterState(p *PatrolService, snap patrolRuntimeState) *patrolFindingCreatorAdapter {
	adapter := new(patrolFindingCreatorAdapter)
	adapter.patrol = p
	adapter.snap = snap
	return adapter
}
// CreateFinding validates and records a finding proposed during a patrol run.
// It maps the free-form severity/category strings to typed enums, derives a stable
// dedup key and ID, rejects findings whose metrics fall below actionable thresholds,
// and records accepted findings via PatrolService.recordFinding.
//
// Returns the finding ID, whether the finding is new (vs. deduplicated), and a
// non-nil error when the finding was rejected by threshold validation.
func (a *patrolFindingCreatorAdapter) CreateFinding(input tools.PatrolFindingInput) (string, bool, error) {
	// Map severity
	var sev FindingSeverity
	switch strings.ToLower(input.Severity) {
	case "critical":
		sev = FindingSeverityCritical
	case "warning":
		sev = FindingSeverityWarning
	case "watch":
		sev = FindingSeverityWatch
	default:
		// Unknown severity strings degrade to info rather than erroring.
		sev = FindingSeverityInfo
	}
	// Map category
	var cat FindingCategory
	switch strings.ToLower(input.Category) {
	case "performance":
		cat = FindingCategoryPerformance
	case "capacity":
		cat = FindingCategoryCapacity
	case "reliability":
		cat = FindingCategoryReliability
	case "backup":
		cat = FindingCategoryBackup
	case "security":
		cat = FindingCategorySecurity
	default:
		cat = FindingCategoryGeneral
	}
	// Normalize key for stable dedup
	normalizedKey := normalizeFindingKey(input.Key)
	if normalizedKey == "" {
		// Fall back to the title, then a fixed key, so dedup never sees an empty key.
		normalizedKey = normalizeFindingKey(input.Title)
		if normalizedKey == "" {
			normalizedKey = "llm-finding"
		}
	}
	// Generate stable ID
	id := generateFindingID(input.ResourceID, string(cat), normalizedKey)
	finding := &Finding{
		ID:             id,
		Key:            normalizedKey,
		Severity:       sev,
		Category:       cat,
		ResourceID:     input.ResourceID,
		ResourceName:   input.ResourceName,
		ResourceType:   input.ResourceType,
		Title:          input.Title,
		Description:    input.Description,
		Recommendation: input.Recommendation,
		Evidence:       input.Evidence,
		Source:         "ai-analysis",
	}
	// Inline validation: check if finding is actionable against current metrics
	if !a.isActionable(finding) {
		// Determine which metric caused rejection for logging and metrics
		rejectedMetric := "unknown"
		keyLower := strings.ToLower(finding.Key)
		titleLower := strings.ToLower(finding.Title)
		if strings.Contains(keyLower, "cpu") || strings.Contains(titleLower, "cpu") {
			rejectedMetric = "cpu"
		} else if strings.Contains(keyLower, "memory") || strings.Contains(keyLower, "mem") || strings.Contains(titleLower, "memory") {
			rejectedMetric = "memory"
		} else if strings.Contains(keyLower, "disk") || strings.Contains(keyLower, "storage") || strings.Contains(titleLower, "disk") {
			rejectedMetric = "disk"
		}
		a.findingsMu.Lock()
		a.rejectedCount++
		a.findingsMu.Unlock()
		GetPatrolMetrics().RecordFindingRejected(input.ResourceType, rejectedMetric)
		log.Info().
			Str("finding_id", id).
			Str("title", input.Title).
			Str("resource", input.ResourceName).
			Str("resource_type", input.ResourceType).
			Str("rejected_metric", rejectedMetric).
			Msg("AI Patrol: Finding rejected by threshold validation")
		// Broadcast rejection to stream consumers
		a.patrol.broadcast(PatrolStreamEvent{
			Type:    "finding_rejected",
			Content: fmt.Sprintf("Finding rejected: %s on %s (metric %s below threshold)", input.Title, input.ResourceName, rejectedMetric),
		})
		return id, false, fmt.Errorf("finding rejected: metrics do not support this finding (below actionable thresholds)")
	}
	// Record finding via PatrolService
	isNew := a.patrol.recordFinding(finding)
	// Track for run stats
	a.findingsMu.Lock()
	a.findings = append(a.findings, finding)
	a.findingsMu.Unlock()
	return id, isNew, nil
}
// actionabilityThreshold returns the threshold below which a metric finding is rejected as noise.
// It reads user-configured PatrolThresholds (Watch level = lowest alarm tier) and falls back
// to hardcoded defaults (50/60/70) if the threshold is zero or unset.
// The resourceType parameter selects between node-level and guest-level thresholds where both exist.
func (a *patrolFindingCreatorAdapter) actionabilityThreshold(metric, resourceType string) float64 {
	a.patrol.mu.RLock()
	t := a.patrol.thresholds
	a.patrol.mu.RUnlock()

	// pick prefers a positive configured value, else the hardcoded fallback.
	pick := func(configured, fallback float64) float64 {
		if configured > 0 {
			return configured
		}
		return fallback
	}
	switch metric {
	case "cpu":
		// Only node-level CPU threshold exists; used for all resource types.
		return pick(t.NodeCPUWatch, 50.0)
	case "memory":
		if resourceType == "node" {
			return pick(t.NodeMemWatch, 60.0)
		}
		return pick(t.GuestMemWatch, 60.0)
	case "disk":
		return pick(t.GuestDiskWatch, 70.0)
	case "storage":
		return pick(t.StorageWatch, 70.0)
	}
	// Unknown metric: conservative default.
	return 50.0
}
// isBaselineAnomaly checks if the given value is anomalously high compared to the learned
// baseline for this resource/metric. Returns true only for upward anomalies (rising above
// baseline), since dropping usage is not concerning. Returns false if baseline data is
// unavailable or insufficient.
func (a *patrolFindingCreatorAdapter) isBaselineAnomaly(resourceID, metric string, value float64) bool {
	a.patrol.mu.RLock()
	store := a.patrol.baselineStore
	a.patrol.mu.RUnlock()
	if store == nil {
		return false
	}
	severity, _, bl := store.CheckAnomaly(resourceID, metric, value)
	switch {
	case bl == nil, severity == baseline.AnomalyNone:
		return false
	default:
		// Only flag upward anomalies (value above the baseline mean).
		return value > bl.Mean
	}
}
// isActionable validates a finding against current metrics (inline version of the old
// validateAIFindings + isActionableFinding logic).
// Uses user-configured thresholds from PatrolThresholds and baseline anomaly detection
// as a second-chance check for findings below the threshold but statistically anomalous.
//
// Critical, backup, and reliability findings bypass metric checks entirely; findings
// for resources absent from a non-empty inventory are rejected outright.
func (a *patrolFindingCreatorAdapter) isActionable(f *Finding) bool {
	resourceMetrics, hasInventory := a.actionabilityResourceMetrics()
	// Reject findings for resources that no longer exist in the current infrastructure.
	// Only enforce when we have state data (avoid rejecting during empty/error states).
	metrics, hasMetrics := resourceMetrics[f.ResourceID]
	if !hasMetrics {
		// Fall back to the resource name in case the metrics map is keyed by name.
		metrics, hasMetrics = resourceMetrics[f.ResourceName]
	}
	if !hasMetrics && hasInventory {
		// Resource not found — it may have been deleted. Reject the finding.
		return false
	}
	// Allow critical findings without metric threshold checks
	if f.Severity == FindingSeverityCritical {
		return true
	}
	// Allow backup and reliability findings without metric threshold checks
	if f.Category == FindingCategoryBackup || f.Category == FindingCategoryReliability {
		return true
	}
	if !hasMetrics {
		return true // empty state — benefit of doubt
	}
	key := strings.ToLower(f.Key)
	titleLower := strings.ToLower(f.Title)
	// CPU check
	if strings.Contains(key, "cpu") || strings.Contains(titleLower, "cpu") {
		if cpu, ok := metrics["cpu"]; ok && cpu < a.actionabilityThreshold("cpu", f.ResourceType) {
			// Below threshold — check if anomalous (statistically unusual spike)
			if a.isBaselineAnomaly(f.ResourceID, "cpu", cpu) {
				return true
			}
			return false
		}
	}
	// Memory check
	if strings.Contains(key, "memory") || strings.Contains(key, "mem") || strings.Contains(titleLower, "memory") {
		if mem, ok := metrics["memory"]; ok && mem < a.actionabilityThreshold("memory", f.ResourceType) {
			if a.isBaselineAnomaly(f.ResourceID, "memory", mem) {
				return true
			}
			return false
		}
	}
	// Disk/storage check
	if strings.Contains(key, "disk") || strings.Contains(key, "storage") || strings.Contains(titleLower, "disk") {
		if disk, ok := metrics["disk"]; ok && disk < a.actionabilityThreshold("disk", f.ResourceType) {
			if a.isBaselineAnomaly(f.ResourceID, "disk", disk) {
				return true
			}
			return false
		}
		// Some resources expose a "usage" metric instead of "disk"; check it
		// against the storage threshold.
		if usage, ok := metrics["usage"]; ok && usage < a.actionabilityThreshold("storage", f.ResourceType) {
			if a.isBaselineAnomaly(f.ResourceID, "storage", usage) {
				return true
			}
			return false
		}
	}
	return true
}
// actionabilityResourceMetrics returns the per-resource metric map derived from
// this run's runtime snapshot, plus a flag indicating whether any inventory data
// was available at all.
func (a *patrolFindingCreatorAdapter) actionabilityResourceMetrics() (map[string]map[string]float64, bool) {
	return patrolActionabilityResourceMetrics(a.snap)
}
// ResolveFinding resolves a finding on behalf of the patrol run. When the run is
// scoped to a known set of resources, findings outside that scope are refused.
// Successful resolutions are forwarded to the unified store and tracked for run stats.
func (a *patrolFindingCreatorAdapter) ResolveFinding(findingID, reason string) error {
	if scoped := patrolRuntimeKnownResources(a.snap); len(scoped) > 0 {
		f := a.patrol.findings.Get(findingID)
		if f == nil {
			return fmt.Errorf("finding %s not found or already resolved", findingID)
		}
		inScope := scoped[f.ResourceID] || scoped[f.ResourceName]
		if !inScope {
			return fmt.Errorf("finding %s is outside the current patrol scope", findingID)
		}
	}
	if !a.patrol.findings.Resolve(findingID, true) {
		return fmt.Errorf("finding %s not found or already resolved", findingID)
	}
	// Notify the unified store (snapshot the callback under the read lock).
	a.patrol.mu.RLock()
	notifyUnified := a.patrol.unifiedFindingResolver
	a.patrol.mu.RUnlock()
	if notifyUnified != nil {
		notifyUnified(findingID)
	}
	a.findingsMu.Lock()
	a.resolvedIDs = append(a.resolvedIDs, findingID)
	a.findingsMu.Unlock()
	log.Info().
		Str("finding_id", findingID).
		Str("reason", reason).
		Msg("AI Patrol: Finding resolved via patrol tool")
	return nil
}
// GetActiveFindings returns active patrol findings at or above minSeverity,
// optionally filtered to a single resource (matched by ID or display name) and
// restricted to the current run's resource scope when one is active. Calling
// it marks the adapter as having checked findings for this run.
func (a *patrolFindingCreatorAdapter) GetActiveFindings(resourceID, minSeverity string) []tools.PatrolFindingInfo {
	a.findingsMu.Lock()
	a.checkedFindings = true
	a.findingsMu.Unlock()

	// Unknown severity strings fall back to "info" (include everything).
	minSev := FindingSeverityInfo
	switch strings.ToLower(minSeverity) {
	case "critical":
		minSev = FindingSeverityCritical
	case "warning":
		minSev = FindingSeverityWarning
	case "watch":
		minSev = FindingSeverityWatch
	}

	scoped := patrolRuntimeKnownResources(a.snap)
	var infos []tools.PatrolFindingInfo
	for _, f := range a.patrol.findings.GetActive(minSev) {
		// Optional single-resource filter (by ID or display name).
		if resourceID != "" && resourceID != f.ResourceID && resourceID != f.ResourceName {
			continue
		}
		// Respect the run's resource scope when one is active.
		if len(scoped) > 0 && !scoped[f.ResourceID] && !scoped[f.ResourceName] {
			continue
		}
		infos = append(infos, tools.PatrolFindingInfo{
			ID:           f.ID,
			Key:          f.Key,
			Severity:     string(f.Severity),
			Category:     string(f.Category),
			ResourceID:   f.ResourceID,
			ResourceName: f.ResourceName,
			ResourceType: f.ResourceType,
			Title:        f.Title,
			Description:  f.Description,
			DetectedAt:   f.DetectedAt.Format("2006-01-02 15:04"),
		})
	}
	return infos
}
// HasCheckedFindings reports whether active findings were queried at least
// once during this patrol run (the flag is set by GetActiveFindings).
func (a *patrolFindingCreatorAdapter) HasCheckedFindings() bool {
	a.findingsMu.Lock()
	defer a.findingsMu.Unlock()
	return a.checkedFindings
}
// getCollectedFindings returns a snapshot of every finding created during this
// patrol run. The returned slice is a copy so callers cannot mutate the
// adapter's internal state.
func (a *patrolFindingCreatorAdapter) getCollectedFindings() []*Finding {
	a.findingsMu.Lock()
	defer a.findingsMu.Unlock()
	snapshot := make([]*Finding, len(a.findings))
	copy(snapshot, a.findings)
	return snapshot
}
// getResolvedCount returns the number of findings resolved during this patrol
// run (i.e. the length of the resolved-ID list accumulated by ResolveFinding).
func (a *patrolFindingCreatorAdapter) getResolvedCount() int {
	a.findingsMu.Lock()
	defer a.findingsMu.Unlock()
	return len(a.resolvedIDs)
}
// getReportedFindingIDs returns the IDs of all findings created/re-reported
// during this patrol run, in creation order.
func (a *patrolFindingCreatorAdapter) getReportedFindingIDs() []string {
	a.findingsMu.Lock()
	defer a.findingsMu.Unlock()
	ids := make([]string, 0, len(a.findings))
	for _, f := range a.findings {
		ids = append(ids, f.ID)
	}
	return ids
}
// getResolvedIDs returns a copy of the IDs of findings explicitly resolved by
// the LLM during this patrol run.
func (a *patrolFindingCreatorAdapter) getResolvedIDs() []string {
	a.findingsMu.Lock()
	defer a.findingsMu.Unlock()
	ids := make([]string, len(a.resolvedIDs))
	copy(ids, a.resolvedIDs)
	return ids
}
// normalizeFindingKey canonicalizes a finding key: lowercase, with spaces and
// underscores converted to dashes, every other character outside [a-z0-9-]
// dropped, and leading/trailing dashes trimmed. An empty or all-punctuation
// input yields "".
func normalizeFindingKey(key string) string {
	cleaned := strings.TrimSpace(strings.ToLower(key))
	if cleaned == "" {
		return ""
	}
	var out strings.Builder
	out.Grow(len(cleaned))
	for _, r := range cleaned {
		switch {
		case (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '-':
			out.WriteRune(r)
		case r == '_' || r == ' ':
			// Separators become dashes; everything else is silently dropped.
			out.WriteByte('-')
		}
	}
	return strings.Trim(out.String(), "-")
}
// recoverStuckInvestigations detects findings stuck in the "running" state
// for longer than the investigation timeout and resets them to
// "failed/timed_out" so they can be retried. This handles the case where an
// investigation goroutine panics or is killed without properly updating the
// finding status.
func (p *PatrolService) recoverStuckInvestigations() {
	if p.findings == nil {
		return
	}
	// Investigations time out after 10 minutes; allow 5 minutes of grace.
	const stuckThreshold = 15 * time.Minute
	recovered := 0
	for _, f := range p.findings.GetActive(FindingSeverityWarning) {
		stuck := f.InvestigationStatus == string(InvestigationStatusRunning) &&
			f.LastInvestigatedAt != nil &&
			time.Since(*f.LastInvestigatedAt) >= stuckThreshold
		if !stuck {
			continue
		}
		// Running past the threshold: flip to failed/timed-out for retry.
		p.findings.UpdateInvestigation(
			f.ID,
			f.InvestigationSessionID,
			string(InvestigationStatusFailed),
			string(InvestigationOutcomeTimedOut),
			f.LastInvestigatedAt,
			f.InvestigationAttempts,
		)
		recovered++
		log.Warn().
			Str("finding_id", f.ID).
			Str("resource", f.ResourceName).
			Time("last_investigated", *f.LastInvestigatedAt).
			Msg("AI Patrol: Recovered stuck investigation (exceeded timeout)")
	}
	if recovered > 0 {
		log.Info().Int("recovered", recovered).
			Msg("AI Patrol: Recovered stuck investigations")
	}
}
// retryTimedOutInvestigations re-triggers investigation for findings whose
// previous investigation failed with a timeout. Called at the end of each
// patrol run so timed-out investigations get another chance without waiting
// for the full 1-hour cooldown.
func (p *PatrolService) retryTimedOutInvestigations() {
	if p.findings == nil {
		return
	}
	retried := 0
	for _, f := range p.findings.GetActive(FindingSeverityWarning) {
		timedOut := f.InvestigationStatus == string(InvestigationStatusFailed) &&
			f.InvestigationOutcome == string(InvestigationOutcomeTimedOut)
		if !timedOut {
			continue
		}
		p.MaybeInvestigateFinding(f)
		retried++
	}
	if retried > 0 {
		log.Info().Int("retried", retried).
			Msg("AI Patrol: Retried timed-out investigations")
	}
}
// MaybeInvestigateFinding checks whether a finding should be investigated and,
// if so, triggers an autonomous investigation in the background. It is called
// both during scheduled patrol runs and when alert-triggered findings are
// created. The investigation itself runs in a goroutine with a 10-minute
// timeout and is tracked on p.investigationWg for graceful shutdown.
func (p *PatrolService) MaybeInvestigateFinding(f *Finding) {
	p.mu.RLock()
	orchestrator := p.investigationOrchestrator
	aiService := p.aiService
	p.mu.RUnlock()
	// No orchestrator configured
	if orchestrator == nil {
		return
	}
	// Get autonomy level from AI config
	if aiService == nil {
		return
	}
	if aiService.GetConfig() == nil {
		return
	}
	autonomyLevel := aiService.GetEffectivePatrolAutonomyLevel()
	// Check if finding should be investigated
	if !f.ShouldInvestigate(autonomyLevel) {
		return
	}
	// Check if we can start another investigation (concurrency limit)
	if !orchestrator.CanStartInvestigation() {
		log.Debug().
			Str("finding_id", f.ID).
			Msg("Cannot start investigation: max concurrent investigations reached")
		return
	}
	// Convert Finding to shared finding type for the investigation orchestrator
	invFinding := f.ToCoreFinding()
	// Trigger investigation in background with a timeout to prevent indefinite runs.
	// Track with WaitGroup so graceful shutdown can wait for completion.
	p.investigationWg.Add(1)
	go func() {
		defer p.investigationWg.Done()
		// Re-read autonomy level at execution time to avoid using a stale value
		// captured before the goroutine was scheduled.
		currentCfg := aiService.GetConfig()
		if currentCfg == nil {
			log.Warn().Str("finding_id", f.ID).Msg("AI config unavailable at investigation start, aborting")
			return
		}
		currentAutonomy := aiService.GetEffectivePatrolAutonomyLevel()
		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
		defer cancel()
		if err := orchestrator.InvestigateFinding(ctx, invFinding, currentAutonomy); err != nil {
			log.Error().
				Err(err).
				Str("finding_id", f.ID).
				Msg("Failed to start investigation")
			return
		}
		// The orchestrator updates the patrol findings store; sync the latest state to the unified store.
		// This makes fix verification and resolution visible as an actual closed loop in the UI.
		var pushUnified UnifiedFindingCallback
		var resolveUnified func(string)
		var pushCb PushNotifyCallback
		p.mu.RLock()
		pushUnified = p.unifiedFindingCallback
		resolveUnified = p.unifiedFindingResolver
		pushCb = p.pushNotifyCallback
		p.mu.RUnlock()
		if latest := p.findings.Get(f.ID); latest != nil {
			if pushUnified != nil {
				pushUnified(latest)
			}
			if latest.ResolvedAt != nil && resolveUnified != nil {
				resolveUnified(latest.ID)
			}
			// Send push notifications for investigation outcomes
			if pushCb != nil {
				// Map the final investigation outcome to a user-facing notification.
				switch latest.InvestigationOutcome {
				case string(InvestigationOutcomeFixQueued):
					// A fix is queued for approval: attach the approval ID and
					// risk level when the orchestrator can still provide them.
					approvalID := ""
					riskLevel := ""
					if orchestrator != nil {
						if inv := orchestrator.GetInvestigationByFinding(latest.ID); inv != nil {
							approvalID = inv.ApprovalID
							if inv.ProposedFix != nil {
								riskLevel = inv.ProposedFix.RiskLevel
							}
						}
					}
					if approvalID == "" {
						log.Warn().
							Str("finding_id", latest.ID).
							Str("investigation_session_id", latest.InvestigationSessionID).
							Msg("Investigation queued for approval but approval ID missing")
					}
					pushCb(relay.NewApprovalRequestNotification(
						approvalID,
						latest.Title,
						riskLevel,
					))
				case string(InvestigationOutcomeFixExecuted), string(InvestigationOutcomeFixVerified):
					pushCb(relay.NewFixCompletedNotification(latest.ID, latest.Title, true))
				case string(InvestigationOutcomeFixFailed), string(InvestigationOutcomeFixVerificationFailed):
					pushCb(relay.NewFixCompletedNotification(latest.ID, latest.Title, false))
				}
			}
		}
		// Investigation finished successfully. If it produced a proposed fix, persist a
		// remediation plan artifact so the user can review and execute later.
		p.generateRemediationPlanFromInvestigation(f.ID)
	}()
	log.Info().
		Str("finding_id", f.ID).
		Str("severity", string(f.Severity)).
		Str("resource", f.ResourceName).
		Str("autonomy_level", autonomyLevel).
		Msg("Triggered autonomous investigation for finding")
}
// VerifyFixResolved runs a lightweight scoped patrol to check if the issue
// identified by the given finding has been resolved after a fix was executed.
// It bypasses tryStartRun (the patrol mutex) because verification runs inline
// within the investigation goroutine. It also persists a "verification" run
// record and updates the service's last-activity bookkeeping. A non-nil error
// means the verification was inconclusive, not that the issue persists.
func (p *PatrolService) VerifyFixResolved(ctx context.Context, resourceID, resourceType, findingKey, findingID string) (bool, error) {
	if p == nil || !p.hasPatrolRuntimeInputs() {
		return false, fmt.Errorf("%w: no patrol runtime state available", aicontracts.ErrVerificationUnknown)
	}
	if ctx == nil {
		ctx = context.Background()
	}
	startTime := time.Now()
	// Prefer canonical finding details from store when available.
	var finding *Finding
	if p.findings != nil && findingID != "" {
		finding = p.findings.Get(findingID)
	}
	if finding != nil {
		// Backfill any caller-omitted fields from the stored finding.
		if resourceID == "" {
			resourceID = finding.ResourceID
		}
		if resourceType == "" {
			resourceType = finding.ResourceType
		}
		if findingKey == "" {
			findingKey = finding.Key
		}
	}
	log.Info().
		Str("finding_id", findingID).
		Str("resource_id", resourceID).
		Str("key", findingKey).
		Msg("Running deterministic verification to confirm fix")
	verified, verifyErr := p.verifyFixDeterministically(ctx, finding, resourceID, resourceType, findingKey, findingID)
	endTime := time.Now()
	duration := endTime.Sub(startTime)
	// Persist a verification run record for debugging and user transparency.
	status := "healthy"
	summary := "Verification: issue resolved"
	if verifyErr != nil {
		// Inconclusive: the verifier could not determine the outcome.
		status = "error"
		summary = fmt.Sprintf("Verification inconclusive: %v", verifyErr)
	} else if !verified {
		status = "issues_found"
		summary = "Verification: issue still present"
	}
	verifyRecord := PatrolRunRecord{
		ID:                        fmt.Sprintf("%d", startTime.UnixNano()),
		StartedAt:                 startTime,
		CompletedAt:               endTime,
		Duration:                  duration,
		DurationMs:                duration.Milliseconds(),
		Type:                      "verification",
		TriggerReason:             string(TriggerReasonVerification),
		ScopeResourceIDs:          []string{resourceID},
		EffectiveScopeResourceIDs: []string{resourceID},
		ScopeResourceTypes:        []string{resourceType},
		ScopeContext:              fmt.Sprintf("Verifying fix for finding: %s", findingID),
		FindingID:                 findingID,
		ResourcesChecked:          1,
		NewFindings:               0,
		FindingsSummary:           summary,
		Status:                    status,
	}
	// Blank scope fields are dropped from the record rather than stored empty.
	if strings.TrimSpace(resourceID) == "" {
		verifyRecord.ScopeResourceIDs = nil
		verifyRecord.EffectiveScopeResourceIDs = nil
		verifyRecord.ResourcesChecked = 0
	}
	if strings.TrimSpace(resourceType) == "" {
		verifyRecord.ScopeResourceTypes = nil
	}
	if verifyErr != nil {
		verifyRecord.ErrorCount = 1
	}
	if p.runHistoryStore != nil {
		p.runHistoryStore.Add(verifyRecord)
	}
	// Update service-level activity stats so status endpoints reflect this run.
	p.mu.Lock()
	p.lastActivity = endTime
	p.lastDuration = duration
	p.resourcesChecked = verifyRecord.ResourcesChecked
	p.errorCount = verifyRecord.ErrorCount
	p.mu.Unlock()
	return verified, verifyErr
}
// verifyFixDeterministically dispatches to a deterministic verifier based on
// the normalized finding key. State-only verifiers consult the current runtime
// snapshot; tool-based verifiers execute read-only tool calls and parse the
// resulting signals. Returns a wrapped aicontracts.ErrVerificationUnknown when
// no verifier exists for the key or the outcome cannot be determined.
func (p *PatrolService) verifyFixDeterministically(
	ctx context.Context,
	finding *Finding,
	resourceID, resourceType, findingKey, findingID string,
) (bool, error) {
	key := normalizeFindingKey(findingKey)
	if key == "" {
		return false, fmt.Errorf("%w: missing finding key", aicontracts.ErrVerificationUnknown)
	}
	// Snapshot thresholds once under the lock. Previously the metric verifier
	// read p.thresholds without holding p.mu while the signal path read it
	// under RLock; take a single consistent copy here for both paths.
	p.mu.RLock()
	thresholds := p.thresholds
	p.mu.RUnlock()
	// State-only verifiers (no tools required).
	fullState := p.currentPatrolRuntimeState()
	switch key {
	case "backup-stale":
		ok, err := verifyBackupFreshState(fullState, resourceID)
		if err != nil {
			return false, err
		}
		return ok, nil
	case "cpu-high", "memory-high", "disk-high":
		ok, err := verifyMetricRecoveredState(fullState, thresholds, key, resourceID, resourceType)
		if err != nil {
			return false, err
		}
		return ok, nil
	case "guest-unreachable":
		ok, err := p.verifyGuestReachabilityState(ctx, fullState, resourceID)
		if err != nil {
			return false, err
		}
		return ok, nil
	}
	// Tool-based verifiers (deterministic tool calls + deterministic signal parsing).
	executor, execErr := p.getExecutorForVerification()
	if execErr != nil {
		return false, execErr
	}
	sigThresholds := SignalThresholdsFromPatrol(thresholds)
	switch key {
	case "smart-failure":
		node := strings.TrimSpace(resourceID)
		device := ""
		if finding != nil {
			device = strings.TrimSpace(finding.ResourceName)
		}
		return p.verifyBySignals(ctx, executor, sigThresholds, key, node, device)
	case "backup-failed":
		guestID := strings.TrimSpace(resourceID)
		return p.verifyBySignals(ctx, executor, sigThresholds, key, guestID, "")
	default:
		return false, fmt.Errorf("%w: no deterministic verifier for key=%q (finding_id=%s)", aicontracts.ErrVerificationUnknown, key, findingID)
	}
}
// getExecutorForVerification resolves the shared tool executor from the chat
// service so verification probes can run deterministic tool calls. Every
// failure path wraps aicontracts.ErrVerificationUnknown so callers treat a
// missing executor as "inconclusive" rather than "resolved".
func (p *PatrolService) getExecutorForVerification() (*tools.PulseToolExecutor, error) {
	fail := func(msg string) (*tools.PulseToolExecutor, error) {
		return nil, fmt.Errorf("%w: %s", aicontracts.ErrVerificationUnknown, msg)
	}
	if p == nil || p.aiService == nil {
		return fail("AI service unavailable")
	}
	chat := p.aiService.GetChatService()
	if chat == nil {
		return fail("chat service unavailable")
	}
	accessor, ok := chat.(chatServiceExecutorAccessor)
	if !ok {
		return fail("chat service does not expose tool executor")
	}
	executor := accessor.GetExecutor()
	if executor == nil {
		return fail("tool executor unavailable")
	}
	return executor, nil
}
// verifyBySignals executes the deterministic tool probe for the given finding
// key and reports whether the underlying problem still appears in the parsed
// signals. Returns (true, nil) when the signal is gone, (false, nil) when it
// persists, and a wrapped ErrVerificationUnknown error when the probe itself
// cannot be run.
func (p *PatrolService) verifyBySignals(
	ctx context.Context,
	executor *tools.PulseToolExecutor,
	thresholds SignalThresholds,
	findingKey string,
	resourceID string,
	resourceName string,
) (bool, error) {
	if executor == nil {
		return false, fmt.Errorf("%w: tool executor unavailable", aicontracts.ErrVerificationUnknown)
	}
	// Map the finding key to its read-only probe.
	var toolName string
	args := map[string]interface{}{}
	switch findingKey {
	case "smart-failure":
		toolName = "pulse_storage"
		args["type"] = "disk_health"
		if strings.TrimSpace(resourceID) != "" {
			args["node"] = resourceID
		}
	case "backup-failed":
		toolName = "pulse_storage"
		args["type"] = "backup_tasks"
		if strings.TrimSpace(resourceID) != "" {
			args["guest_id"] = resourceID
		}
	default:
		return false, fmt.Errorf("%w: unhandled signal verifier key=%q", aicontracts.ErrVerificationUnknown, findingKey)
	}
	record, err := executeToolCall(ctx, executor, toolName, args)
	if err != nil {
		return false, err
	}
	// Scan the detected signals for one that still matches this finding.
	for _, s := range DetectSignals([]ToolCallRecord{record}, thresholds) {
		switch findingKey {
		case "smart-failure":
			if s.SignalType != SignalSMARTFailure {
				continue
			}
			// Case-insensitive device-name match; an empty name matches any disk.
			if resourceName == "" ||
				strings.TrimSpace(strings.ToLower(s.ResourceName)) == strings.TrimSpace(strings.ToLower(resourceName)) {
				return false, nil
			}
		case "backup-failed":
			if s.SignalType == SignalBackupFailed && (resourceID == "" || s.ResourceID == resourceID) {
				return false, nil
			}
		}
	}
	return true, nil
}
// executeToolCall runs a single tool invocation for verification and converts
// the outcome into a ToolCallRecord. Execution failures, tool-level errors,
// and non-JSON output from JSON-producing tools all surface as wrapped
// ErrVerificationUnknown errors, so verification stays inconclusive rather
// than falsely reporting "resolved".
func executeToolCall(ctx context.Context, executor *tools.PulseToolExecutor, toolName string, args map[string]interface{}) (ToolCallRecord, error) {
	if executor == nil {
		return ToolCallRecord{}, fmt.Errorf("%w: tool executor unavailable", aicontracts.ErrVerificationUnknown)
	}
	if toolName == "" {
		return ToolCallRecord{}, fmt.Errorf("%w: missing tool name", aicontracts.ErrVerificationUnknown)
	}
	if args == nil {
		args = map[string]interface{}{}
	}
	encodedArgs, _ := json.Marshal(args)
	startMs := time.Now().UnixMilli()
	result, execErr := executor.ExecuteTool(ctx, toolName, args)
	if execErr != nil {
		return ToolCallRecord{}, fmt.Errorf("%w: tool execution failed (%s): %v", aicontracts.ErrVerificationUnknown, toolName, execErr)
	}
	output := formatToolResult(result)
	endMs := time.Now().UnixMilli()
	if result.IsError {
		return ToolCallRecord{}, fmt.Errorf("%w: tool returned error (%s): %s", aicontracts.ErrVerificationUnknown, toolName, output)
	}
	// Most verification probes rely on parsing structured JSON outputs. If we receive
	// non-JSON text, treat verification as inconclusive rather than "resolved".
	switch toolName {
	case "pulse_storage", "pulse_metrics", "pulse_alerts":
		if !isValidJSON(output) {
			return ToolCallRecord{}, fmt.Errorf("%w: tool returned non-JSON output (%s)", aicontracts.ErrVerificationUnknown, toolName)
		}
	}
	return ToolCallRecord{
		ID:        fmt.Sprintf("verify-%d", time.Now().UnixNano()),
		ToolName:  toolName,
		Input:     truncateString(string(encodedArgs), MaxToolInputSize),
		Output:    output,
		Success:   true, // only reached when the tool ran and reported no error
		StartTime: startMs,
		EndTime:   endMs,
		Duration:  endMs - startMs,
	}, nil
}
// isValidJSON reports whether s (after trimming whitespace) is a parseable
// JSON object or array. Bare scalars ("123", "true") are rejected up front by
// the leading-character check.
func isValidJSON(s string) bool {
	trimmed := strings.TrimSpace(s)
	if trimmed == "" || (trimmed[0] != '{' && trimmed[0] != '[') {
		return false
	}
	var parsed interface{}
	return json.Unmarshal([]byte(trimmed), &parsed) == nil
}
// verifyBackupFreshState reports whether the guest's most recent backup is no
// older than 48 hours. When the guest cannot be located in the runtime
// snapshot (or has never been backed up), verification is inconclusive and a
// wrapped ErrVerificationUnknown is returned.
func verifyBackupFreshState(snap patrolRuntimeState, guestID string) (bool, error) {
	id := strings.TrimSpace(guestID)
	if id == "" {
		return false, fmt.Errorf("%w: missing guest id", aicontracts.ErrVerificationUnknown)
	}
	details, found := patrolLookupGuestRuntimeDetails(snap, id)
	if !found || details.lastBackup.IsZero() {
		// If the guest cannot be found, verification can't be concluded deterministically.
		return false, fmt.Errorf("%w: guest not found for backup verification (%s)", aicontracts.ErrVerificationUnknown, id)
	}
	const freshWindow = 48 * time.Hour
	return time.Since(details.lastBackup) <= freshWindow, nil
}
// verifyMetricRecoveredState checks whether the metric behind a
// cpu-high/memory-high/disk-high finding has dropped back below its warning
// threshold (with a 5% margin to avoid flapping). Physical disks are judged
// on health/wearout/temperature rather than usage. A wrapped
// ErrVerificationUnknown is returned when the resource (or the relevant
// metric) cannot be located.
func verifyMetricRecoveredState(snap patrolRuntimeState, thresholds PatrolThresholds, key, resourceID, resourceType string) (bool, error) {
	rid := strings.TrimSpace(resourceID)
	if rid == "" {
		return false, fmt.Errorf("%w: missing resource id", aicontracts.ErrVerificationUnknown)
	}
	// Use a small margin to avoid flapping around exact thresholds.
	const margin = 0.95
	metrics, ok := patrolLookupResourceMetricsForType(snap, rid, resourceType)
	if ok {
		switch key {
		case "cpu-high":
			if value, exists := metrics["cpu"]; exists {
				return value < thresholds.NodeCPUWarning*margin, nil
			}
		case "memory-high":
			value, exists := metrics["memory"]
			if !exists {
				break // fall through to the inconclusive error below
			}
			// Nodes and guests use separate memory thresholds.
			if resourceType == "node" {
				return value < thresholds.NodeMemWarning*margin, nil
			}
			return value < thresholds.GuestMemWarning*margin, nil
		case "disk-high":
			if resourceType == "physical_disk" {
				if disk, exists := patrolLookupPhysicalDiskVerificationState(snap, rid); exists {
					// Any explicit health string other than PASSED/UNKNOWN/OK
					// means the disk is still failing.
					if disk.health != "" && !strings.EqualFold(disk.health, "PASSED") && !strings.EqualFold(disk.health, "UNKNOWN") && !strings.EqualFold(disk.health, "OK") {
						return false, nil
					}
					// Below 20% remaining endurance still counts as a problem;
					// a negative wearout means the drive did not report it.
					if disk.wearout >= 0 && disk.wearout < 20 {
						return false, nil
					}
					// Temperatures above 55 are treated as unresolved.
					if disk.temperature > 55 {
						return false, nil
					}
					return true, nil
				}
				break
			}
			if resourceType == "storage" {
				if value, exists := metrics["usage"]; exists {
					return value < thresholds.StorageWarning*margin, nil
				}
				break
			}
			if value, exists := metrics["disk"]; exists {
				return value < thresholds.GuestDiskWarn*margin, nil
			}
		}
	}
	// If we can't locate the resource, verification is inconclusive.
	return false, fmt.Errorf("%w: resource not found for metric verification (%s)", aicontracts.ErrVerificationUnknown, rid)
}
// patrolPhysicalDiskVerification is the minimal disk health snapshot used by
// the deterministic "disk-high" verifier.
type patrolPhysicalDiskVerification struct {
	health      string // SMART health string (e.g. "PASSED"); empty when unknown
	wearout     int    // remaining endurance percentage; negative appears to mean "not reported" — see verifyMetricRecoveredState
	temperature int    // drive temperature (presumably °C; values > 55 are treated as unhealthy)
}
// patrolPhysicalDiskVisitor receives one disk's identifiers and verification
// state per call; returning false stops the walk early.
type patrolPhysicalDiskVisitor func(identifiers []string, verification patrolPhysicalDiskVerification) bool
// patrolLookupPhysicalDiskVerificationState finds the verification state of
// the physical disk matching resourceID in the runtime snapshot. The boolean
// reports whether a matching disk was found.
func patrolLookupPhysicalDiskVerificationState(snap patrolRuntimeState, resourceID string) (patrolPhysicalDiskVerification, bool) {
	return patrolLookupPhysicalDiskVerificationWithVisitor(resourceID, func(visit patrolPhysicalDiskVisitor) bool {
		return patrolVisitPhysicalDiskVerification(snap, visit)
	})
}
// verifyGuestReachabilityState pings the guest's known IP via the agent
// running on its host node. Returns (true, nil) when the guest answers,
// (false, nil) when it does not, and a wrapped ErrVerificationUnknown when a
// probe cannot be attempted (no prober, unknown guest, no agent, probe error,
// or missing ping result).
func (p *PatrolService) verifyGuestReachabilityState(ctx context.Context, snap patrolRuntimeState, guestID string) (bool, error) {
	p.mu.RLock()
	prober := p.guestProber
	p.mu.RUnlock()
	if prober == nil {
		return false, fmt.Errorf("%w: guest prober not configured", aicontracts.ErrVerificationUnknown)
	}
	id := strings.TrimSpace(guestID)
	if id == "" {
		return false, fmt.Errorf("%w: missing guest id", aicontracts.ErrVerificationUnknown)
	}
	details, found := patrolLookupGuestRuntimeDetails(snap, id)
	if !found || details.node == "" || details.ip == "" {
		return false, fmt.Errorf("%w: missing node/ip for guest reachability verification (guest=%s)", aicontracts.ErrVerificationUnknown, id)
	}
	agentID, haveAgent := prober.GetAgentForHost(details.node)
	if !haveAgent || strings.TrimSpace(agentID) == "" {
		return false, fmt.Errorf("%w: no agent available for host %s", aicontracts.ErrVerificationUnknown, details.node)
	}
	results, err := prober.PingGuests(ctx, agentID, []string{details.ip})
	if err != nil {
		return false, fmt.Errorf("%w: reachability probe failed: %v", aicontracts.ErrVerificationUnknown, err)
	}
	res, have := results[details.ip]
	if !have {
		return false, fmt.Errorf("%w: missing ping result for %s", aicontracts.ErrVerificationUnknown, details.ip)
	}
	return res.Reachable, nil
}
// patrolGuestRuntimeDetails carries the guest fields needed by the
// deterministic backup-freshness and reachability verifiers.
type patrolGuestRuntimeDetails struct {
	lastBackup time.Time // zero when no backup timestamp is known
	node       string    // host node name, used to pick the probing agent
	ip         string    // guest IP used for reachability probes; may be empty
}
// patrolGuestRuntimeDetailsVisitor receives one guest's identifiers and
// details per call; returning false stops the walk early.
type patrolGuestRuntimeDetailsVisitor func(identifiers []string, details patrolGuestRuntimeDetails) bool
// patrolMetricVisitor receives one resource's identifiers and metric map per
// call; returning false stops the walk early.
type patrolMetricVisitor func(identifiers []string, metrics map[string]float64) bool
// patrolActionabilityResourceMetrics builds an index of current metrics keyed
// by every known identifier (ID and name) of each resource in the snapshot,
// augmented with physical-disk identifiers. The boolean reports whether any
// metric-bearing inventory rows were seen at all.
func patrolActionabilityResourceMetrics(snap patrolRuntimeState) (map[string]map[string]float64, bool) {
	resourceMetrics := make(map[string]map[string]float64)
	hasInventory := patrolVisitMetrics(snap, func(identifiers []string, metrics map[string]float64) bool {
		patrolRegisterResourceMetrics(resourceMetrics, metrics, identifiers...)
		return true // never stop early; we want the full index
	})
	return patrolAugmentActionabilityMetricsWithPhysicalDisks(resourceMetrics, snap), hasInventory
}
// patrolAugmentActionabilityMetricsWithPhysicalDisks registers every physical
// disk (under its id, name, device path, and model) in the metrics index with
// an empty metric set, so disk-backed findings are recognized as belonging to
// known inventory. A nil destination map is allocated on demand.
func patrolAugmentActionabilityMetricsWithPhysicalDisks(dest map[string]map[string]float64, snap patrolRuntimeState) map[string]map[string]float64 {
	if dest == nil {
		dest = make(map[string]map[string]float64)
	}
	for _, d := range patrolPhysicalDiskRows(snap, nil) {
		patrolRegisterResourceMetrics(dest, map[string]float64{}, d.id, d.name, d.devPath, d.model)
	}
	return dest
}
// patrolLookupGuestRuntimeDetails finds the runtime details (last backup,
// node, IP) of the guest matching guestID in the runtime snapshot. The
// boolean reports whether a matching guest was found.
func patrolLookupGuestRuntimeDetails(snap patrolRuntimeState, guestID string) (patrolGuestRuntimeDetails, bool) {
	return patrolLookupGuestRuntimeDetailsWithVisitor(guestID, func(visit patrolGuestRuntimeDetailsVisitor) bool {
		return patrolVisitGuestRuntimeDetails(snap, visit)
	})
}
// patrolLookupResourceMetrics finds the metrics of the resource matching
// resourceID, scanning all inventories (nodes, VMs, containers, storage).
// The boolean reports whether a matching resource was found.
func patrolLookupResourceMetrics(snap patrolRuntimeState, resourceID string) (map[string]float64, bool) {
	return patrolLookupMetricsWithVisitor(resourceID, func(visit patrolMetricVisitor) bool {
		return patrolVisitMetrics(snap, visit)
	})
}
// patrolLookupResourceMetricsForType looks up a resource's current metrics,
// restricting the search to the inventory matching the declared resource
// type. Unknown or empty types fall back to scanning every inventory.
func patrolLookupResourceMetricsForType(snap patrolRuntimeState, resourceID, resourceType string) (map[string]float64, bool) {
	// Helper binding the snapshot to a specific inventory walker.
	lookup := func(walk func(patrolRuntimeState, patrolMetricVisitor) bool) (map[string]float64, bool) {
		return patrolLookupMetricsWithVisitor(resourceID, func(visit patrolMetricVisitor) bool {
			return walk(snap, visit)
		})
	}
	switch strings.ToLower(strings.TrimSpace(resourceType)) {
	case "node", "agent":
		return lookup(patrolVisitNodeMetrics)
	case "vm":
		return lookup(func(s patrolRuntimeState, v patrolMetricVisitor) bool {
			return patrolVisitGuestMetrics(s, "VM", v)
		})
	case "container", "system-container":
		return lookup(func(s patrolRuntimeState, v patrolMetricVisitor) bool {
			return patrolVisitGuestMetrics(s, "Container", v)
		})
	case "storage":
		return lookup(patrolVisitStorageMetrics)
	case "physical_disk":
		return patrolLookupPhysicalDiskMetricsState(snap, resourceID)
	default:
		return patrolLookupResourceMetrics(snap, resourceID)
	}
}
// patrolLookupPhysicalDiskMetricsState returns an empty metric map when the
// physical disk identified by resourceID exists in the snapshot (disks carry
// health state rather than utilization metrics), or (nil, false) otherwise.
func patrolLookupPhysicalDiskMetricsState(snap patrolRuntimeState, resourceID string) (map[string]float64, bool) {
	if _, ok := patrolLookupPhysicalDiskVerificationState(snap, resourceID); ok {
		return map[string]float64{}, true
	}
	return nil, false
}
// patrolLookupPhysicalDiskVerificationWithVisitor walks physical-disk rows via
// walk and returns the verification state of the first disk whose identifiers
// (id, name, device path, or model) match resourceID after trimming. The
// boolean reports whether a match was found.
func patrolLookupPhysicalDiskVerificationWithVisitor(resourceID string, walk func(patrolPhysicalDiskVisitor) bool) (patrolPhysicalDiskVerification, bool) {
	// Trim the needle once; previously it was re-trimmed on every comparison.
	target := strings.TrimSpace(resourceID)
	found := false
	var result patrolPhysicalDiskVerification
	walk(func(identifiers []string, verification patrolPhysicalDiskVerification) bool {
		for _, identifier := range identifiers {
			if strings.TrimSpace(identifier) != target {
				continue
			}
			result = verification
			found = true
			return false // match found: stop walking
		}
		return true
	})
	return result, found
}
// patrolVisitPhysicalDiskVerification feeds every physical-disk row to visit
// (identifiers plus trimmed health state), stopping early if visit returns
// false. Reports whether any disks were present in the snapshot.
func patrolVisitPhysicalDiskVerification(snap patrolRuntimeState, visit patrolPhysicalDiskVisitor) bool {
	rows := patrolPhysicalDiskRows(snap, nil)
	for _, d := range rows {
		ids := []string{d.id, d.name, d.devPath, d.model}
		state := patrolPhysicalDiskVerification{
			health:      strings.TrimSpace(d.health),
			wearout:     d.wearout,
			temperature: d.temperature,
		}
		if !visit(ids, state) {
			return true
		}
	}
	return len(rows) > 0
}
// patrolLookupGuestRuntimeDetailsWithVisitor walks guest rows via walk and
// returns the runtime details of the first guest whose identifiers (ID, name,
// or VMID) match guestID after trimming. The boolean reports whether a match
// was found.
func patrolLookupGuestRuntimeDetailsWithVisitor(guestID string, walk func(patrolGuestRuntimeDetailsVisitor) bool) (patrolGuestRuntimeDetails, bool) {
	// Trim the needle once; previously it was re-trimmed on every comparison.
	target := strings.TrimSpace(guestID)
	found := false
	var result patrolGuestRuntimeDetails
	walk(func(identifiers []string, details patrolGuestRuntimeDetails) bool {
		for _, identifier := range identifiers {
			if strings.TrimSpace(identifier) != target {
				continue
			}
			result = details
			found = true
			return false // match found: stop walking
		}
		return true
	})
	return result, found
}
// patrolVisitGuestRuntimeDetails feeds each guest's identifiers (ID, name,
// and VMID when positive) plus its runtime details to visit, stopping early
// if visit returns false. Reports whether any guests were present.
func patrolVisitGuestRuntimeDetails(snap patrolRuntimeState, visit patrolGuestRuntimeDetailsVisitor) bool {
	rows := patrolGuestInventoryRows(snap, nil, nil)
	for _, g := range rows {
		ids := []string{g.id, g.name}
		if g.vmid > 0 {
			ids = append(ids, fmt.Sprintf("%d", g.vmid))
		}
		details := patrolGuestRuntimeDetails{
			lastBackup: g.lastBackup,
			node:       g.node,
			ip:         g.ip,
		}
		if !visit(ids, details) {
			return true
		}
	}
	return len(rows) > 0
}
// patrolLookupMetricsWithVisitor walks metric rows via walk and returns the
// metric map of the first resource whose identifiers match resourceID after
// trimming. The boolean reports whether a match was found.
func patrolLookupMetricsWithVisitor(resourceID string, walk func(patrolMetricVisitor) bool) (map[string]float64, bool) {
	// Trim the needle once; previously it was re-trimmed on every comparison.
	target := strings.TrimSpace(resourceID)
	found := false
	var result map[string]float64
	walk(func(identifiers []string, metrics map[string]float64) bool {
		for _, identifier := range identifiers {
			if strings.TrimSpace(identifier) != target {
				continue
			}
			result = metrics
			found = true
			return false // match found: stop walking
		}
		return true
	})
	return result, found
}
// patrolVisitMetrics walks every metric-bearing inventory (nodes, VMs,
// containers, storage pools) with the given visitor. It returns true when at
// least one inventory contained rows. Note that every inventory is walked
// even if the visitor stops an individual walker early.
func patrolVisitMetrics(snap patrolRuntimeState, visit patrolMetricVisitor) bool {
	hasInventory := patrolVisitNodeMetrics(snap, visit)
	if patrolVisitGuestMetrics(snap, "VM", visit) {
		hasInventory = true
	}
	if patrolVisitGuestMetrics(snap, "Container", visit) {
		hasInventory = true
	}
	if patrolVisitStorageMetrics(snap, visit) {
		hasInventory = true
	}
	return hasInventory
}
// patrolVisitNodeMetrics feeds each node's CPU (always) and memory (only when
// positive) metrics to visit, stopping early if visit returns false. Reports
// whether any nodes were present.
func patrolVisitNodeMetrics(snap patrolRuntimeState, visit patrolMetricVisitor) bool {
	rows := patrolNodeInventoryRows(snap, nil)
	for _, n := range rows {
		m := map[string]float64{"cpu": n.cpu}
		if n.mem > 0 {
			m["memory"] = n.mem
		}
		if !visit([]string{n.id, n.name}, m) {
			return true
		}
	}
	return len(rows) > 0
}
// patrolVisitGuestMetrics feeds cpu/memory/disk metrics for each guest of the
// given type ("VM" or "Container") to visit, stopping early if visit returns
// false. Reports whether any guest of that type was present.
func patrolVisitGuestMetrics(snap patrolRuntimeState, guestType string, visit patrolMetricVisitor) bool {
	matched := 0
	for _, g := range patrolGuestInventoryRows(snap, nil, nil) {
		if g.gType != guestType {
			continue
		}
		matched++
		metrics := map[string]float64{
			"cpu":    g.cpu,
			"memory": g.mem,
			"disk":   g.disk,
		}
		if !visit([]string{g.id, g.name}, metrics) {
			return true
		}
	}
	return matched > 0
}
// patrolVisitStorageMetrics feeds each storage pool's usage metric to visit,
// stopping early if visit returns false. Reports whether any pools existed.
func patrolVisitStorageMetrics(snap patrolRuntimeState, visit patrolMetricVisitor) bool {
	rows := patrolStoragePoolRows(snap, nil)
	for _, pool := range rows {
		metrics := map[string]float64{"usage": pool.usage}
		if !visit([]string{pool.id, pool.name}, metrics) {
			return true
		}
	}
	return len(rows) > 0
}
// patrolRegisterResourceMetrics maps every non-blank identifier (after
// trimming) to the same shared metrics map in dest. Blank identifiers are
// skipped so empty IDs/names never pollute the index.
func patrolRegisterResourceMetrics(dest map[string]map[string]float64, metrics map[string]float64, identifiers ...string) {
	for _, id := range identifiers {
		if trimmed := strings.TrimSpace(id); trimmed != "" {
			dest[trimmed] = metrics
		}
	}
}
// patrolGuestMatches reports whether guestID identifies the guest by exact
// match against its ID, its name, or its numeric VMID rendered as a string.
func patrolGuestMatches(guestID, id, name string, vmid int) bool {
	switch guestID {
	case id, name, fmt.Sprintf("%d", vmid):
		return true
	}
	return false
}
// patrolFirstIP returns the first entry of ips, or "" when the slice is nil
// or empty.
func patrolFirstIP(ips []string) string {
	if len(ips) > 0 {
		return ips[0]
	}
	return ""
}