// patrol_findings.go manages the finding lifecycle: creation, resolution, dismissal,
// remediation plan generation, investigation triggering and verification,
// and the adapter types that bridge patrol findings to the investigation subsystem.
package ai

import (
	"context"
	"encoding/json"
	"fmt"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/rcourtman/pulse-go-rewrite/internal/ai/baseline"
	"github.com/rcourtman/pulse-go-rewrite/internal/ai/safety"
	"github.com/rcourtman/pulse-go-rewrite/internal/ai/tools"
	"github.com/rcourtman/pulse-go-rewrite/internal/relay"
	"github.com/rcourtman/pulse-go-rewrite/pkg/aicontracts"
	"github.com/rs/zerolog/log"
)

const (
	patrolRuntimeFindingKey = "ai-patrol-error"
	patrolRuntimeResourceID = "ai-service"
)

func patrolFindingUsesSyntheticRuntimeResource(f *Finding) bool {
	return f != nil && (f.Key == patrolRuntimeFindingKey || f.ResourceID == patrolRuntimeResourceID)
}

func patrolRuntimeFindingManualActionError(action string) error {
	return fmt.Errorf(
		"Patrol runtime findings cannot be %s manually; update AI settings and rerun Patrol",
		action,
	)
}

// recordFinding stores a finding, syncs it to the unified store, and triggers follow-up actions.
func (p *PatrolService) recordFinding(f *Finding) bool {
	if p == nil || p.findings == nil || f == nil {
		return false
	}

	isNew := p.findings.Add(f)
	stored := p.findings.Get(f.ID)
	if stored == nil {
		return false
	}

	if isNew {
		log.Info().
			Str("finding_id", stored.ID).
			Str("severity", string(stored.Severity)).
			Str("resource", stored.ResourceName).
			Str("title", stored.Title).
			Msg("AI Patrol: New finding")

		// Generate remediation plan for actionable findings
		// Skip internal error findings (not actionable by users)
		if !patrolFindingUsesSyntheticRuntimeResource(stored) {
			p.generateRemediationPlan(stored)
		}

		// Send push notification for new critical/warning findings
		if stored.Severity == FindingSeverityCritical || stored.Severity == FindingSeverityWarning {
			p.mu.RLock()
			pushCb := p.pushNotifyCallback
			p.mu.RUnlock()
			if pushCb != nil {
				pushCb(relay.NewPatrolFindingNotification(
					stored.ID,
					string(stored.Severity),
					string(stored.Category),
					stored.Title,
				))
			}
		}
	}

	// Keep unified store in sync even when findings transition to snoozed/dismissed/resolved.
	// The unified UI can filter by status; losing updates here makes the patrol loop look broken.
	if p.unifiedFindingCallback != nil {
		p.unifiedFindingCallback(stored)
	}

	// Trigger autonomous investigation if enabled and finding warrants it
	p.MaybeInvestigateFinding(stored)

	return isNew
}

// RejectManualActionForRuntimeFinding fails closed when a manual lifecycle action targets
// a Patrol-owned runtime finding such as the synthetic ai-service provider/runtime error.
func (p *PatrolService) RejectManualActionForRuntimeFinding(findingID string, action string) error {
	if p == nil || p.findings == nil {
		return fmt.Errorf("finding not found: %s", findingID)
	}
	finding := p.findings.Get(findingID)
	if finding == nil {
		return fmt.Errorf("finding not found: %s", findingID)
	}
	if patrolFindingUsesSyntheticRuntimeResource(finding) {
		return patrolRuntimeFindingManualActionError(action)
	}
	return nil
}

func (p *PatrolService) setBlockedReason(reason string) {
	if reason == "" {
		return
	}
	p.mu.Lock()
	p.lastBlockedReason = reason
	p.lastBlockedAt = time.Now()
	p.mu.Unlock()
}

func (p *PatrolService) clearBlockedReason() {
	p.mu.Lock()
	p.lastBlockedReason = ""
	p.lastBlockedAt = time.Time{}
	p.mu.Unlock()
}

// generateRemediationPlan creates a remediation plan for a finding if appropriate.
// Only generates plans for critical/warning findings when a remediation engine is configured.
func (p *PatrolService) generateRemediationPlan(finding *Finding) {
	p.mu.RLock()
	engine := p.remediationEngine
	p.mu.RUnlock()

	if engine == nil {
		return
	}

	// Only generate plans for actionable findings
	if finding.Severity != FindingSeverityCritical && finding.Severity != FindingSeverityWarning {
		return
	}

	// Generate remediation steps based on finding category and resource type
	steps := p.generateRemediationSteps(finding)
	if len(steps) == 0 {
		return
	}

	// Determine risk level based on finding severity and category
	riskLevel := aicontracts.RiskLow
	if finding.Severity == FindingSeverityWarning {
		riskLevel = aicontracts.RiskMedium
	}
	if finding.Severity == FindingSeverityCritical {
		riskLevel = aicontracts.RiskHigh
	}
	// Reliability issues involving restarts/reboots are higher risk
	if finding.Category == FindingCategoryReliability {
		title := strings.ToLower(finding.Title)
		if strings.Contains(title, "restart") || strings.Contains(title, "reboot") || strings.Contains(title, "offline") {
			if riskLevel < aicontracts.RiskHigh {
				riskLevel = aicontracts.RiskHigh
			}
		} else if riskLevel < aicontracts.RiskMedium {
			riskLevel = aicontracts.RiskMedium
		}
	}

	// Create the remediation plan
	plan := &aicontracts.RemediationPlan{
		FindingID:   finding.ID,
		ResourceID:  finding.ResourceID,
		Title:       fmt.Sprintf("Fix: %s", finding.Title),
		Description: finding.Description,
		Category:    aicontracts.CategoryGuided, // All auto-generated plans require user approval
		RiskLevel:   riskLevel,
		Steps:       steps,
		Rationale:   finding.Recommendation,
	}

	// Add warnings based on risk level
	if riskLevel == aicontracts.RiskHigh {
		plan.Warnings = append(plan.Warnings, "High risk: This action may cause service disruption. Review carefully and consider scheduling during a maintenance window.")
	} else if riskLevel == aicontracts.RiskMedium {
		plan.Warnings = append(plan.Warnings, "Review steps carefully before execution")
	}

	if err := engine.CreatePlan(plan); err != nil {
		log.Debug().
			Err(err).
			Str("finding_id", finding.ID).
			Str("resource", finding.ResourceName).
			Msg("AI Patrol: Failed to create remediation plan")
		return
	}

	log.Info().
		Str("plan_id", plan.ID).
		Str("finding_id", finding.ID).
		Str("resource", finding.ResourceName).
		Int("steps", len(steps)).
		Msg("AI Patrol: Remediation plan generated")
}

// generateRemediationPlanFromInvestigation persists a remediation plan artifact when
// an investigation proposes a concrete fix command. This is intentionally separate
// from the "approval" execution pipeline; it's a durable summary users can act on
// later (often via Pulse Assistant).
func (p *PatrolService) generateRemediationPlanFromInvestigation(findingID string) {
	p.mu.RLock()
	engine := p.remediationEngine
	orchestrator := p.investigationOrchestrator
	p.mu.RUnlock()

	if engine == nil || orchestrator == nil || p.findings == nil || findingID == "" {
		return
	}

	finding := p.findings.Get(findingID)
	if finding == nil {
		return
	}

	inv := orchestrator.GetInvestigationByFinding(findingID)
	if inv == nil || inv.ProposedFix == nil || len(inv.ProposedFix.Commands) == 0 {
		return
	}
	fix := inv.ProposedFix

	targetHost := strings.TrimSpace(fix.TargetHost)
	if targetHost == "" {
		targetHost = "local"
	}

	// Map investigation risk strings into remediation risk levels.
	riskLevel := aicontracts.RiskMedium
	switch strings.ToLower(strings.TrimSpace(fix.RiskLevel)) {
	case "low":
		riskLevel = aicontracts.RiskLow
	case "medium":
		riskLevel = aicontracts.RiskMedium
	case "high":
		riskLevel = aicontracts.RiskHigh
	case "critical":
		riskLevel = aicontracts.RiskHigh
	}

	steps := make([]aicontracts.RemediationStep, 0, 2+len(fix.Commands))
	steps = append(steps, aicontracts.RemediationStep{
		Order:       1,
		Description: "Review the finding context and confirm the proposed fix is appropriate",
	})

	blockedCount := 0
	order := 2
	for _, raw := range fix.Commands {
		cmd := strings.TrimSpace(raw)
		if cmd == "" {
			continue
		}

		stepCommand := cmd
		stepDesc := fmt.Sprintf("Run the proposed fix on %s", targetHost)
		if safety.IsBlockedCommand(cmd) {
			// Don't store blocked commands in the remediation engine; keep the plan as an
			// artifact for users to review and apply manually (typically via Assistant).
			stepCommand = ""
			stepDesc = fmt.Sprintf("Blocked command proposed by investigation (review and apply manually): %s", cmd)
			blockedCount++
		}

		steps = append(steps, aicontracts.RemediationStep{
			Order:       order,
			Description: stepDesc,
			Command:     stepCommand,
			Target:      targetHost,
		})
		order++
	}

	steps = append(steps, aicontracts.RemediationStep{
		Order:       order,
		Description: "Verify the issue is resolved (re-check metrics/logs, confirm service health)",
	})

	description := strings.TrimSpace(fix.Rationale)
	if description == "" {
		description = strings.TrimSpace(inv.Summary)
	}
	if description == "" {
		description = finding.Description
	}

	plan := &aicontracts.RemediationPlan{
		FindingID:   finding.ID,
		ResourceID:  finding.ResourceID,
		Title:       fmt.Sprintf("Investigation Fix: %s", finding.Title),
		Description: description,
		Category:    aicontracts.CategoryGuided,
		RiskLevel:   riskLevel,
		Steps:       steps,
		Rationale:   fix.Description,
	}

	// Patrol findings are often reviewed hours/days later; keep investigation-derived
	// plans around longer than the default ephemeral remediation TTL.
	expires := time.Now().Add(7 * 24 * time.Hour)
	plan.ExpiresAt = &expires

	if blockedCount > 0 {
		plan.Warnings = append(plan.Warnings, "Investigation suggested one or more commands that are blocked by safety policy. Review carefully and apply manually (prefer Pulse Assistant).")
	}

	if err := engine.CreatePlan(plan); err != nil {
		// As a fallback, keep the plan as purely informational so it can still be
		// surfaced to the user without enabling remediation engine execution.
		for i := range plan.Steps {
			if plan.Steps[i].Command == "" {
				continue
			}
			plan.Steps[i].Description = fmt.Sprintf("%s: %s", plan.Steps[i].Description, plan.Steps[i].Command)
			plan.Steps[i].Command = ""
		}
		plan.RiskLevel = aicontracts.RiskMedium
		plan.Warnings = append(plan.Warnings, fmt.Sprintf("Failed to store command steps for automated remediation: %v", err))
		if createErr := engine.CreatePlan(plan); createErr != nil {
			log.Warn().Err(createErr).Str("findingID", finding.ID).Msg("failed to create fallback remediation plan")
		}
	}
}

// generateRemediationSteps creates appropriate steps based on finding type
func (p *PatrolService) generateRemediationSteps(finding *Finding) []aicontracts.RemediationStep {
	var steps []aicontracts.RemediationStep

	switch finding.Category {
	case FindingCategoryPerformance:
		steps = p.generatePerformanceSteps(finding)
	case FindingCategoryCapacity:
		steps = p.generateCapacitySteps(finding)
	case FindingCategoryReliability:
		steps = p.generateAvailabilitySteps(finding)
	case FindingCategoryBackup:
		steps = p.generateBackupSteps(finding)
	case FindingCategorySecurity:
		steps = p.generateSecuritySteps(finding)
	case FindingCategoryGeneral:
		steps = p.generateConfigurationSteps(finding)
	default:
		// Generic investigation steps for unknown categories
		steps = []aicontracts.RemediationStep{
			{Order: 1, Description: "Investigate the issue by reviewing current resource state"},
			{Order: 2, Description: "Review recent changes that may have caused this issue"},
			{Order: 3, Description: "Take appropriate corrective action based on findings"},
		}
	}

	return steps
}

// generatePerformanceSteps creates steps for performance issues
func (p *PatrolService) generatePerformanceSteps(finding *Finding) []aicontracts.RemediationStep {
	title := strings.ToLower(finding.Title)

	if strings.Contains(title, "cpu") {
		return []aicontracts.RemediationStep{
			{Order: 1, Description: "Identify processes consuming excessive CPU", Target: finding.ResourceID},
			{Order: 2, Description: "Check if resource needs more CPU cores allocated"},
			{Order: 3, Description: "Consider migrating to a less loaded host if VM/container"},
			{Order: 4, Description: "Optimize or restart resource-hungry applications"},
		}
	}

	if strings.Contains(title, "memory") || strings.Contains(title, "ram") {
		return []aicontracts.RemediationStep{
			{Order: 1, Description: "Identify processes consuming excessive memory", Target: finding.ResourceID},
			{Order: 2, Description: "Check for memory leaks in running applications"},
			{Order: 3, Description: "Consider increasing allocated memory"},
			{Order: 4, Description: "Restart affected services to reclaim memory"},
		}
	}

	if strings.Contains(title, "io") || strings.Contains(title, "disk") {
		return []aicontracts.RemediationStep{
			{Order: 1, Description: "Identify processes causing high disk I/O", Target: finding.ResourceID},
			{Order: 2, Description: "Check for runaway log files or heavy writes"},
			{Order: 3, Description: "Consider migrating to faster storage"},
		}
	}

	// Generic performance steps
	return []aicontracts.RemediationStep{
		{Order: 1, Description: "Review current resource utilization metrics", Target: finding.ResourceID},
		{Order: 2, Description: "Identify performance bottlenecks"},
		{Order: 3, Description: "Optimize resource allocation or application configuration"},
	}
}

// generateCapacitySteps creates steps for capacity issues
func (p *PatrolService) generateCapacitySteps(finding *Finding) []aicontracts.RemediationStep {
	title := strings.ToLower(finding.Title)

	if strings.Contains(title, "disk") || strings.Contains(title, "storage") {
		return []aicontracts.RemediationStep{
			{Order: 1, Description: "Identify largest files and directories consuming space", Target: finding.ResourceID},
			{Order: 2, Description: "Clean up temporary files, logs, and caches"},
			{Order: 3, Description: "Remove unused packages and old kernels"},
			{Order: 4, Description: "Consider expanding disk or adding additional storage"},
		}
	}

	if strings.Contains(title, "memory") {
		return []aicontracts.RemediationStep{
			{Order: 1, Description: "Review memory allocation across workloads", Target: finding.ResourceID},
			{Order: 2, Description: "Reduce memory allocation on over-provisioned VMs"},
			{Order: 3, Description: "Add more physical memory to the host"},
		}
	}

	// Generic capacity steps
	return []aicontracts.RemediationStep{
		{Order: 1, Description: "Review current capacity utilization", Target: finding.ResourceID},
		{Order: 2, Description: "Identify growth trends and plan for expansion"},
		{Order: 3, Description: "Clean up unused resources to free capacity"},
	}
}

// generateAvailabilitySteps creates steps for availability issues
func (p *PatrolService) generateAvailabilitySteps(finding *Finding) []aicontracts.RemediationStep {
	title := strings.ToLower(finding.Title)

	if strings.Contains(title, "offline") || strings.Contains(title, "down") {
		return []aicontracts.RemediationStep{
			{Order: 1, Description: "Verify network connectivity to the resource", Target: finding.ResourceID},
			{Order: 2, Description: "Check host status if this is a VM/container"},
			{Order: 3, Description: "Review system logs for crash or shutdown reasons"},
			{Order: 4, Description: "Attempt to start or restart the resource"},
		}
	}

	if strings.Contains(title, "restart") || strings.Contains(title, "reboot") {
		return []aicontracts.RemediationStep{
			{Order: 1, Description: "Review system logs for cause of restarts", Target: finding.ResourceID},
			{Order: 2, Description: "Check for OOM kills or kernel panics"},
			{Order: 3, Description: "Investigate application crashes"},
			{Order: 4, Description: "Consider enabling watchdog or health checks"},
		}
	}

	// Generic availability steps
	return []aicontracts.RemediationStep{
		{Order: 1, Description: "Verify resource health and connectivity", Target: finding.ResourceID},
		{Order: 2, Description: "Review recent events and logs"},
		{Order: 3, Description: "Take corrective action to restore availability"},
	}
}

// generateBackupSteps creates steps for backup-related issues
func (p *PatrolService) generateBackupSteps(finding *Finding) []aicontracts.RemediationStep {
	title := strings.ToLower(finding.Title)

	if strings.Contains(title, "missing") || strings.Contains(title, "no backup") {
		return []aicontracts.RemediationStep{
			{Order: 1, Description: "Verify backup job configuration exists", Target: finding.ResourceID},
			{Order: 2, Description: "Check backup storage availability and capacity"},
			{Order: 3, Description: "Create or enable backup schedule"},
			{Order: 4, Description: "Run initial backup job"},
		}
	}

	if strings.Contains(title, "failed") || strings.Contains(title, "error") {
		return []aicontracts.RemediationStep{
			{Order: 1, Description: "Review backup job logs for error details", Target: finding.ResourceID},
			{Order: 2, Description: "Check backup storage connectivity and space"},
			{Order: 3, Description: "Verify backup credentials and permissions"},
			{Order: 4, Description: "Retry backup job after fixing issues"},
		}
	}

	if strings.Contains(title, "old") || strings.Contains(title, "stale") || strings.Contains(title, "outdated") {
		return []aicontracts.RemediationStep{
			{Order: 1, Description: "Check why scheduled backups are not running", Target: finding.ResourceID},
			{Order: 2, Description: "Review backup retention policy"},
			{Order: 3, Description: "Trigger a new backup immediately"},
		}
	}

	// Generic backup steps
	return []aicontracts.RemediationStep{
		{Order: 1, Description: "Review backup configuration and schedule", Target: finding.ResourceID},
		{Order: 2, Description: "Verify backup storage health"},
		{Order: 3, Description: "Ensure backup jobs are running successfully"},
	}
}

// generateConfigurationSteps creates steps for configuration issues
func (p *PatrolService) generateConfigurationSteps(finding *Finding) []aicontracts.RemediationStep {
	return []aicontracts.RemediationStep{
		{Order: 1, Description: "Review current configuration settings", Target: finding.ResourceID},
		{Order: 2, Description: "Compare against recommended best practices"},
		{Order: 3, Description: "Apply configuration changes as needed"},
		{Order: 4, Description: "Verify changes don't impact dependent services"},
	}
}

// generateSecuritySteps creates steps for security issues
func (p *PatrolService) generateSecuritySteps(finding *Finding) []aicontracts.RemediationStep {
	return []aicontracts.RemediationStep{
		{Order: 1, Description: "Assess the security impact and urgency", Target: finding.ResourceID},
		{Order: 2, Description: "Review access logs for suspicious activity"},
		{Order: 3, Description: "Apply security patches or configuration fixes"},
		{Order: 4, Description: "Verify remediation and update security policies"},
	}
}

// GetFindingsForResource returns active findings for a specific resource
func (p *PatrolService) GetFindingsForResource(resourceID string) []*Finding {
	findings := p.findings.GetByResource(resourceID)
	normalizeFindingResourceTypes(findings)
	return findings
}

// GetFindingsSummary returns a summary of all findings
func (p *PatrolService) GetFindingsSummary() FindingsSummary {
	return p.findings.GetSummary()
}

// ResolveFinding marks a finding as resolved with a resolution note
// This is called when the AI successfully fixes an issue
func (p *PatrolService) ResolveFinding(findingID string, resolutionNote string) error {
	if findingID == "" {
		return fmt.Errorf("finding ID is required")
	}

	// Get the finding first to update its resolution note
	finding := p.findings.Get(findingID)
	if finding == nil {
		return fmt.Errorf("finding not found: %s", findingID)
	}

	// Update the user note with the resolution
	finding.UserNote = resolutionNote

	// Mark as resolved (not auto-resolved since user/AI initiated it)
	if !p.findings.Resolve(findingID, false) {
		return fmt.Errorf("failed to resolve finding: %s", findingID)
	}

	p.mu.RLock()
	resolveUnified := p.unifiedFindingResolver
	p.mu.RUnlock()
	if resolveUnified != nil {
		resolveUnified(findingID)
	}

	log.Info().
		Str("finding_id", findingID).
		Str("resolution_note", resolutionNote).
		Msg("AI resolved finding")

	return nil
}

// DismissFinding dismisses a finding with a reason and note
// This is called when the AI determines the finding is not actually an issue
// Only the "not_an_issue" reason creates a permanent suppression rule; the other
// reasons acknowledge the finding while keeping it visible
func (p *PatrolService) DismissFinding(findingID string, reason string, note string) error {
	if findingID == "" {
		return fmt.Errorf("finding ID is required")
	}

	// Validate reason
	validReasons := map[string]bool{"not_an_issue": true, "expected_behavior": true, "will_fix_later": true}
	if !validReasons[reason] {
		return fmt.Errorf("invalid reason: %s", reason)
	}

	// Check that the finding exists
	finding := p.findings.Get(findingID)
	if finding == nil {
		return fmt.Errorf("finding not found: %s", findingID)
	}
	if patrolFindingUsesSyntheticRuntimeResource(finding) {
		return patrolRuntimeFindingManualActionError("dismissed")
	}

	// Dismiss the finding:
	// - "not_an_issue" creates permanent suppression (true false positive)
	// - "expected_behavior" and "will_fix_later" just acknowledge (stays visible but marked)
	if !p.findings.Dismiss(findingID, reason, note) {
		return fmt.Errorf("failed to dismiss finding: %s", findingID)
	}

	log.Info().
		Str("finding_id", findingID).
		Str("reason", reason).
		Str("note", note).
		Bool("permanently_suppressed", reason == "not_an_issue").
		Msg("AI dismissed finding")

	return nil
}

// GetRunHistory returns the history of patrol runs
// If limit is > 0, returns at most that many records
func (p *PatrolService) GetRunHistory(limit int) []PatrolRunRecord {
	if limit <= 0 {
		return p.runHistoryStore.GetAll()
	}
	return p.runHistoryStore.GetRecent(limit)
}

// GetRunByID returns a single patrol run from history.
func (p *PatrolService) GetRunByID(id string) (PatrolRunRecord, bool) {
	if strings.TrimSpace(id) == "" {
		return PatrolRunRecord{}, false
	}
	return p.runHistoryStore.GetByID(id)
}

// GetAllFindings returns all active findings sorted by severity
// Only returns critical and warning findings - watch/info are filtered out as noise
func (p *PatrolService) GetAllFindings() []*Finding {
	findings := p.findings.GetActive(FindingSeverityWarning)
	normalizeFindingResourceTypes(findings)

	// Sort by severity (critical first) then by time
	severityOrder := map[FindingSeverity]int{
		FindingSeverityCritical: 0,
		FindingSeverityWarning:  1,
		FindingSeverityWatch:    2,
		FindingSeverityInfo:     3,
	}

	sort.Slice(findings, func(i, j int) bool {
		if severityOrder[findings[i].Severity] != severityOrder[findings[j].Severity] {
			return severityOrder[findings[i].Severity] < severityOrder[findings[j].Severity]
		}
		return findings[i].DetectedAt.After(findings[j].DetectedAt)
	})

	return findings
}

func normalizeFindingResourceTypes(findings []*Finding) {
	for _, f := range findings {
		if f == nil {
			continue
		}
		if strings.TrimSpace(f.ResourceType) == "" {
			f.ResourceType = inferFindingResourceType(f.ResourceID, f.ResourceName)
			continue
		}
		if normalized := canonicalFindingResourceType(f.ResourceType); normalized != "" {
			f.ResourceType = normalized
			continue
		}
		f.ResourceType = inferFindingResourceType(f.ResourceID, f.ResourceName)
	}
}

// GetFindingsHistory returns all findings including resolved ones for history display
// Optionally filter by startTime
func (p *PatrolService) GetFindingsHistory(startTime *time.Time) []*Finding {
	findings := p.findings.GetAll(startTime)
	normalizeFindingResourceTypes(findings)

	// Sort by detected time (newest first)
	sort.Slice(findings, func(i, j int) bool {
		return findings[i].DetectedAt.After(findings[j].DetectedAt)
	})

	return findings
}

// ForcePatrol triggers an immediate patrol run.
// The run is detached from the caller's cancellation (context.WithoutCancel when a
// context is supplied, context.Background() otherwise) since it continues async
// after the HTTP response.
func (p *PatrolService) ForcePatrol(ctx context.Context) {
	runCtx := context.Background()
	if ctx != nil {
		runCtx = context.WithoutCancel(ctx)
	}
	go p.runPatrolWithTrigger(runCtx, TriggerReasonManual, nil)
}
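
// Note: context.WithoutCancel (Go 1.21+) keeps the caller's context values while
// dropping its cancellation and deadline, so an HTTP handler's timeout cannot
// abort a patrol run already in flight. A minimal illustration (hypothetical
// handler, not part of this file):
//
//	func handleForcePatrol(w http.ResponseWriter, r *http.Request, p *PatrolService) {
//		p.ForcePatrol(r.Context()) // returns immediately; the patrol keeps running
//		w.WriteHeader(http.StatusAccepted)
//	}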

// chatServiceExecutorAccessor is satisfied by *chat.Service, allowing patrol to
// access the executor without adding GetExecutor to the ChatServiceProvider interface.
type chatServiceExecutorAccessor interface {
	GetExecutor() *tools.PulseToolExecutor
}
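
// The accessor is consumed via a checked type assertion (see
// getExecutorForVerification below), so a chat service implementation that lacks
// GetExecutor degrades to an "executor unavailable" error instead of panicking:
//
//	if accessor, ok := cs.(chatServiceExecutorAccessor); ok {
//		executor := accessor.GetExecutor()
//		_ = executor // use for deterministic verification tool calls
//	}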

// patrolFindingCreatorAdapter implements tools.PatrolFindingCreator by wrapping
// the PatrolService's existing FindingsStore and recordFinding method.
type patrolFindingCreatorAdapter struct {
	patrol          *PatrolService
	snap            patrolRuntimeState
	findingsMu      sync.Mutex
	findings        []*Finding
	resolvedIDs     []string
	rejectedCount   int
	checkedFindings bool
}

func newPatrolFindingCreatorAdapterState(p *PatrolService, snap patrolRuntimeState) *patrolFindingCreatorAdapter {
	return &patrolFindingCreatorAdapter{
		patrol: p,
		snap:   snap,
	}
}

func (a *patrolFindingCreatorAdapter) CreateFinding(input tools.PatrolFindingInput) (string, bool, error) {
	// Map severity
	var sev FindingSeverity
	switch strings.ToLower(input.Severity) {
	case "critical":
		sev = FindingSeverityCritical
	case "warning":
		sev = FindingSeverityWarning
	case "watch":
		sev = FindingSeverityWatch
	default:
		sev = FindingSeverityInfo
	}

	// Map category
	var cat FindingCategory
	switch strings.ToLower(input.Category) {
	case "performance":
		cat = FindingCategoryPerformance
	case "capacity":
		cat = FindingCategoryCapacity
	case "reliability":
		cat = FindingCategoryReliability
	case "backup":
		cat = FindingCategoryBackup
	case "security":
		cat = FindingCategorySecurity
	default:
		cat = FindingCategoryGeneral
	}

	// Normalize key for stable dedup
	normalizedKey := normalizeFindingKey(input.Key)
	if normalizedKey == "" {
		normalizedKey = normalizeFindingKey(input.Title)
		if normalizedKey == "" {
			normalizedKey = "llm-finding"
		}
	}

	// Generate stable ID
	id := generateFindingID(input.ResourceID, string(cat), normalizedKey)

	finding := &Finding{
		ID:             id,
		Key:            normalizedKey,
		Severity:       sev,
		Category:       cat,
		ResourceID:     input.ResourceID,
		ResourceName:   input.ResourceName,
		ResourceType:   input.ResourceType,
		Title:          input.Title,
		Description:    input.Description,
		Recommendation: input.Recommendation,
		Evidence:       input.Evidence,
		Source:         "ai-analysis",
	}

	// Inline validation: check if finding is actionable against current metrics
	if !a.isActionable(finding) {
		// Determine which metric caused rejection for logging and metrics
		rejectedMetric := "unknown"
		keyLower := strings.ToLower(finding.Key)
		titleLower := strings.ToLower(finding.Title)
		if strings.Contains(keyLower, "cpu") || strings.Contains(titleLower, "cpu") {
			rejectedMetric = "cpu"
		} else if strings.Contains(keyLower, "memory") || strings.Contains(keyLower, "mem") || strings.Contains(titleLower, "memory") {
			rejectedMetric = "memory"
		} else if strings.Contains(keyLower, "disk") || strings.Contains(keyLower, "storage") || strings.Contains(titleLower, "disk") {
			rejectedMetric = "disk"
		}
		a.findingsMu.Lock()
		a.rejectedCount++
		a.findingsMu.Unlock()
		GetPatrolMetrics().RecordFindingRejected(input.ResourceType, rejectedMetric)
		log.Info().
			Str("finding_id", id).
			Str("title", input.Title).
			Str("resource", input.ResourceName).
			Str("resource_type", input.ResourceType).
			Str("rejected_metric", rejectedMetric).
			Msg("AI Patrol: Finding rejected by threshold validation")

		// Broadcast rejection to stream consumers
		a.patrol.broadcast(PatrolStreamEvent{
			Type:    "finding_rejected",
			Content: fmt.Sprintf("Finding rejected: %s on %s (metric %s below threshold)", input.Title, input.ResourceName, rejectedMetric),
		})

		return id, false, fmt.Errorf("finding rejected: metrics do not support this finding (below actionable thresholds)")
	}

	// Record finding via PatrolService
	isNew := a.patrol.recordFinding(finding)

	// Track for run stats
	a.findingsMu.Lock()
	a.findings = append(a.findings, finding)
	a.findingsMu.Unlock()

	return id, isNew, nil
}
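
// Worked example of the mapping above (input values illustrative, not from a
// real patrol run): Severity "Warning" maps to FindingSeverityWarning, Category
// "capacity" to FindingCategoryCapacity, and Key "Disk_Usage High" normalizes to
// "disk-usage-high"; the stable ID derived from (resource, category, key) then
// dedupes the same finding across patrol runs.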

// actionabilityThreshold returns the threshold below which a metric finding is rejected as noise.
// It reads user-configured PatrolThresholds (Watch level = lowest alarm tier) and falls back
// to hardcoded defaults (50/60/70) if the threshold is zero or unset.
// The resourceType parameter selects between node-level and guest-level thresholds where both exist.
func (a *patrolFindingCreatorAdapter) actionabilityThreshold(metric, resourceType string) float64 {
	a.patrol.mu.RLock()
	thresholds := a.patrol.thresholds
	a.patrol.mu.RUnlock()

	isNode := resourceType == "node"

	switch metric {
	case "cpu":
		// Only node-level CPU threshold exists; used for all resource types.
		if thresholds.NodeCPUWatch > 0 {
			return thresholds.NodeCPUWatch
		}
		return 50.0
	case "memory":
		if isNode {
			if thresholds.NodeMemWatch > 0 {
				return thresholds.NodeMemWatch
			}
		} else {
			if thresholds.GuestMemWatch > 0 {
				return thresholds.GuestMemWatch
			}
		}
		return 60.0
	case "disk":
		if thresholds.GuestDiskWatch > 0 {
			return thresholds.GuestDiskWatch
		}
		return 70.0
	case "storage":
		if thresholds.StorageWatch > 0 {
			return thresholds.StorageWatch
		}
		return 70.0
	default:
		return 50.0
	}
}
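
// Example resolution (threshold values illustrative): with
// PatrolThresholds{GuestMemWatch: 75} configured and GuestDiskWatch left unset,
//
//	a.actionabilityThreshold("memory", "vm") // -> 75 (user-configured Watch tier)
//	a.actionabilityThreshold("disk", "vm")   // -> 70 (hardcoded fallback)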

// isBaselineAnomaly checks if the given value is anomalously high compared to the learned
// baseline for this resource/metric. Returns true only for upward anomalies (rising above
// baseline), since dropping usage is not concerning. Returns false if baseline data is
// unavailable or insufficient.
func (a *patrolFindingCreatorAdapter) isBaselineAnomaly(resourceID, metric string, value float64) bool {
	a.patrol.mu.RLock()
	store := a.patrol.baselineStore
	a.patrol.mu.RUnlock()

	if store == nil {
		return false
	}

	severity, _, bl := store.CheckAnomaly(resourceID, metric, value)
	if severity == baseline.AnomalyNone || bl == nil {
		return false
	}

	// Only flag upward anomalies (value above baseline mean)
	return value > bl.Mean
}
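
// For example, a guest idling around a learned 20% CPU baseline that jumps to
// 45% can pass the second-chance check even though 45% is below the Watch
// threshold, while a drop from 20% to 5% never does, because only upward
// deviations count. (Numbers illustrative; the anomaly grading itself is
// delegated to the baseline store's CheckAnomaly.)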

// isActionable validates a finding against current metrics (inline version of the old
// validateAIFindings + isActionableFinding logic).
// Uses user-configured thresholds from PatrolThresholds and baseline anomaly detection
// as a second-chance check for findings below the threshold but statistically anomalous.
func (a *patrolFindingCreatorAdapter) isActionable(f *Finding) bool {
	resourceMetrics, hasInventory := a.actionabilityResourceMetrics()

	// Reject findings for resources that no longer exist in the current infrastructure.
	// Only enforce when we have state data (avoid rejecting during empty/error states).
	metrics, hasMetrics := resourceMetrics[f.ResourceID]
	if !hasMetrics {
		metrics, hasMetrics = resourceMetrics[f.ResourceName]
	}
	if !hasMetrics && hasInventory {
		// Resource not found — it may have been deleted. Reject the finding.
		return false
	}

	// Allow critical findings without metric threshold checks
	if f.Severity == FindingSeverityCritical {
		return true
	}
	// Allow backup and reliability findings without metric threshold checks
	if f.Category == FindingCategoryBackup || f.Category == FindingCategoryReliability {
		return true
	}

	if !hasMetrics {
		return true // empty state — benefit of doubt
	}

	key := strings.ToLower(f.Key)
	titleLower := strings.ToLower(f.Title)

	// CPU check
	if strings.Contains(key, "cpu") || strings.Contains(titleLower, "cpu") {
		if cpu, ok := metrics["cpu"]; ok && cpu < a.actionabilityThreshold("cpu", f.ResourceType) {
			// Below threshold — check if anomalous (statistically unusual spike)
			if a.isBaselineAnomaly(f.ResourceID, "cpu", cpu) {
				return true
			}
			return false
		}
	}
	// Memory check
	if strings.Contains(key, "memory") || strings.Contains(key, "mem") || strings.Contains(titleLower, "memory") {
		if mem, ok := metrics["memory"]; ok && mem < a.actionabilityThreshold("memory", f.ResourceType) {
			if a.isBaselineAnomaly(f.ResourceID, "memory", mem) {
				return true
			}
			return false
		}
	}
	// Disk/storage check
	if strings.Contains(key, "disk") || strings.Contains(key, "storage") || strings.Contains(titleLower, "disk") {
		if disk, ok := metrics["disk"]; ok && disk < a.actionabilityThreshold("disk", f.ResourceType) {
			if a.isBaselineAnomaly(f.ResourceID, "disk", disk) {
				return true
			}
			return false
		}
		if usage, ok := metrics["usage"]; ok && usage < a.actionabilityThreshold("storage", f.ResourceType) {
			if a.isBaselineAnomaly(f.ResourceID, "storage", usage) {
				return true
			}
			return false
		}
	}

	return true
}

func (a *patrolFindingCreatorAdapter) actionabilityResourceMetrics() (map[string]map[string]float64, bool) {
	return patrolActionabilityResourceMetrics(a.snap)
}

func (a *patrolFindingCreatorAdapter) ResolveFinding(findingID, reason string) error {
	scopedResources := patrolRuntimeKnownResources(a.snap)
	if len(scopedResources) > 0 {
		finding := a.patrol.findings.Get(findingID)
		if finding == nil {
			return fmt.Errorf("finding %s not found or already resolved", findingID)
		}
		if !scopedResources[finding.ResourceID] && !scopedResources[finding.ResourceName] {
			return fmt.Errorf("finding %s is outside the current patrol scope", findingID)
		}
	}

	resolved := a.patrol.findings.Resolve(findingID, true)
	if !resolved {
		return fmt.Errorf("finding %s not found or already resolved", findingID)
	}

	// Notify unified store
	a.patrol.mu.RLock()
	resolveUnified := a.patrol.unifiedFindingResolver
	a.patrol.mu.RUnlock()
	if resolveUnified != nil {
		resolveUnified(findingID)
	}

	a.findingsMu.Lock()
	a.resolvedIDs = append(a.resolvedIDs, findingID)
	a.findingsMu.Unlock()

	log.Info().
		Str("finding_id", findingID).
		Str("reason", reason).
		Msg("AI Patrol: Finding resolved via patrol tool")
	return nil
}

func (a *patrolFindingCreatorAdapter) GetActiveFindings(resourceID, minSeverity string) []tools.PatrolFindingInfo {
	a.findingsMu.Lock()
	a.checkedFindings = true
	a.findingsMu.Unlock()

	var minSev FindingSeverity
	switch strings.ToLower(minSeverity) {
	case "critical":
		minSev = FindingSeverityCritical
	case "warning":
		minSev = FindingSeverityWarning
	case "watch":
		minSev = FindingSeverityWatch
	default:
		minSev = FindingSeverityInfo
	}

	active := a.patrol.findings.GetActive(minSev)
	scopedResources := patrolRuntimeKnownResources(a.snap)
	var result []tools.PatrolFindingInfo
	for _, f := range active {
		if resourceID != "" && f.ResourceID != resourceID && f.ResourceName != resourceID {
			continue
		}
		if len(scopedResources) > 0 && !scopedResources[f.ResourceID] && !scopedResources[f.ResourceName] {
			continue
		}
		result = append(result, tools.PatrolFindingInfo{
			ID:           f.ID,
			Key:          f.Key,
			Severity:     string(f.Severity),
			Category:     string(f.Category),
			ResourceID:   f.ResourceID,
			ResourceName: f.ResourceName,
			ResourceType: f.ResourceType,
			Title:        f.Title,
			Description:  f.Description,
			DetectedAt:   f.DetectedAt.Format("2006-01-02 15:04"),
		})
	}
	return result
}

func (a *patrolFindingCreatorAdapter) HasCheckedFindings() bool {
	a.findingsMu.Lock()
	defer a.findingsMu.Unlock()
	return a.checkedFindings
}

// getCollectedFindings returns all findings created during this patrol run.
func (a *patrolFindingCreatorAdapter) getCollectedFindings() []*Finding {
	a.findingsMu.Lock()
	defer a.findingsMu.Unlock()
	result := make([]*Finding, len(a.findings))
	copy(result, a.findings)
	return result
}

// getResolvedCount returns the number of findings resolved during this patrol run.
func (a *patrolFindingCreatorAdapter) getResolvedCount() int {
	a.findingsMu.Lock()
	defer a.findingsMu.Unlock()
	return len(a.resolvedIDs)
}

// getReportedFindingIDs returns the IDs of all findings created/re-reported this run.
func (a *patrolFindingCreatorAdapter) getReportedFindingIDs() []string {
	a.findingsMu.Lock()
	defer a.findingsMu.Unlock()
	ids := make([]string, len(a.findings))
	for i, f := range a.findings {
		ids[i] = f.ID
	}
	return ids
}

// getResolvedIDs returns the IDs of findings explicitly resolved by the LLM this run.
func (a *patrolFindingCreatorAdapter) getResolvedIDs() []string {
	a.findingsMu.Lock()
	defer a.findingsMu.Unlock()
	result := make([]string, len(a.resolvedIDs))
	copy(result, a.resolvedIDs)
	return result
}

func normalizeFindingKey(key string) string {
	if key == "" {
		return ""
	}
	key = strings.TrimSpace(strings.ToLower(key))
	if key == "" {
		return ""
	}
	key = strings.ReplaceAll(key, "_", "-")
	key = strings.ReplaceAll(key, " ", "-")
	var b strings.Builder
	for _, r := range key {
		if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '-' {
			b.WriteRune(r)
		}
	}
	return strings.Trim(b.String(), "-")
}
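
// Normalization walk-through (illustrative input): "CPU_High Usage!" lowercases
// to "cpu_high usage!", underscore and space replacement gives "cpu-high-usage!",
// the character filter drops the '!', and the final trim leaves "cpu-high-usage".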

// recoverStuckInvestigations detects findings stuck in "running" state for longer than
// the investigation timeout and resets them to "failed/timed_out" so they can be retried.
// This handles the case where an investigation goroutine panics or is killed without
// properly updating the finding status.
func (p *PatrolService) recoverStuckInvestigations() {
	if p.findings == nil {
		return
	}
	const stuckThreshold = 15 * time.Minute // investigation timeout is 10min; allow 5min grace
	active := p.findings.GetActive(FindingSeverityWarning)
	recovered := 0
	for _, f := range active {
		if f.InvestigationStatus != string(InvestigationStatusRunning) {
			continue
		}
		if f.LastInvestigatedAt == nil {
			continue
		}
		if time.Since(*f.LastInvestigatedAt) < stuckThreshold {
			continue
		}
		// This finding has been "running" for too long — reset it
		p.findings.UpdateInvestigation(
			f.ID,
			f.InvestigationSessionID,
			string(InvestigationStatusFailed),
			string(InvestigationOutcomeTimedOut),
			f.LastInvestigatedAt,
			f.InvestigationAttempts,
		)
		recovered++
		log.Warn().
			Str("finding_id", f.ID).
			Str("resource", f.ResourceName).
			Time("last_investigated", *f.LastInvestigatedAt).
			Msg("AI Patrol: Recovered stuck investigation (exceeded timeout)")
	}
	if recovered > 0 {
		log.Info().Int("recovered", recovered).
			Msg("AI Patrol: Recovered stuck investigations")
	}
}

// retryTimedOutInvestigations re-triggers investigation for findings that failed due to timeout.
// Called at the end of each patrol run to give timed-out investigations another chance
// without waiting for the full 1-hour cooldown.
func (p *PatrolService) retryTimedOutInvestigations() {
	if p.findings == nil {
		return
	}
	active := p.findings.GetActive(FindingSeverityWarning)
	retried := 0
	for _, f := range active {
		if f.InvestigationStatus != string(InvestigationStatusFailed) {
			continue
		}
		if f.InvestigationOutcome != string(InvestigationOutcomeTimedOut) {
			continue
		}
		p.MaybeInvestigateFinding(f)
		retried++
	}
	if retried > 0 {
		log.Info().Int("retried", retried).
			Msg("AI Patrol: Retried timed-out investigations")
	}
}

// MaybeInvestigateFinding checks if a finding should be investigated and triggers investigation if so
// This is called both during scheduled patrol runs and when alert-triggered findings are created
func (p *PatrolService) MaybeInvestigateFinding(f *Finding) {
	p.mu.RLock()
	orchestrator := p.investigationOrchestrator
	aiService := p.aiService
	p.mu.RUnlock()

	// No orchestrator configured
	if orchestrator == nil {
		return
	}

	// Get autonomy level from AI config
	if aiService == nil {
		return
	}
	if aiService.GetConfig() == nil {
		return
	}
	autonomyLevel := aiService.GetEffectivePatrolAutonomyLevel()

	// Check if finding should be investigated
	if !f.ShouldInvestigate(autonomyLevel) {
		return
	}

	// Check if we can start another investigation (concurrency limit)
	if !orchestrator.CanStartInvestigation() {
		log.Debug().
			Str("finding_id", f.ID).
			Msg("Cannot start investigation: max concurrent investigations reached")
		return
	}

	// Convert Finding to shared finding type for the investigation orchestrator
	invFinding := f.ToCoreFinding()

	// Trigger investigation in background with a timeout to prevent indefinite runs.
	// Track with WaitGroup so graceful shutdown can wait for completion.
	p.investigationWg.Add(1)
	go func() {
		defer p.investigationWg.Done()

		// Re-read autonomy level at execution time to avoid using a stale value
		// captured before the goroutine was scheduled.
		currentCfg := aiService.GetConfig()
		if currentCfg == nil {
			log.Warn().Str("finding_id", f.ID).Msg("AI config unavailable at investigation start, aborting")
			return
		}
		currentAutonomy := aiService.GetEffectivePatrolAutonomyLevel()

		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
		defer cancel()
		if err := orchestrator.InvestigateFinding(ctx, invFinding, currentAutonomy); err != nil {
			log.Error().
				Err(err).
				Str("finding_id", f.ID).
				Msg("Failed to start investigation")
			return
		}

		// The orchestrator updates the patrol findings store; sync the latest state to the unified store.
		// This makes fix verification and resolution visible as an actual closed loop in the UI.
		var pushUnified UnifiedFindingCallback
		var resolveUnified func(string)
		var pushCb PushNotifyCallback
		p.mu.RLock()
		pushUnified = p.unifiedFindingCallback
		resolveUnified = p.unifiedFindingResolver
		pushCb = p.pushNotifyCallback
		p.mu.RUnlock()
		if latest := p.findings.Get(f.ID); latest != nil {
			if pushUnified != nil {
				pushUnified(latest)
			}
			if latest.ResolvedAt != nil && resolveUnified != nil {
				resolveUnified(latest.ID)
			}

			// Send push notifications for investigation outcomes
			if pushCb != nil {
				switch latest.InvestigationOutcome {
				case string(InvestigationOutcomeFixQueued):
					approvalID := ""
					riskLevel := ""
					if orchestrator != nil {
						if inv := orchestrator.GetInvestigationByFinding(latest.ID); inv != nil {
							approvalID = inv.ApprovalID
							if inv.ProposedFix != nil {
								riskLevel = inv.ProposedFix.RiskLevel
							}
						}
					}
					if approvalID == "" {
						log.Warn().
							Str("finding_id", latest.ID).
							Str("investigation_session_id", latest.InvestigationSessionID).
							Msg("Investigation queued for approval but approval ID missing")
					}
					pushCb(relay.NewApprovalRequestNotification(
						approvalID,
						latest.Title,
						riskLevel,
					))
				case string(InvestigationOutcomeFixExecuted), string(InvestigationOutcomeFixVerified):
					pushCb(relay.NewFixCompletedNotification(latest.ID, latest.Title, true))
				case string(InvestigationOutcomeFixFailed), string(InvestigationOutcomeFixVerificationFailed):
					pushCb(relay.NewFixCompletedNotification(latest.ID, latest.Title, false))
				}
			}
		}

		// Investigation finished successfully. If it produced a proposed fix, persist a
		// remediation plan artifact so the user can review and execute later.
		p.generateRemediationPlanFromInvestigation(f.ID)
	}()

	log.Info().
		Str("finding_id", f.ID).
		Str("severity", string(f.Severity)).
		Str("resource", f.ResourceName).
		Str("autonomy_level", autonomyLevel).
		Msg("Triggered autonomous investigation for finding")
}

// VerifyFixResolved runs a lightweight scoped patrol to check if the issue
// identified by the given finding has been resolved after a fix was executed.
// It bypasses tryStartRun (the patrol mutex) because verification runs inline
// within the investigation goroutine.
func (p *PatrolService) VerifyFixResolved(ctx context.Context, resourceID, resourceType, findingKey, findingID string) (bool, error) {
	if p == nil || !p.hasPatrolRuntimeInputs() {
		return false, fmt.Errorf("%w: no patrol runtime state available", aicontracts.ErrVerificationUnknown)
	}
	if ctx == nil {
		ctx = context.Background()
	}

	startTime := time.Now()

	// Prefer canonical finding details from store when available.
	var finding *Finding
	if p.findings != nil && findingID != "" {
		finding = p.findings.Get(findingID)
	}
	if finding != nil {
		if resourceID == "" {
			resourceID = finding.ResourceID
		}
		if resourceType == "" {
			resourceType = finding.ResourceType
		}
		if findingKey == "" {
			findingKey = finding.Key
		}
	}

	log.Info().
		Str("finding_id", findingID).
		Str("resource_id", resourceID).
		Str("key", findingKey).
		Msg("Running deterministic verification to confirm fix")

	verified, verifyErr := p.verifyFixDeterministically(ctx, finding, resourceID, resourceType, findingKey, findingID)

	endTime := time.Now()
	duration := endTime.Sub(startTime)

	// Persist a verification run record for debugging and user transparency.
	status := "healthy"
	summary := "Verification: issue resolved"
	if verifyErr != nil {
		status = "error"
		summary = fmt.Sprintf("Verification inconclusive: %v", verifyErr)
	} else if !verified {
		status = "issues_found"
		summary = "Verification: issue still present"
	}

	verifyRecord := PatrolRunRecord{
		ID:                        fmt.Sprintf("%d", startTime.UnixNano()),
		StartedAt:                 startTime,
		CompletedAt:               endTime,
		Duration:                  duration,
		DurationMs:                duration.Milliseconds(),
		Type:                      "verification",
		TriggerReason:             string(TriggerReasonVerification),
		ScopeResourceIDs:          []string{resourceID},
		EffectiveScopeResourceIDs: []string{resourceID},
		ScopeResourceTypes:        []string{resourceType},
		ScopeContext:              fmt.Sprintf("Verifying fix for finding: %s", findingID),
		FindingID:                 findingID,
		ResourcesChecked:          1,
		NewFindings:               0,
		FindingsSummary:           summary,
		Status:                    status,
	}
	if strings.TrimSpace(resourceID) == "" {
		verifyRecord.ScopeResourceIDs = nil
		verifyRecord.EffectiveScopeResourceIDs = nil
		verifyRecord.ResourcesChecked = 0
	}
	if strings.TrimSpace(resourceType) == "" {
		verifyRecord.ScopeResourceTypes = nil
	}
	if verifyErr != nil {
		verifyRecord.ErrorCount = 1
	}
	if p.runHistoryStore != nil {
		p.runHistoryStore.Add(verifyRecord)
	}

	p.mu.Lock()
	p.lastActivity = endTime
	p.lastDuration = duration
	p.resourcesChecked = verifyRecord.ResourcesChecked
	p.errorCount = verifyRecord.ErrorCount
	p.mu.Unlock()

	return verified, verifyErr
}

func (p *PatrolService) verifyFixDeterministically(
	ctx context.Context,
	finding *Finding,
	resourceID, resourceType, findingKey, findingID string,
) (bool, error) {
	key := normalizeFindingKey(findingKey)
	if key == "" {
		return false, fmt.Errorf("%w: missing finding key", aicontracts.ErrVerificationUnknown)
	}

	// State-only verifiers (no tools required).
	fullState := p.currentPatrolRuntimeState()
	switch key {
	case "backup-stale":
		ok, err := verifyBackupFreshState(fullState, resourceID)
		if err != nil {
			return false, err
		}
		return ok, nil
	case "cpu-high", "memory-high", "disk-high":
		ok, err := verifyMetricRecoveredState(fullState, p.thresholds, key, resourceID, resourceType)
		if err != nil {
			return false, err
		}
		return ok, nil
	case "guest-unreachable":
		ok, err := p.verifyGuestReachabilityState(ctx, fullState, resourceID)
		if err != nil {
			return false, err
		}
		return ok, nil
	}

	// Tool-based verifiers (deterministic tool calls + deterministic signal parsing).
	executor, execErr := p.getExecutorForVerification()
	if execErr != nil {
		return false, execErr
	}

	p.mu.RLock()
	sigThresholds := SignalThresholdsFromPatrol(p.thresholds)
	p.mu.RUnlock()

	switch key {
	case "smart-failure":
		node := strings.TrimSpace(resourceID)
		device := ""
		if finding != nil {
			device = strings.TrimSpace(finding.ResourceName)
		}
		return p.verifyBySignals(ctx, executor, sigThresholds, key, node, device)
	case "backup-failed":
		guestID := strings.TrimSpace(resourceID)
		return p.verifyBySignals(ctx, executor, sigThresholds, key, guestID, "")
	default:
		return false, fmt.Errorf("%w: no deterministic verifier for key=%q (finding_id=%s)", aicontracts.ErrVerificationUnknown, key, findingID)
	}
}

func (p *PatrolService) getExecutorForVerification() (*tools.PulseToolExecutor, error) {
	if p == nil || p.aiService == nil {
		return nil, fmt.Errorf("%w: AI service unavailable", aicontracts.ErrVerificationUnknown)
	}
	cs := p.aiService.GetChatService()
	if cs == nil {
		return nil, fmt.Errorf("%w: chat service unavailable", aicontracts.ErrVerificationUnknown)
	}
	executorAccessor, ok := cs.(chatServiceExecutorAccessor)
	if !ok {
		return nil, fmt.Errorf("%w: chat service does not expose tool executor", aicontracts.ErrVerificationUnknown)
	}
	exec := executorAccessor.GetExecutor()
	if exec == nil {
		return nil, fmt.Errorf("%w: tool executor unavailable", aicontracts.ErrVerificationUnknown)
	}
	return exec, nil
}

func (p *PatrolService) verifyBySignals(
	ctx context.Context,
	executor *tools.PulseToolExecutor,
	thresholds SignalThresholds,
	findingKey string,
	resourceID string,
	resourceName string,
) (bool, error) {
	if executor == nil {
		return false, fmt.Errorf("%w: tool executor unavailable", aicontracts.ErrVerificationUnknown)
	}

	var toolName string
	args := map[string]interface{}{}
	switch findingKey {
	case "smart-failure":
		toolName = "pulse_storage"
		args = map[string]interface{}{"type": "disk_health"}
		if strings.TrimSpace(resourceID) != "" {
			args["node"] = resourceID
		}
	case "backup-failed":
		toolName = "pulse_storage"
		args = map[string]interface{}{"type": "backup_tasks"}
		if strings.TrimSpace(resourceID) != "" {
			args["guest_id"] = resourceID
		}
	default:
		return false, fmt.Errorf("%w: unhandled signal verifier key=%q", aicontracts.ErrVerificationUnknown, findingKey)
	}

	tc, err := executeToolCall(ctx, executor, toolName, args)
	if err != nil {
		return false, err
	}

	signals := DetectSignals([]ToolCallRecord{tc}, thresholds)
	persisting := false
	for _, s := range signals {
		switch findingKey {
		case "smart-failure":
			if s.SignalType == SignalSMARTFailure {
				if resourceName == "" || strings.TrimSpace(strings.ToLower(s.ResourceName)) == strings.TrimSpace(strings.ToLower(resourceName)) {
					persisting = true
				}
			}
		case "backup-failed":
			if s.SignalType == SignalBackupFailed && (resourceID == "" || s.ResourceID == resourceID) {
				persisting = true
			}
		}
	}
	if persisting {
		return false, nil
	}
	return true, nil
}

func executeToolCall(ctx context.Context, executor *tools.PulseToolExecutor, toolName string, args map[string]interface{}) (ToolCallRecord, error) {
	if executor == nil {
		return ToolCallRecord{}, fmt.Errorf("%w: tool executor unavailable", aicontracts.ErrVerificationUnknown)
	}
	if toolName == "" {
		return ToolCallRecord{}, fmt.Errorf("%w: missing tool name", aicontracts.ErrVerificationUnknown)
	}
	if args == nil {
		args = map[string]interface{}{}
	}
	inputBytes, _ := json.Marshal(args)
	inputStr := string(inputBytes)
	start := time.Now().UnixMilli()

	result, execErr := executor.ExecuteTool(ctx, toolName, args)
	output := ""
	success := false
	if execErr != nil {
		output = execErr.Error()
	} else {
		output = formatToolResult(result)
		success = !result.IsError
	}
	end := time.Now().UnixMilli()

	if execErr != nil {
		return ToolCallRecord{}, fmt.Errorf("%w: tool execution failed (%s): %v", aicontracts.ErrVerificationUnknown, toolName, execErr)
	}
	if result.IsError {
		return ToolCallRecord{}, fmt.Errorf("%w: tool returned error (%s): %s", aicontracts.ErrVerificationUnknown, toolName, output)
	}
	// Most verification probes rely on parsing structured JSON outputs. If we receive
	// non-JSON text, treat verification as inconclusive rather than "resolved".
	if toolName == "pulse_storage" || toolName == "pulse_metrics" || toolName == "pulse_alerts" {
		if !isValidJSON(output) {
			return ToolCallRecord{}, fmt.Errorf("%w: tool returned non-JSON output (%s)", aicontracts.ErrVerificationUnknown, toolName)
		}
	}

	return ToolCallRecord{
		ID:        fmt.Sprintf("verify-%d", time.Now().UnixNano()),
		ToolName:  toolName,
		Input:     truncateString(inputStr, MaxToolInputSize),
		Output:    output,
		Success:   success,
		StartTime: start,
		EndTime:   end,
		Duration:  end - start,
	}, nil
}
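
// Typical call site (mirroring verifyBySignals above): the returned record feeds
// straight into signal detection, and any error keeps verification inconclusive
// rather than falsely "resolved".
//
//	tc, err := executeToolCall(ctx, executor, "pulse_storage",
//		map[string]interface{}{"type": "disk_health"})
//	if err != nil {
//		return false, err
//	}
//	signals := DetectSignals([]ToolCallRecord{tc}, thresholds)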

func isValidJSON(s string) bool {
	trimmed := strings.TrimSpace(s)
	if trimmed == "" {
		return false
	}
	if trimmed[0] != '{' && trimmed[0] != '[' {
		return false
	}
	var v interface{}
	return json.Unmarshal([]byte(trimmed), &v) == nil
}
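
// isValidJSON accepts only documents rooted at an object or array: `{"disks":[]}`
// and `[1,2]` pass, while bare scalars such as `42` or `"ok"` and plain-text
// error messages fail, which is what forces non-JSON tool output above to be
// treated as inconclusive.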

func verifyBackupFreshState(snap patrolRuntimeState, guestID string) (bool, error) {
	vmID := strings.TrimSpace(guestID)
	if vmID == "" {
		return false, fmt.Errorf("%w: missing guest id", aicontracts.ErrVerificationUnknown)
	}

	now := time.Now()
	details, ok := patrolLookupGuestRuntimeDetails(snap, vmID)
	if !ok || details.lastBackup.IsZero() {
		// If the guest cannot be found, verification can't be concluded deterministically.
		return false, fmt.Errorf("%w: guest not found for backup verification (%s)", aicontracts.ErrVerificationUnknown, vmID)
	}

	if now.Sub(details.lastBackup) <= 48*time.Hour {
		return true, nil
	}
	return false, nil
}
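
// The freshness window is a fixed 48 hours: a guest whose last backup completed
// 30 hours ago verifies as resolved, one at 50 hours does not, and a guest with
// no recorded backup at all is reported as inconclusive rather than stale.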
// verifyMetricRecoveredState checks whether the metric behind a finding has
// dropped back below its warning threshold (with a small hysteresis margin)
// for the given resource and type. Unknown resources yield ErrVerificationUnknown.
func verifyMetricRecoveredState(snap patrolRuntimeState, thresholds PatrolThresholds, key, resourceID, resourceType string) (bool, error) {
	rid := strings.TrimSpace(resourceID)
	if rid == "" {
		return false, fmt.Errorf("%w: missing resource id", aicontracts.ErrVerificationUnknown)
	}

	// Use a small margin to avoid flapping around exact thresholds.
	const margin = 0.95
	metrics, ok := patrolLookupResourceMetricsForType(snap, rid, resourceType)
	if ok {
		switch key {
		case "cpu-high":
			if value, exists := metrics["cpu"]; exists {
				return value < thresholds.NodeCPUWarning*margin, nil
			}
		case "memory-high":
			value, exists := metrics["memory"]
			if !exists {
				break
			}
			if resourceType == "node" {
				return value < thresholds.NodeMemWarning*margin, nil
			}
			return value < thresholds.GuestMemWarning*margin, nil
		case "disk-high":
			if resourceType == "physical_disk" {
				if disk, exists := patrolLookupPhysicalDiskVerificationState(snap, rid); exists {
					if disk.health != "" && !strings.EqualFold(disk.health, "PASSED") && !strings.EqualFold(disk.health, "UNKNOWN") && !strings.EqualFold(disk.health, "OK") {
						return false, nil
					}
					if disk.wearout >= 0 && disk.wearout < 20 {
						return false, nil
					}
					if disk.temperature > 55 {
						return false, nil
					}
					return true, nil
				}
				break
			}
			if resourceType == "storage" {
				if value, exists := metrics["usage"]; exists {
					return value < thresholds.StorageWarning*margin, nil
				}
				break
			}
			if value, exists := metrics["disk"]; exists {
				return value < thresholds.GuestDiskWarn*margin, nil
			}
		}
	}

	// If we can't locate the resource, verification is inconclusive.
	return false, fmt.Errorf("%w: resource not found for metric verification (%s)", aicontracts.ErrVerificationUnknown, rid)
}

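// exampleRecoveryMarginSketch is an illustrative sketch (hypothetical
// threshold values; not called anywhere in Patrol) of the 0.95 margin used
// above: a metric must fall below 95% of its warning threshold before the
// finding counts as recovered, which damps flapping right at the threshold.
func exampleRecoveryMarginSketch() {
	const warn = 80.0   // hypothetical warning threshold (percent)
	const margin = 0.95 // same margin as verifyMetricRecoveredState
	for _, cpu := range []float64{82.0, 79.0, 75.0} {
		// Recovery boundary sits at 76.0, so 79.0 is below warn yet not recovered.
		fmt.Printf("cpu=%.1f recovered=%v\n", cpu, cpu < warn*margin)
	}
}
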
type patrolPhysicalDiskVerification struct {
	health      string
	wearout     int
	temperature int
}

type patrolPhysicalDiskVisitor func(identifiers []string, verification patrolPhysicalDiskVerification) bool

func patrolLookupPhysicalDiskVerificationState(snap patrolRuntimeState, resourceID string) (patrolPhysicalDiskVerification, bool) {
	return patrolLookupPhysicalDiskVerificationWithVisitor(resourceID, func(visit patrolPhysicalDiskVisitor) bool {
		return patrolVisitPhysicalDiskVerification(snap, visit)
	})
}

// verifyGuestReachabilityState asks the configured guest prober to ping the
// guest's recorded IP via an agent on its host node. Missing prerequisites
// (prober, node, IP, agent, or ping result) yield ErrVerificationUnknown.
func (p *PatrolService) verifyGuestReachabilityState(ctx context.Context, snap patrolRuntimeState, guestID string) (bool, error) {
	p.mu.RLock()
	prober := p.guestProber
	p.mu.RUnlock()
	if prober == nil {
		return false, fmt.Errorf("%w: guest prober not configured", aicontracts.ErrVerificationUnknown)
	}

	vmID := strings.TrimSpace(guestID)
	if vmID == "" {
		return false, fmt.Errorf("%w: missing guest id", aicontracts.ErrVerificationUnknown)
	}

	details, ok := patrolLookupGuestRuntimeDetails(snap, vmID)
	if !ok || details.node == "" || details.ip == "" {
		return false, fmt.Errorf("%w: missing node/ip for guest reachability verification (guest=%s)", aicontracts.ErrVerificationUnknown, vmID)
	}

	agentID, ok := prober.GetAgentForHost(details.node)
	if !ok || strings.TrimSpace(agentID) == "" {
		return false, fmt.Errorf("%w: no agent available for host %s", aicontracts.ErrVerificationUnknown, details.node)
	}

	results, err := prober.PingGuests(ctx, agentID, []string{details.ip})
	if err != nil {
		return false, fmt.Errorf("%w: reachability probe failed: %v", aicontracts.ErrVerificationUnknown, err)
	}
	if res, ok := results[details.ip]; ok {
		if res.Reachable {
			return true, nil
		}
		return false, nil
	}
	return false, fmt.Errorf("%w: missing ping result for %s", aicontracts.ErrVerificationUnknown, details.ip)
}

type patrolGuestRuntimeDetails struct {
	lastBackup time.Time
	node       string
	ip         string
}

type patrolGuestRuntimeDetailsVisitor func(identifiers []string, details patrolGuestRuntimeDetails) bool

type patrolMetricVisitor func(identifiers []string, metrics map[string]float64) bool

func patrolActionabilityResourceMetrics(snap patrolRuntimeState) (map[string]map[string]float64, bool) {
	resourceMetrics := make(map[string]map[string]float64)
	hasInventory := patrolVisitMetrics(snap, func(identifiers []string, metrics map[string]float64) bool {
		patrolRegisterResourceMetrics(resourceMetrics, metrics, identifiers...)
		return true
	})
	return patrolAugmentActionabilityMetricsWithPhysicalDisks(resourceMetrics, snap), hasInventory
}

func patrolAugmentActionabilityMetricsWithPhysicalDisks(dest map[string]map[string]float64, snap patrolRuntimeState) map[string]map[string]float64 {
	if dest == nil {
		dest = make(map[string]map[string]float64)
	}
	for _, disk := range patrolPhysicalDiskRows(snap, nil) {
		patrolRegisterResourceMetrics(dest, map[string]float64{}, disk.id, disk.name, disk.devPath, disk.model)
	}
	return dest
}

func patrolLookupGuestRuntimeDetails(snap patrolRuntimeState, guestID string) (patrolGuestRuntimeDetails, bool) {
	return patrolLookupGuestRuntimeDetailsWithVisitor(guestID, func(visit patrolGuestRuntimeDetailsVisitor) bool {
		return patrolVisitGuestRuntimeDetails(snap, visit)
	})
}

func patrolLookupResourceMetrics(snap patrolRuntimeState, resourceID string) (map[string]float64, bool) {
	return patrolLookupMetricsWithVisitor(resourceID, func(visit patrolMetricVisitor) bool {
		return patrolVisitMetrics(snap, visit)
	})
}

func patrolLookupResourceMetricsForType(snap patrolRuntimeState, resourceID, resourceType string) (map[string]float64, bool) {
	switch strings.ToLower(strings.TrimSpace(resourceType)) {
	case "node", "agent":
		return patrolLookupMetricsWithVisitor(resourceID, func(visit patrolMetricVisitor) bool {
			return patrolVisitNodeMetrics(snap, visit)
		})
	case "vm":
		return patrolLookupMetricsWithVisitor(resourceID, func(visit patrolMetricVisitor) bool {
			return patrolVisitGuestMetrics(snap, "VM", visit)
		})
	case "container", "system-container":
		return patrolLookupMetricsWithVisitor(resourceID, func(visit patrolMetricVisitor) bool {
			return patrolVisitGuestMetrics(snap, "Container", visit)
		})
	case "storage":
		return patrolLookupMetricsWithVisitor(resourceID, func(visit patrolMetricVisitor) bool {
			return patrolVisitStorageMetrics(snap, visit)
		})
	case "physical_disk":
		if metrics, ok := patrolLookupPhysicalDiskMetricsState(snap, resourceID); ok {
			return metrics, true
		}
		return nil, false
	default:
		return patrolLookupResourceMetrics(snap, resourceID)
	}
}

func patrolLookupPhysicalDiskMetricsState(snap patrolRuntimeState, resourceID string) (map[string]float64, bool) {
	if _, ok := patrolLookupPhysicalDiskVerificationState(snap, resourceID); ok {
		return map[string]float64{}, true
	}
	return nil, false
}

func patrolLookupPhysicalDiskVerificationWithVisitor(resourceID string, walk func(patrolPhysicalDiskVisitor) bool) (patrolPhysicalDiskVerification, bool) {
	found := false
	var result patrolPhysicalDiskVerification
	walk(func(identifiers []string, verification patrolPhysicalDiskVerification) bool {
		for _, identifier := range identifiers {
			if strings.TrimSpace(identifier) != strings.TrimSpace(resourceID) {
				continue
			}
			result = verification
			found = true
			return false
		}
		return true
	})
	return result, found
}

func patrolVisitPhysicalDiskVerification(snap patrolRuntimeState, visit patrolPhysicalDiskVisitor) bool {
	rows := patrolPhysicalDiskRows(snap, nil)
	for _, disk := range rows {
		if !visit([]string{disk.id, disk.name, disk.devPath, disk.model}, patrolPhysicalDiskVerification{
			health:      strings.TrimSpace(disk.health),
			wearout:     disk.wearout,
			temperature: disk.temperature,
		}) {
			return true
		}
	}
	return len(rows) > 0
}

func patrolLookupGuestRuntimeDetailsWithVisitor(guestID string, walk func(patrolGuestRuntimeDetailsVisitor) bool) (patrolGuestRuntimeDetails, bool) {
	found := false
	var result patrolGuestRuntimeDetails
	walk(func(identifiers []string, details patrolGuestRuntimeDetails) bool {
		for _, identifier := range identifiers {
			if strings.TrimSpace(identifier) != strings.TrimSpace(guestID) {
				continue
			}
			result = details
			found = true
			return false
		}
		return true
	})
	return result, found
}

func patrolVisitGuestRuntimeDetails(snap patrolRuntimeState, visit patrolGuestRuntimeDetailsVisitor) bool {
	rows := patrolGuestInventoryRows(snap, nil, nil)
	for _, guest := range rows {
		identifiers := []string{guest.id, guest.name}
		if guest.vmid > 0 {
			identifiers = append(identifiers, fmt.Sprintf("%d", guest.vmid))
		}
		if !visit(identifiers, patrolGuestRuntimeDetails{
			lastBackup: guest.lastBackup,
			node:       guest.node,
			ip:         guest.ip,
		}) {
			return true
		}
	}
	return len(rows) > 0
}

func patrolLookupMetricsWithVisitor(resourceID string, walk func(patrolMetricVisitor) bool) (map[string]float64, bool) {
	found := false
	var result map[string]float64
	walk(func(identifiers []string, metrics map[string]float64) bool {
		for _, identifier := range identifiers {
			if strings.TrimSpace(identifier) != strings.TrimSpace(resourceID) {
				continue
			}
			result = metrics
			found = true
			return false
		}
		return true
	})
	return result, found
}

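// exampleVisitorLookupSketch is an illustrative sketch (hypothetical inventory;
// not called anywhere in Patrol) of the visitor contract used above: the walk
// function feeds identifier sets to the visitor, and the visitor returns false
// to stop the walk as soon as one identifier matches the requested resource.
func exampleVisitorLookupSketch() {
	metrics, ok := patrolLookupMetricsWithVisitor("node-1", func(visit patrolMetricVisitor) bool {
		inventory := []struct {
			id      string
			metrics map[string]float64
		}{
			{"node-1", map[string]float64{"cpu": 0.42}},
			{"node-2", map[string]float64{"cpu": 0.91}},
		}
		for _, row := range inventory {
			if !visit([]string{row.id}, row.metrics) {
				return true // visitor stopped early; inventory was non-empty
			}
		}
		return len(inventory) > 0
	})
	fmt.Println(metrics, ok) // map[cpu:0.42] true
}
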
func patrolVisitMetrics(snap patrolRuntimeState, visit patrolMetricVisitor) bool {
	hasInventory := false
	for _, walk := range []func(patrolRuntimeState, patrolMetricVisitor) bool{
		patrolVisitNodeMetrics,
		func(s patrolRuntimeState, v patrolMetricVisitor) bool { return patrolVisitGuestMetrics(s, "VM", v) },
		func(s patrolRuntimeState, v patrolMetricVisitor) bool {
			return patrolVisitGuestMetrics(s, "Container", v)
		},
		patrolVisitStorageMetrics,
	} {
		if walk(snap, visit) {
			hasInventory = true
		}
	}
	return hasInventory
}

func patrolVisitNodeMetrics(snap patrolRuntimeState, visit patrolMetricVisitor) bool {
	rows := patrolNodeInventoryRows(snap, nil)
	for _, node := range rows {
		metrics := map[string]float64{"cpu": node.cpu}
		if node.mem > 0 {
			metrics["memory"] = node.mem
		}
		if !visit([]string{node.id, node.name}, metrics) {
			return true
		}
	}
	return len(rows) > 0
}

func patrolVisitGuestMetrics(snap patrolRuntimeState, guestType string, visit patrolMetricVisitor) bool {
	rows := patrolGuestInventoryRows(snap, nil, nil)
	count := 0
	for _, guest := range rows {
		if guest.gType != guestType {
			continue
		}
		count++
		if !visit([]string{guest.id, guest.name}, map[string]float64{
			"cpu":    guest.cpu,
			"memory": guest.mem,
			"disk":   guest.disk,
		}) {
			return true
		}
	}
	return count > 0
}

func patrolVisitStorageMetrics(snap patrolRuntimeState, visit patrolMetricVisitor) bool {
	rows := patrolStoragePoolRows(snap, nil)
	for _, storage := range rows {
		if !visit([]string{storage.id, storage.name}, map[string]float64{"usage": storage.usage}) {
			return true
		}
	}
	return len(rows) > 0
}

func patrolRegisterResourceMetrics(dest map[string]map[string]float64, metrics map[string]float64, identifiers ...string) {
	for _, identifier := range identifiers {
		identifier = strings.TrimSpace(identifier)
		if identifier == "" {
			continue
		}
		dest[identifier] = metrics
	}
}

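// exampleRegisterAliasSketch is an illustrative sketch (hypothetical
// identifiers; not called anywhere in Patrol) of the aliasing performed above:
// every non-empty identifier is mapped to the same metrics map, so a later
// lookup by ID, name, or device path sees identical values.
func exampleRegisterAliasSketch() {
	dest := make(map[string]map[string]float64)
	metrics := map[string]float64{"usage": 0.63}
	patrolRegisterResourceMetrics(dest, metrics, "storage-1", "local-zfs", "")
	fmt.Println(len(dest))                                              // 2: the empty identifier is skipped
	fmt.Println(dest["storage-1"]["usage"], dest["local-zfs"]["usage"]) // 0.63 0.63
}
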
func patrolGuestMatches(guestID, id, name string, vmid int) bool {
	return id == guestID || name == guestID || fmt.Sprintf("%d", vmid) == guestID
}

func patrolFirstIP(ips []string) string {
	if len(ips) == 0 {
		return ""
	}
	return ips[0]
}