Pulse/internal/ai/alert_provider.go
rcourtman 37e7aebc98 feat: enhance AI patrol with streaming and improved findings
- Add streaming support to patrol operations
- Improve finding detection and reporting
- Enhance agentic chat capabilities
- Add alert integration improvements
2026-01-22 22:30:35 +00:00

249 lines
8 KiB
Go

// Package ai provides AI-powered infrastructure investigation and remediation.
package ai
import (
"fmt"
"strings"
"time"
)
// AlertInfo contains information about an alert for AI context
type AlertInfo struct {
ID string `json:"id"`
Type string `json:"type"` // cpu, memory, disk, offline, etc.
Level string `json:"level"` // warning, critical
ResourceID string `json:"resource_id"` // unique resource identifier
ResourceName string `json:"resource_name"` // human-readable name
ResourceType string `json:"resource_type"` // guest, node, storage, docker, etc.
Node string `json:"node"` // PVE node (if applicable)
Instance string `json:"instance"` // Proxmox instance name
Message string `json:"message"` // Alert description
Value float64 `json:"value"` // Current metric value
Threshold float64 `json:"threshold"` // Threshold that was exceeded
StartTime time.Time `json:"start_time"` // When alert started
Duration string `json:"duration"` // Human-readable duration
Acknowledged bool `json:"acknowledged"` // Whether alert has been acked
}
// ResolvedAlertInfo contains information about a recently resolved alert
type ResolvedAlertInfo struct {
AlertInfo
ResolvedTime time.Time `json:"resolved_time"`
Duration string `json:"total_duration"` // How long the alert lasted
}
// AlertProvider provides access to the current alert state
type AlertProvider interface {
// GetActiveAlerts returns all currently active alerts
GetActiveAlerts() []AlertInfo
// GetRecentlyResolved returns alerts resolved in the last N minutes
GetRecentlyResolved(minutes int) []ResolvedAlertInfo
// GetAlertsByResource returns active alerts for a specific resource
GetAlertsByResource(resourceID string) []AlertInfo
// GetAlertHistory returns historical alerts for a resource
GetAlertHistory(resourceID string, limit int) []ResolvedAlertInfo
}
// SetAlertProvider sets the alert provider for AI context
func (s *Service) SetAlertProvider(ap AlertProvider) {
s.mu.Lock()
defer s.mu.Unlock()
s.alertProvider = ap
}
// SetAlertResolver sets the alert resolver for AI Patrol to use for autonomous alert management.
// This allows the patrol service to review and auto-resolve alerts when issues are fixed.
func (s *Service) SetAlertResolver(resolver AlertResolver) {
s.mu.Lock()
patrol := s.patrolService
s.mu.Unlock()
if patrol != nil {
patrol.SetAlertResolver(resolver)
}
}
// buildAlertContext generates AI context from current alerts
func (s *Service) buildAlertContext() string {
s.mu.RLock()
ap := s.alertProvider
s.mu.RUnlock()
if ap == nil {
return ""
}
activeAlerts := ap.GetActiveAlerts()
recentlyResolved := ap.GetRecentlyResolved(30) // Last 30 minutes
if len(activeAlerts) == 0 && len(recentlyResolved) == 0 {
return ""
}
var sections []string
sections = append(sections, "\n## Alert Status")
// Active alerts
if len(activeAlerts) > 0 {
sections = append(sections, "\n### Active Alerts")
sections = append(sections, fmt.Sprintf("There are **%d active alert(s)** that may need attention:\n", len(activeAlerts)))
// Group by severity
var critical, warning []AlertInfo
for _, a := range activeAlerts {
if a.Level == "critical" {
critical = append(critical, a)
} else {
warning = append(warning, a)
}
}
if len(critical) > 0 {
sections = append(sections, "**Critical:**")
for _, a := range critical {
sections = append(sections, formatAlertForAI(a))
}
}
if len(warning) > 0 {
sections = append(sections, "**Warning:**")
for _, a := range warning {
sections = append(sections, formatAlertForAI(a))
}
}
} else {
sections = append(sections, "\n### No Active Alerts")
sections = append(sections, "All systems are operating within normal thresholds.")
}
// Recently resolved
if len(recentlyResolved) > 0 {
sections = append(sections, fmt.Sprintf("\n### Recently Resolved (%d)", len(recentlyResolved)))
sections = append(sections, "These alerts were resolved in the last 30 minutes:")
// Show up to 5 most recent
limit := 5
if len(recentlyResolved) < limit {
limit = len(recentlyResolved)
}
for i := 0; i < limit; i++ {
a := recentlyResolved[i]
sections = append(sections, fmt.Sprintf("- **%s** on %s: %s (lasted %s, resolved %s ago)",
a.Type, a.ResourceName, a.Message, a.Duration,
formatTimeAgo(a.ResolvedTime)))
}
if len(recentlyResolved) > limit {
sections = append(sections, fmt.Sprintf(" ... and %d more", len(recentlyResolved)-limit))
}
}
return strings.Join(sections, "\n")
}
// buildTargetAlertContext builds alert context for a specific target
func (s *Service) buildTargetAlertContext(resourceID string) string {
s.mu.RLock()
ap := s.alertProvider
s.mu.RUnlock()
if ap == nil || resourceID == "" {
return ""
}
alerts := ap.GetAlertsByResource(resourceID)
if len(alerts) == 0 {
return ""
}
var lines []string
lines = append(lines, "\n### Active Alerts for This Resource")
for _, a := range alerts {
lines = append(lines, formatAlertForAI(a))
}
return strings.Join(lines, "\n")
}
// formatAlertForAI formats an alert for inclusion in AI context
func formatAlertForAI(a AlertInfo) string {
ackedNote := ""
if a.Acknowledged {
ackedNote = " [ACKNOWLEDGED]"
}
nodeInfo := ""
if a.Node != "" {
nodeInfo = fmt.Sprintf(" on node %s", a.Node)
}
return fmt.Sprintf("- **%s** %s: %s (current: %.1f%%, threshold: %.1f%%) - active for %s%s%s",
strings.ToUpper(a.Level), a.Type, a.ResourceName,
a.Value, a.Threshold, a.Duration, nodeInfo, ackedNote)
}
// formatTimeAgo returns a human-readable time-ago string
func formatTimeAgo(t time.Time) string {
d := time.Since(t)
if d < time.Minute {
return "just now"
} else if d < time.Hour {
mins := int(d.Minutes())
if mins == 1 {
return "1 minute"
}
return fmt.Sprintf("%d minutes", mins)
} else if d < 24*time.Hour {
hours := int(d.Hours())
if hours == 1 {
return "1 hour"
}
return fmt.Sprintf("%d hours", hours)
}
days := int(d.Hours() / 24)
if days == 1 {
return "1 day"
}
return fmt.Sprintf("%d days", days)
}
// AlertInvestigationRequest represents a request to investigate an alert
type AlertInvestigationRequest struct {
AlertID string `json:"alert_id"`
ResourceID string `json:"resource_id"`
ResourceName string `json:"resource_name"`
ResourceType string `json:"resource_type"` // guest, node, storage, docker
AlertType string `json:"alert_type"` // cpu, memory, disk, offline, etc.
Level string `json:"level"` // warning, critical
Value float64 `json:"value"`
Threshold float64 `json:"threshold"`
Message string `json:"message"`
Duration string `json:"duration"` // How long the alert has been active
Node string `json:"node,omitempty"`
VMID int `json:"vmid,omitempty"`
}
// GenerateAlertInvestigationPrompt creates a focused prompt for alert investigation
func GenerateAlertInvestigationPrompt(req AlertInvestigationRequest) string {
var prompt strings.Builder
prompt.WriteString(fmt.Sprintf("Investigate this %s alert:\n\n", strings.ToUpper(req.Level)))
prompt.WriteString(fmt.Sprintf("**Resource:** %s (%s)\n", req.ResourceName, req.ResourceType))
prompt.WriteString(fmt.Sprintf("**Alert Type:** %s\n", req.AlertType))
prompt.WriteString(fmt.Sprintf("**Current Value:** %.1f%%\n", req.Value))
prompt.WriteString(fmt.Sprintf("**Threshold:** %.1f%%\n", req.Threshold))
prompt.WriteString(fmt.Sprintf("**Duration:** %s\n", req.Duration))
if req.Node != "" {
prompt.WriteString(fmt.Sprintf("**Node:** %s\n", req.Node))
}
prompt.WriteString("\n**Action Required:**\n")
prompt.WriteString("1. Identify the root cause of this alert\n")
prompt.WriteString("2. Check related metrics and system state\n")
prompt.WriteString("3. Suggest specific remediation steps\n")
prompt.WriteString("4. If safe, execute diagnostic commands to gather more info\n")
return prompt.String()
}