Pulse/internal/ai/incident_coordinator.go
rcourtman 27f1a11acb feat: add AI Intelligence system with investigation and forecasting
Major new AI capabilities for infrastructure monitoring:

Investigation System:
- Autonomous finding investigation with configurable autonomy levels
- Investigation orchestrator with rate limiting and guardrails
- Safety checks for read-only mode enforcement
- Chat-based investigation with approval workflows

Forecasting & Remediation:
- Trend forecasting for resource capacity planning
- Remediation engine for generating fix proposals
- Circuit breaker for AI operation protection

Unified Findings:
- Unified store bridging alerts and AI findings
- Correlation and root cause analysis
- Incident coordinator with metrics recording

New Frontend:
- AI Intelligence page with patrol controls
- Investigation drawer for finding details
- Unified findings panel with actions

Supporting Infrastructure:
- Learning store for user preference tracking
- Proxmox event ingestion and correlation
- Enhanced patrol with investigation triggers
2026-01-24 22:41:43 +00:00

325 lines
8 KiB
Go

// Package ai provides AI-powered infrastructure analysis.
package ai
import (
"sync"
"time"
"github.com/rcourtman/pulse-go-rewrite/internal/ai/memory"
"github.com/rcourtman/pulse-go-rewrite/internal/alerts"
"github.com/rcourtman/pulse-go-rewrite/internal/metrics"
"github.com/rs/zerolog/log"
)
// IncidentCoordinatorConfig configures the incident coordinator
type IncidentCoordinatorConfig struct {
PreBuffer time.Duration // History to capture before incident (default: 5 min)
PostDuration time.Duration // How long to record after trigger (default: 10 min)
MaxConcurrent int // Maximum concurrent incident recordings (default: 50)
EnableRecorder bool // Whether to enable high-frequency recording
}
// DefaultIncidentCoordinatorConfig returns sensible defaults
func DefaultIncidentCoordinatorConfig() IncidentCoordinatorConfig {
return IncidentCoordinatorConfig{
PreBuffer: 5 * time.Minute,
PostDuration: 10 * time.Minute,
MaxConcurrent: 50,
EnableRecorder: true,
}
}
// IncidentCoordinator coordinates incident recording between the metrics.IncidentRecorder
// (for high-frequency data capture) and memory.IncidentStore (for incident timeline tracking).
type IncidentCoordinator struct {
mu sync.RWMutex
config IncidentCoordinatorConfig
// Components
recorder *metrics.IncidentRecorder // High-frequency metrics capture
incidentStore *memory.IncidentStore // Incident timeline tracking
// Active incidents - maps alert ID to window ID
activeIncidents map[string]activeIncident
// Control
running bool
}
type activeIncident struct {
windowID string
resourceID string
startedAt time.Time
stopTimer *time.Timer // Timer to auto-stop recording after post-duration
}
// NewIncidentCoordinator creates a new incident coordinator
func NewIncidentCoordinator(cfg IncidentCoordinatorConfig) *IncidentCoordinator {
if cfg.PreBuffer <= 0 {
cfg.PreBuffer = 5 * time.Minute
}
if cfg.PostDuration <= 0 {
cfg.PostDuration = 10 * time.Minute
}
if cfg.MaxConcurrent <= 0 {
cfg.MaxConcurrent = 50
}
return &IncidentCoordinator{
config: cfg,
activeIncidents: make(map[string]activeIncident),
}
}
// SetRecorder sets the metrics incident recorder
func (c *IncidentCoordinator) SetRecorder(recorder *metrics.IncidentRecorder) {
c.mu.Lock()
defer c.mu.Unlock()
c.recorder = recorder
}
// SetIncidentStore sets the incident timeline store
func (c *IncidentCoordinator) SetIncidentStore(store *memory.IncidentStore) {
c.mu.Lock()
defer c.mu.Unlock()
c.incidentStore = store
}
// Start starts the incident coordinator
func (c *IncidentCoordinator) Start() {
c.mu.Lock()
defer c.mu.Unlock()
if c.running {
return
}
c.running = true
log.Info().Msg("Incident coordinator started")
}
// Stop stops the incident coordinator
func (c *IncidentCoordinator) Stop() {
c.mu.Lock()
defer c.mu.Unlock()
if !c.running {
return
}
c.running = false
// Stop all active timers
for _, inc := range c.activeIncidents {
if inc.stopTimer != nil {
inc.stopTimer.Stop()
}
}
c.activeIncidents = make(map[string]activeIncident)
log.Info().Msg("Incident coordinator stopped")
}
// OnAlertFired is called when an alert fires - starts incident recording
func (c *IncidentCoordinator) OnAlertFired(alert *alerts.Alert) {
if alert == nil {
return
}
c.mu.Lock()
defer c.mu.Unlock()
if !c.running {
return
}
// Check if we already have an active incident for this alert
if _, exists := c.activeIncidents[alert.ID]; exists {
log.Debug().
Str("alert_id", alert.ID).
Msg("Incident already being recorded for this alert")
return
}
// Check concurrent limit
if len(c.activeIncidents) >= c.config.MaxConcurrent {
log.Warn().
Str("alert_id", alert.ID).
Int("active_count", len(c.activeIncidents)).
Msg("Incident coordinator at capacity, skipping new incident")
return
}
// Start high-frequency recording if enabled and recorder available
var windowID string
if c.config.EnableRecorder && c.recorder != nil {
windowID = c.recorder.StartRecording(
alert.ResourceID,
alert.ResourceName,
"", // resourceType - we don't always have this
"alert",
alert.ID,
)
}
// Record in incident store
if c.incidentStore != nil {
c.incidentStore.RecordAlertFired(alert)
}
// Track the active incident
inc := activeIncident{
windowID: windowID,
resourceID: alert.ResourceID,
startedAt: time.Now(),
}
c.activeIncidents[alert.ID] = inc
log.Info().
Str("alert_id", alert.ID).
Str("resource_id", alert.ResourceID).
Str("window_id", windowID).
Msg("Incident coordinator: Started incident recording")
}
// OnAlertCleared is called when an alert clears - schedules recording stop
func (c *IncidentCoordinator) OnAlertCleared(alert *alerts.Alert) {
if alert == nil {
return
}
c.mu.Lock()
inc, exists := c.activeIncidents[alert.ID]
if !exists {
c.mu.Unlock()
return
}
// Record resolution in incident store
if c.incidentStore != nil {
c.incidentStore.RecordAlertResolved(alert, time.Now())
}
// If no recorder or no window, just clean up immediately
if c.recorder == nil || inc.windowID == "" {
delete(c.activeIncidents, alert.ID)
c.mu.Unlock()
return
}
// Schedule stop after post-duration to capture post-incident data
alertID := alert.ID
timer := time.AfterFunc(c.config.PostDuration, func() {
c.stopIncidentRecording(alertID)
})
inc.stopTimer = timer
c.activeIncidents[alert.ID] = inc
c.mu.Unlock()
log.Info().
Str("alert_id", alert.ID).
Str("window_id", inc.windowID).
Dur("post_duration", c.config.PostDuration).
Msg("Incident coordinator: Alert cleared, scheduled recording stop")
}
// stopIncidentRecording stops recording for a specific alert
func (c *IncidentCoordinator) stopIncidentRecording(alertID string) {
c.mu.Lock()
defer c.mu.Unlock()
inc, exists := c.activeIncidents[alertID]
if !exists {
return
}
// Stop the recorder
if c.recorder != nil && inc.windowID != "" {
c.recorder.StopRecording(inc.windowID)
}
// Clean up
if inc.stopTimer != nil {
inc.stopTimer.Stop()
}
delete(c.activeIncidents, alertID)
log.Info().
Str("alert_id", alertID).
Str("window_id", inc.windowID).
Msg("Incident coordinator: Stopped incident recording")
}
// OnAnomalyDetected is called when an anomaly is detected - starts focused recording
func (c *IncidentCoordinator) OnAnomalyDetected(resourceID, resourceType, metric string, severity string) {
c.mu.Lock()
defer c.mu.Unlock()
if !c.running || !c.config.EnableRecorder || c.recorder == nil {
return
}
// Create a pseudo-alert ID for the anomaly
anomalyID := "anomaly-" + resourceID + "-" + metric
// Check if we already have an active incident for this
if _, exists := c.activeIncidents[anomalyID]; exists {
return
}
// Check concurrent limit
if len(c.activeIncidents) >= c.config.MaxConcurrent {
return
}
// Start recording
windowID := c.recorder.StartRecording(
resourceID,
"", // name
resourceType,
"anomaly",
anomalyID,
)
// Track the active incident
c.activeIncidents[anomalyID] = activeIncident{
windowID: windowID,
resourceID: resourceID,
startedAt: time.Now(),
}
// Schedule auto-stop after post-duration (anomalies don't have "clear" events)
timer := time.AfterFunc(c.config.PostDuration, func() {
c.stopIncidentRecording(anomalyID)
})
c.activeIncidents[anomalyID] = activeIncident{
windowID: windowID,
resourceID: resourceID,
startedAt: time.Now(),
stopTimer: timer,
}
log.Info().
Str("resource_id", resourceID).
Str("metric", metric).
Str("severity", severity).
Str("window_id", windowID).
Msg("Incident coordinator: Started anomaly recording")
}
// GetActiveIncidentCount returns the number of active incidents being recorded
func (c *IncidentCoordinator) GetActiveIncidentCount() int {
c.mu.RLock()
defer c.mu.RUnlock()
return len(c.activeIncidents)
}
// GetRecordingWindowID returns the recording window ID for an alert
func (c *IncidentCoordinator) GetRecordingWindowID(alertID string) string {
c.mu.RLock()
defer c.mu.RUnlock()
if inc, exists := c.activeIncidents[alertID]; exists {
return inc.windowID
}
return ""
}