Pulse/internal/metrics/incident_recorder.go
rcourtman 27f1a11acb feat: add AI Intelligence system with investigation and forecasting
Major new AI capabilities for infrastructure monitoring:

Investigation System:
- Autonomous finding investigation with configurable autonomy levels
- Investigation orchestrator with rate limiting and guardrails
- Safety checks for read-only mode enforcement
- Chat-based investigation with approval workflows

Forecasting & Remediation:
- Trend forecasting for resource capacity planning
- Remediation engine for generating fix proposals
- Circuit breaker for AI operation protection

Unified Findings:
- Unified store bridging alerts and AI findings
- Correlation and root cause analysis
- Incident coordinator with metrics recording

New Frontend:
- AI Intelligence page with patrol controls
- Investigation drawer for finding details
- Unified findings panel with actions

Supporting Infrastructure:
- Learning store for user preference tracking
- Proxmox event ingestion and correlation
- Enhanced patrol with investigation triggers
2026-01-24 22:41:43 +00:00

742 lines
19 KiB
Go

// Package metrics provides metrics collection and incident recording functionality.
package metrics
import (
	"encoding/json"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/rs/zerolog/log"
)
// IncidentWindow represents a high-frequency recording window during an incident.
// A window is created by StartRecording, receives samples from the background
// recording loop while its status is "recording", and is finalized (summary
// computed, moved to the completed list) by completeWindow.
type IncidentWindow struct {
// ID is the unique window identifier produced by generateWindowID.
ID string `json:"id"`
// ResourceID identifies the resource whose metrics are being recorded.
ResourceID string `json:"resource_id"`
ResourceName string `json:"resource_name,omitempty"`
ResourceType string `json:"resource_type,omitempty"`
TriggerType string `json:"trigger_type"` // "alert", "anomaly", "focus", "manual"
TriggerID string `json:"trigger_id,omitempty"`
// StartTime is back-dated by the configured pre-incident window so that it
// covers the buffered samples copied in when the window is created.
StartTime time.Time `json:"start_time"`
// EndTime is the scheduled end while the window is recording and is
// overwritten with the actual completion time when the window is finalized.
EndTime *time.Time `json:"end_time,omitempty"`
Status IncidentWindowStatus `json:"status"`
// DataPoints holds the samples in the order they were appended; it is seeded
// with the pre-incident buffer for the resource when one exists.
DataPoints []IncidentDataPoint `json:"data_points"`
// Summary is computed when the window is finalized; it stays nil when the
// window has no data points.
Summary *IncidentSummary `json:"summary,omitempty"`
}
// IncidentWindowStatus represents the status of an incident window.
// The string values below are what appears in the window's JSON encoding.
type IncidentWindowStatus string
const (
// IncidentWindowStatusRecording marks a window that is still accepting samples.
IncidentWindowStatusRecording IncidentWindowStatus = "recording"
// IncidentWindowStatusComplete marks a window that was finalized normally
// (scheduled end reached or StopRecording called).
IncidentWindowStatusComplete IncidentWindowStatus = "complete"
// IncidentWindowStatusTruncated marks a window that was finalized because it
// reached the configured maximum number of data points.
IncidentWindowStatusTruncated IncidentWindowStatus = "truncated" // Stopped due to limits
)
// IncidentDataPoint represents a single data point in an incident window.
type IncidentDataPoint struct {
// Timestamp is the time at which the sample was taken.
Timestamp time.Time `json:"timestamp"`
// Metrics is the metric-name -> value map returned by the MetricsProvider
// for the resource at sampling time.
Metrics map[string]float64 `json:"metrics"` // cpu, memory, disk, etc.
// Metadata is optional free-form context; nothing in this file populates it.
Metadata map[string]interface{} `json:"metadata,omitempty"`
}
// IncidentSummary provides computed statistics about an incident window.
// All maps are keyed by metric name.
type IncidentSummary struct {
// Duration is the time between the first and last data point timestamps.
// NOTE(review): the JSON key is "duration_ms", but time.Duration is an int64
// nanosecond count and encoding/json writes it as that integer — confirm
// what unit consumers of the persisted/served JSON expect.
Duration time.Duration `json:"duration_ms"`
// DataPoints is the number of samples the summary was computed from.
DataPoints int `json:"data_points"`
Peaks map[string]float64 `json:"peaks"` // Maximum values
Lows map[string]float64 `json:"lows"` // Minimum values
Averages map[string]float64 `json:"averages"` // Average values
Changes map[string]float64 `json:"changes"` // Change from start to end
// Anomalies is not populated by computeSummary in this file.
Anomalies []string `json:"anomalies,omitempty"` // Detected anomalies
}
// IncidentRecorderConfig configures the incident recorder.
// NewIncidentRecorder replaces any non-positive numeric/duration field with
// the default noted next to it.
type IncidentRecorderConfig struct {
// Recording settings
SampleInterval time.Duration // How often to record data points (default: 5s)
PreIncidentWindow time.Duration // How much data to capture before incident (default: 5min)
PostIncidentWindow time.Duration // How much data to capture after incident (default: 10min)
MaxDataPointsPerWindow int // Maximum data points per window (default: 500)
// Storage settings
// DataDir is the directory holding incident_windows.json; when empty,
// nothing is loaded from or saved to disk.
DataDir string
MaxWindows int // Maximum number of windows to keep (default: 100)
RetentionDuration time.Duration // How long to keep windows (default: 24h)
}
// DefaultIncidentRecorderConfig returns a configuration populated with the
// stock sampling, windowing and retention values. DataDir is left empty, so
// a recorder built from it does not persist anything unless the caller sets it.
func DefaultIncidentRecorderConfig() IncidentRecorderConfig {
	var cfg IncidentRecorderConfig

	// Recording cadence and window sizes.
	cfg.SampleInterval = 5 * time.Second
	cfg.PreIncidentWindow = 5 * time.Minute
	cfg.PostIncidentWindow = 10 * time.Minute
	cfg.MaxDataPointsPerWindow = 500

	// Retention of completed windows.
	cfg.MaxWindows = 100
	cfg.RetentionDuration = 24 * time.Hour

	return cfg
}
// MetricsProvider provides current metrics for a resource.
// The recorder invokes both methods from its sampling loop while holding its
// own mutex, so slow implementations delay every other recorder operation.
type MetricsProvider interface {
// GetCurrentMetrics returns the current metric-name -> value map for the
// given resource, or an error; on error the recorder skips that sample.
GetCurrentMetrics(resourceID string) (map[string]float64, error)
GetMonitoredResourceIDs() []string // Returns all resource IDs being monitored
}
// IncidentRecorder captures high-frequency metrics during incidents.
// It keeps a rolling pre-incident buffer per monitored resource, a set of
// active recording windows, and a bounded list of completed windows that is
// optionally persisted to disk.
type IncidentRecorder struct {
// mu guards the provider, the window collections, the pre-incident buffers
// and the running/stopCh control fields.
mu sync.RWMutex
config IncidentRecorderConfig
provider MetricsProvider
// Active recordings
activeWindows map[string]*IncidentWindow // keyed by window ID
// Completed recordings, oldest first; bounded by trimCompletedWindows
// (retention duration and MaxWindows) rather than a fixed-size ring.
completedWindows []*IncidentWindow
// Background recording for pre-incident buffer
preIncidentBuffer map[string][]IncidentDataPoint // keyed by resource ID
// Persistence
// dataDir is the directory created on save; filePath is the JSON file inside
// it. An empty filePath disables persistence.
dataDir string
filePath string
// Control
// stopCh is closed by Stop to end the recording loop and is replaced with a
// fresh channel on every Start.
stopCh chan struct{}
running bool
}
// NewIncidentRecorder builds a recorder from cfg. Every non-positive tunable
// is replaced by its default. When cfg.DataDir is set, previously persisted
// completed windows are loaded from incident_windows.json in that directory;
// a load failure is logged as a warning and the recorder starts empty.
// The recorder is returned stopped — call Start to begin sampling.
func NewIncidentRecorder(cfg IncidentRecorderConfig) *IncidentRecorder {
	// Normalise the sampling settings.
	if cfg.SampleInterval <= 0 {
		cfg.SampleInterval = 5 * time.Second
	}
	if cfg.PreIncidentWindow <= 0 {
		cfg.PreIncidentWindow = 5 * time.Minute
	}
	if cfg.PostIncidentWindow <= 0 {
		cfg.PostIncidentWindow = 10 * time.Minute
	}
	if cfg.MaxDataPointsPerWindow <= 0 {
		cfg.MaxDataPointsPerWindow = 500
	}

	// Normalise the retention settings.
	if cfg.MaxWindows <= 0 {
		cfg.MaxWindows = 100
	}
	if cfg.RetentionDuration <= 0 {
		cfg.RetentionDuration = 24 * time.Hour
	}

	rec := &IncidentRecorder{
		config:            cfg,
		dataDir:           cfg.DataDir,
		stopCh:            make(chan struct{}),
		activeWindows:     make(map[string]*IncidentWindow),
		preIncidentBuffer: make(map[string][]IncidentDataPoint),
		completedWindows:  make([]*IncidentWindow, 0),
	}

	// Without a data directory the recorder is purely in-memory.
	if cfg.DataDir == "" {
		return rec
	}

	rec.filePath = filepath.Join(cfg.DataDir, "incident_windows.json")
	if loadErr := rec.loadFromDisk(); loadErr != nil {
		log.Warn().Err(loadErr).Msg("Failed to load incident windows from disk")
	}
	return rec
}
// SetMetricsProvider installs the source the recorder samples metrics from.
// Until a provider is set, the sampling loop does nothing on each tick.
func (r *IncidentRecorder) SetMetricsProvider(provider MetricsProvider) {
	r.mu.Lock()
	r.provider = provider
	r.mu.Unlock()
}
// Start launches the background sampling loop that feeds the pre-incident
// buffers and any active windows. Calling Start on a recorder that is already
// running is a no-op.
func (r *IncidentRecorder) Start() {
	r.mu.Lock()
	alreadyRunning := r.running
	if !alreadyRunning {
		r.running = true
		// A fresh channel is needed because Stop closes the previous one.
		r.stopCh = make(chan struct{})
	}
	r.mu.Unlock()

	if alreadyRunning {
		return
	}

	go r.recordingLoop()
	log.Info().Msg("Incident recorder started")
}
// Stop signals the sampling loop to exit and then persists the completed
// windows. Calling Stop on a recorder that is not running is a no-op.
// It does not wait for the loop goroutine to finish.
func (r *IncidentRecorder) Stop() {
	r.mu.Lock()
	wasRunning := r.running
	if wasRunning {
		r.running = false
		close(r.stopCh)
	}
	r.mu.Unlock()

	if !wasRunning {
		return
	}

	// Persist outside the lock: saveToDisk takes the read lock itself.
	if saveErr := r.saveToDisk(); saveErr != nil {
		log.Warn().Err(saveErr).Msg("Failed to save incident windows on stop")
	}
	log.Info().Msg("Incident recorder stopped")
}
// recordingLoop runs in the background to maintain pre-incident buffers and
// active windows. It takes one sample per configured SampleInterval until the
// stop channel that was current when the loop began is closed.
func (r *IncidentRecorder) recordingLoop() {
	// Capture the stop channel once, under the lock. Start replaces r.stopCh
	// on every run, so re-reading the field on each iteration is a data race
	// and lets a loop that has not yet observed the close latch onto the new
	// channel after a Stop/Start cycle, leaving two loops sampling at once.
	r.mu.RLock()
	stopCh := r.stopCh
	interval := r.config.SampleInterval
	r.mu.RUnlock()

	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	for {
		select {
		case <-stopCh:
			return
		case <-ticker.C:
			r.recordSample()
		}
	}
}
// recordSample takes one sampling pass: it appends a data point to every
// recording window (finalizing those that reached their scheduled end or the
// data-point cap), then refreshes the rolling pre-incident buffer of every
// monitored resource and drops buffers of resources no longer monitored.
// The whole pass runs under the write lock; it is a no-op without a provider.
func (r *IncidentRecorder) recordSample() {
	r.mu.Lock()
	defer r.mu.Unlock()

	provider := r.provider
	if provider == nil {
		return
	}

	sampledAt := time.Now()

	// Pass 1: feed the active windows. completeWindow removes entries from
	// the map being ranged over, which Go permits.
	for _, win := range r.activeWindows {
		switch {
		case win.Status != IncidentWindowStatusRecording:
			continue
		case win.EndTime != nil && sampledAt.After(*win.EndTime):
			// Scheduled post-incident period is over.
			r.completeWindow(win)
			continue
		case len(win.DataPoints) >= r.config.MaxDataPointsPerWindow:
			// Size cap reached: finalize as truncated.
			win.Status = IncidentWindowStatusTruncated
			r.completeWindow(win)
			continue
		}

		values, err := provider.GetCurrentMetrics(win.ResourceID)
		if err != nil {
			log.Debug().
				Str("resource_id", win.ResourceID).
				Err(err).
				Msg("Failed to get metrics for incident window")
			continue
		}
		win.DataPoints = append(win.DataPoints, IncidentDataPoint{
			Timestamp: sampledAt,
			Metrics:   values,
		})
	}

	// Pass 2: keep a rolling history for every monitored resource so that a
	// window opened later can be seeded with what happened before the trigger.
	resourceIDs := provider.GetMonitoredResourceIDs()
	oldestKept := sampledAt.Add(-r.config.PreIncidentWindow)
	stillMonitored := make(map[string]bool, len(resourceIDs))

	for _, id := range resourceIDs {
		stillMonitored[id] = true

		values, err := provider.GetCurrentMetrics(id)
		if err != nil {
			// Leave this resource's buffer untouched on a failed read.
			continue
		}

		history := append(r.preIncidentBuffer[id], IncidentDataPoint{
			Timestamp: sampledAt,
			Metrics:   values,
		})

		// Retain only samples newer than the pre-incident horizon.
		fresh := make([]IncidentDataPoint, 0, len(history))
		for i := range history {
			if history[i].Timestamp.After(oldestKept) {
				fresh = append(fresh, history[i])
			}
		}
		r.preIncidentBuffer[id] = fresh
	}

	// Pass 3: forget resources that are no longer monitored.
	for id := range r.preIncidentBuffer {
		if !stillMonitored[id] {
			delete(r.preIncidentBuffer, id)
		}
	}
}
// StartRecording opens an incident window for resourceID and returns its ID.
// If the resource already has a window in the recording state, that window's
// scheduled end is pushed out by PostIncidentWindow from now and its ID is
// returned instead; the new trigger details are not stored in that case.
// A new window is back-dated by PreIncidentWindow and seeded with the
// resource's buffered pre-incident samples, when any exist.
func (r *IncidentRecorder) StartRecording(resourceID, resourceName, resourceType, triggerType, triggerID string) string {
	r.mu.Lock()
	defer r.mu.Unlock()

	// Reuse an in-progress window for the same resource rather than stacking
	// overlapping recordings.
	for _, existing := range r.activeWindows {
		if existing.ResourceID != resourceID || existing.Status != IncidentWindowStatusRecording {
			continue
		}
		extendedEnd := time.Now().Add(r.config.PostIncidentWindow)
		existing.EndTime = &extendedEnd
		return existing.ID
	}

	id := generateWindowID(resourceID)
	startedAt := time.Now()
	scheduledEnd := startedAt.Add(r.config.PostIncidentWindow)

	win := &IncidentWindow{
		ID:           id,
		ResourceID:   resourceID,
		ResourceName: resourceName,
		ResourceType: resourceType,
		TriggerType:  triggerType,
		TriggerID:    triggerID,
		// Back-date the start so it spans the pre-incident samples below.
		StartTime:  startedAt.Add(-r.config.PreIncidentWindow),
		EndTime:    &scheduledEnd,
		Status:     IncidentWindowStatusRecording,
		DataPoints: make([]IncidentDataPoint, 0),
	}

	// Seed the window with whatever history was buffered for this resource.
	if history, found := r.preIncidentBuffer[resourceID]; found {
		win.DataPoints = append(win.DataPoints, history...)
	}

	r.activeWindows[id] = win

	log.Info().
		Str("window_id", id).
		Str("resource_id", resourceID).
		Str("trigger_type", triggerType).
		Msg("Started incident recording")

	return id
}
// StopRecording finalizes the active window with the given ID. Unknown IDs,
// including windows that have already completed, are ignored.
func (r *IncidentRecorder) StopRecording(windowID string) {
	r.mu.Lock()
	defer r.mu.Unlock()

	window, found := r.activeWindows[windowID]
	if !found {
		return
	}
	r.completeWindow(window)
}
// completeWindow finalizes a window: it stamps the actual end time, computes
// the summary, moves the window from the active set to the completed list,
// trims that list, and kicks off an asynchronous save. A recording window
// becomes "complete"; a window already marked "truncated" keeps that status.
// Windows in any other state are left untouched. Callers must hold r.mu for
// writing.
func (r *IncidentRecorder) completeWindow(window *IncidentWindow) {
	switch window.Status {
	case IncidentWindowStatusRecording:
		window.Status = IncidentWindowStatusComplete
	case IncidentWindowStatusTruncated:
		// Keep the truncated marker so readers can tell why it ended.
	default:
		return
	}

	finishedAt := time.Now()
	window.EndTime = &finishedAt
	window.Summary = r.computeSummary(window)

	// Hand the window over from the active set to the completed list.
	delete(r.activeWindows, window.ID)
	r.completedWindows = append(r.completedWindows, window)
	r.trimCompletedWindows()

	log.Info().
		Str("window_id", window.ID).
		Str("resource_id", window.ResourceID).
		Int("data_points", len(window.DataPoints)).
		Msg("Completed incident recording")

	// Persist in the background: saveToDisk needs the read lock, which the
	// caller's write lock would block if it were called inline.
	go func() {
		if saveErr := r.saveToDisk(); saveErr != nil {
			log.Warn().Err(saveErr).Msg("Failed to save incident windows")
		}
	}()
}
// computeSummary computes statistics for a window.
//
// For every metric name that appears in at least one data point it records
// the peak, low, average, and the change between the first and last observed
// value. Duration is the span between the first and last data point
// timestamps (zero for a single point). It returns nil when the window has
// no data points.
//
// A metric is initialised the first time it is seen, not only when it occurs
// in the very first data point. Seeding only from point zero left metrics
// that show up later with an implicit zero low (for positive values) and no
// change entry at all.
func (r *IncidentRecorder) computeSummary(window *IncidentWindow) *IncidentSummary {
	points := window.DataPoints
	if len(points) == 0 {
		return nil
	}

	summary := &IncidentSummary{
		DataPoints: len(points),
		Peaks:      make(map[string]float64),
		Lows:       make(map[string]float64),
		Averages:   make(map[string]float64),
		Changes:    make(map[string]float64),
	}

	if len(points) > 1 {
		summary.Duration = points[len(points)-1].Timestamp.Sub(points[0].Timestamp)
	}

	sums := make(map[string]float64)
	counts := make(map[string]int)
	firstValues := make(map[string]float64)
	lastValues := make(map[string]float64)

	for _, dp := range points {
		for metric, value := range dp.Metrics {
			if _, seen := firstValues[metric]; !seen {
				// First observation of this metric: seed all trackers with it.
				firstValues[metric] = value
				summary.Peaks[metric] = value
				summary.Lows[metric] = value
			}
			lastValues[metric] = value

			if value > summary.Peaks[metric] {
				summary.Peaks[metric] = value
			}
			if value < summary.Lows[metric] {
				summary.Lows[metric] = value
			}

			sums[metric] += value
			counts[metric]++
		}
	}

	// Every key in sums was observed at least once, so counts is non-zero and
	// both first and last values are present.
	for metric, sum := range sums {
		summary.Averages[metric] = sum / float64(counts[metric])
		summary.Changes[metric] = lastValues[metric] - firstValues[metric]
	}

	return summary
}
// trimCompletedWindows removes old windows.
//
// It first drops every completed window whose end time is missing or older
// than RetentionDuration, then keeps only the newest MaxWindows entries (the
// list is ordered oldest first). Nil entries are dropped as well: the list
// can come straight from the persisted JSON file, where a "null" element
// would otherwise cause a nil-pointer panic here. Callers must hold r.mu for
// writing, or have exclusive access to the recorder.
func (r *IncidentRecorder) trimCompletedWindows() {
	cutoff := time.Now().Add(-r.config.RetentionDuration)

	// Build a fresh slice instead of filtering in place: saveToDisk marshals
	// a snapshot of the previous slice after releasing the lock, so its
	// backing array must not be rewritten.
	kept := make([]*IncidentWindow, 0, len(r.completedWindows))
	for _, w := range r.completedWindows {
		if w == nil || w.EndTime == nil {
			continue
		}
		if w.EndTime.After(cutoff) {
			kept = append(kept, w)
		}
	}

	if excess := len(kept) - r.config.MaxWindows; excess > 0 {
		kept = kept[excess:]
	}
	r.completedWindows = kept
}
// GetWindow looks up a window by ID among the active windows first and the
// completed windows second. It returns a copy of the match, or nil when no
// window has that ID.
func (r *IncidentRecorder) GetWindow(windowID string) *IncidentWindow {
	r.mu.RLock()
	defer r.mu.RUnlock()

	if active, found := r.activeWindows[windowID]; found {
		return copyWindow(active)
	}

	for i := range r.completedWindows {
		if candidate := r.completedWindows[i]; candidate.ID == windowID {
			return copyWindow(candidate)
		}
	}
	return nil
}
// GetWindowsForResource returns copies of the incident windows recorded for
// resourceID: active windows first, then completed windows from most recent
// to oldest.
//
// A positive limit caps the total number of windows returned; limit <= 0
// means no cap. The cap is checked before every append and covers active
// windows too. Checking only after appending a completed window allowed
// limit+1 results whenever an active window was present.
func (r *IncidentRecorder) GetWindowsForResource(resourceID string, limit int) []*IncidentWindow {
	r.mu.RLock()
	defer r.mu.RUnlock()

	var result []*IncidentWindow
	full := func() bool { return limit > 0 && len(result) >= limit }

	// Active windows come first so the most current recording wins.
	for _, window := range r.activeWindows {
		if window.ResourceID != resourceID {
			continue
		}
		if full() {
			return result
		}
		result = append(result, copyWindow(window))
	}

	// Completed windows are stored oldest first; walk backwards for recency.
	for i := len(r.completedWindows) - 1; i >= 0 && !full(); i-- {
		if r.completedWindows[i].ResourceID == resourceID {
			result = append(result, copyWindow(r.completedWindows[i]))
		}
	}
	return result
}
// GetRecentWindows returns copies of recent incident windows across all
// resources: active windows first (in unspecified order, since they live in a
// map), then completed windows from most recent to oldest.
//
// A positive limit caps the total number of windows returned; limit <= 0
// means no cap, matching GetWindowsForResource. Previously a non-positive
// limit silently suppressed every completed window, and active windows were
// never counted against the cap.
func (r *IncidentRecorder) GetRecentWindows(limit int) []*IncidentWindow {
	r.mu.RLock()
	defer r.mu.RUnlock()

	var result []*IncidentWindow
	full := func() bool { return limit > 0 && len(result) >= limit }

	for _, window := range r.activeWindows {
		if full() {
			return result
		}
		result = append(result, copyWindow(window))
	}

	// Completed windows are stored oldest first; walk backwards for recency.
	for i := len(r.completedWindows) - 1; i >= 0 && !full(); i-- {
		result = append(result, copyWindow(r.completedWindows[i]))
	}
	return result
}
// FormatForContext formats incident data for AI prompt injection.
//
// The window is selected by windowID when it is non-empty; otherwise the
// first window returned for resourceID is used. It returns "" when neither
// identifier is given or no window matches.
//
// The output is a Markdown fragment containing the summary (duration, data
// point count, peak values, changes) when one exists, followed by the last
// ten data points. Metric names are emitted in sorted order so the same
// window always renders the same text; ranging over the maps directly made
// the line order change from call to call.
func (r *IncidentRecorder) FormatForContext(resourceID string, windowID string) string {
	var window *IncidentWindow
	if windowID != "" {
		window = r.GetWindow(windowID)
	} else if resourceID != "" {
		if windows := r.GetWindowsForResource(resourceID, 1); len(windows) > 0 {
			window = windows[0]
		}
	}
	if window == nil {
		return ""
	}

	// sortedKeys returns the metric names of m in ascending order.
	sortedKeys := func(m map[string]float64) []string {
		keys := make([]string, 0, len(m))
		for k := range m {
			keys = append(keys, k)
		}
		sort.Strings(keys)
		return keys
	}

	var b strings.Builder
	b.WriteString("\n## Incident Recording Data\n")
	b.WriteString("High-frequency metrics captured during incident:\n\n")

	if summary := window.Summary; summary != nil {
		b.WriteString("### Summary\n")
		b.WriteString("- Duration: " + summary.Duration.String() + "\n")
		b.WriteString("- Data points: " + intToString(summary.DataPoints) + "\n")

		if len(summary.Peaks) > 0 {
			b.WriteString("\nPeak values:\n")
			for _, metric := range sortedKeys(summary.Peaks) {
				b.WriteString("- " + metric + ": " + floatToString(summary.Peaks[metric]) + "\n")
			}
		}
		if len(summary.Changes) > 0 {
			b.WriteString("\nChanges during incident:\n")
			for _, metric := range sortedKeys(summary.Changes) {
				b.WriteString("- " + metric + ": " + signedFloatToString(summary.Changes[metric]) + "\n")
			}
		}
	}

	// Include recent data points (last 10).
	if n := len(window.DataPoints); n > 0 {
		b.WriteString("\n### Recent Data Points\n")
		start := n - 10
		if start < 0 {
			start = 0
		}
		for _, dp := range window.DataPoints[start:] {
			b.WriteString(dp.Timestamp.Format("15:04:05") + ": ")
			for _, metric := range sortedKeys(dp.Metrics) {
				b.WriteString(metric + "=" + floatToString(dp.Metrics[metric]) + " ")
			}
			b.WriteString("\n")
		}
	}

	return b.String()
}
// saveToDisk persists completed windows.
//
// The completed list is serialized as JSON and written to a temporary file
// in the data directory, which is then renamed over the target file so that
// readers never observe a partially written file. It is a no-op when
// persistence is disabled (empty file path). Active windows are not saved.
//
// Each call uses its own uniquely named temporary file. completeWindow starts
// one save goroutine per finalized window, so several saves can run at once;
// with a single shared ".tmp" path their writes could interleave and one
// call's rename could fail or publish another call's half-written data. The
// temporary file is removed if any step after its creation fails.
func (r *IncidentRecorder) saveToDisk() error {
	if r.filePath == "" {
		return nil
	}

	// Snapshot the slice header under the read lock; marshaling happens
	// afterwards so the lock is not held during encoding.
	r.mu.RLock()
	data := struct {
		CompletedWindows []*IncidentWindow `json:"completed_windows"`
	}{
		CompletedWindows: r.completedWindows,
	}
	r.mu.RUnlock()

	jsonData, err := json.MarshalIndent(data, "", " ")
	if err != nil {
		return err
	}

	if err := os.MkdirAll(r.dataDir, 0755); err != nil {
		return err
	}

	// os.CreateTemp creates the file with mode 0600, matching the permissions
	// the target file has always been written with.
	tmp, err := os.CreateTemp(r.dataDir, filepath.Base(r.filePath)+".*.tmp")
	if err != nil {
		return err
	}
	tmpPath := tmp.Name()

	if _, err := tmp.Write(jsonData); err != nil {
		tmp.Close()
		os.Remove(tmpPath)
		return err
	}
	if err := tmp.Close(); err != nil {
		os.Remove(tmpPath)
		return err
	}
	if err := os.Rename(tmpPath, r.filePath); err != nil {
		os.Remove(tmpPath)
		return err
	}
	return nil
}
// loadFromDisk replaces the in-memory completed list with the contents of
// the persisted JSON file and then applies the usual retention trimming.
// A missing file, or disabled persistence (empty file path), is not an error.
// It takes no lock; the only caller in this file is the constructor.
func (r *IncidentRecorder) loadFromDisk() error {
	if r.filePath == "" {
		return nil
	}

	raw, err := os.ReadFile(r.filePath)
	switch {
	case err == nil:
		// Fall through to decoding.
	case os.IsNotExist(err):
		// Nothing has been saved yet.
		return nil
	default:
		return err
	}

	var persisted struct {
		CompletedWindows []*IncidentWindow `json:"completed_windows"`
	}
	if err = json.Unmarshal(raw, &persisted); err != nil {
		return err
	}

	r.completedWindows = persisted.CompletedWindows
	r.trimCompletedWindows()
	return nil
}
// Helper functions

// copyWindow returns a copy of w that shares no mutable state with it, so
// callers of the Get* methods can read or modify the result without touching
// the recorder's own data. It returns nil for a nil input.
//
// The end time, the data point slice, each point's Metrics and Metadata maps,
// and the summary's maps and anomaly list are all duplicated. Metadata values
// are copied shallowly, since they are arbitrary interface values. The summary
// maps and the metadata used to be shared with the original, which let a
// caller mutate recorder state through a "copy".
func copyWindow(w *IncidentWindow) *IncidentWindow {
	if w == nil {
		return nil
	}

	// cloneFloats duplicates a metric map, preserving nil-ness.
	cloneFloats := func(src map[string]float64) map[string]float64 {
		if src == nil {
			return nil
		}
		dst := make(map[string]float64, len(src))
		for k, v := range src {
			dst[k] = v
		}
		return dst
	}

	dup := *w

	if w.EndTime != nil {
		end := *w.EndTime
		dup.EndTime = &end
	}

	if w.DataPoints != nil {
		dup.DataPoints = make([]IncidentDataPoint, len(w.DataPoints))
		for i, dp := range w.DataPoints {
			dup.DataPoints[i] = dp
			dup.DataPoints[i].Metrics = cloneFloats(dp.Metrics)
			if dp.Metadata != nil {
				meta := make(map[string]interface{}, len(dp.Metadata))
				for k, v := range dp.Metadata {
					meta[k] = v
				}
				dup.DataPoints[i].Metadata = meta
			}
		}
	}

	if w.Summary != nil {
		s := *w.Summary
		s.Peaks = cloneFloats(w.Summary.Peaks)
		s.Lows = cloneFloats(w.Summary.Lows)
		s.Averages = cloneFloats(w.Summary.Averages)
		s.Changes = cloneFloats(w.Summary.Changes)
		if w.Summary.Anomalies != nil {
			s.Anomalies = make([]string, len(w.Summary.Anomalies))
			copy(s.Anomalies, w.Summary.Anomalies)
		}
		dup.Summary = &s
	}

	return &dup
}
// windowCounter is a package-wide sequence used to disambiguate window IDs
// generated within the same second. It is incremented without synchronization
// of its own; the only caller in this file invokes generateWindowID while
// holding the recorder's write lock.
var windowCounter int64

// generateWindowID builds an ID of the form
// "iw-<resourceID>-<YYYYMMDDhhmmss>-<seq>", where seq is the package counter
// modulo 1000.
func generateWindowID(resourceID string) string {
	windowCounter++
	stamp := time.Now().Format("20060102150405")
	seq := intToString(int(windowCounter % 1000))
	return "iw-" + resourceID + "-" + stamp + "-" + seq
}
// intToString returns the base-10 representation of n.
//
// It delegates to strconv.Itoa. The previous hand-rolled digit loop negated
// negative inputs, which overflows for the minimum int value and produced
// just "-", and it allocated a new string per digit.
func intToString(n int) string {
	return strconv.Itoa(n)
}
// floatToString formats f in fixed-point notation with exactly two decimal
// places, rounding to the nearest hundredth.
//
// It delegates to strconv.FormatFloat. The previous integer-splitting version
// dropped the sign of values between -1 and 0 (-0.5 rendered as "0.50"),
// truncated instead of rounding (0.29 rendered as "0.28" because of binary
// representation error), and converted through int, which is undefined for
// NaN, infinities and values outside the int range.
func floatToString(f float64) string {
	return strconv.FormatFloat(f, 'f', 2, 64)
}
// signedFloatToString formats f like floatToString but with an explicit
// leading "+" for values that compare greater than or equal to zero. Other
// values are passed through unchanged, so any sign comes from floatToString.
func signedFloatToString(f float64) string {
	prefix := ""
	if f >= 0 {
		prefix = "+"
	}
	return prefix + floatToString(f)
}
// padLeft prepends pad to s until s is at least length bytes long and
// returns the result. Strings that are already long enough are returned
// unchanged. Length is measured in bytes, not runes.
func padLeft(s string, length int, pad rune) string {
	fill := string(pad)
	for {
		if len(s) >= length {
			return s
		}
		s = fill + s
	}
}