mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-04-28 11:30:15 +00:00
Update history entry LastSeen on alert resolution so the stored duration reflects how long the alert was actually active, not the snapshot captured at creation time. This fixes the "0m" duration display for all resolved metric-based alerts. Fix reevaluateActiveAlertsLocked to use HostDefaults for host agent alerts and PBSDefaults for PBS alerts instead of falling through to GuestDefaults and NodeDefaults respectively, which could incorrectly resolve or retain alerts on config save when thresholds differ.
430 lines
11 KiB
Go
430 lines
11 KiB
Go
package alerts
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/utils"
|
|
"github.com/rs/zerolog/log"
|
|
)
|
|
|
|
// Retention policy and on-disk file names used by the alert history store.
const (
	// MaxHistoryDays is the number of days an entry is retained; entries
	// whose timestamp is older than this are dropped by cleanOldEntries.
	MaxHistoryDays = 30

	// HistoryFileName is the base name of the primary history file inside
	// the data directory.
	HistoryFileName = "alert-history.json"

	// HistoryBackupFileName is the base name of the backup history file
	// inside the data directory.
	HistoryBackupFileName = "alert-history.backup.json"
)
|
|
|
|
// HistoryEntry represents a historical alert entry
|
|
type HistoryEntry struct {
|
|
Alert Alert `json:"alert"`
|
|
Timestamp time.Time `json:"timestamp"`
|
|
}
|
|
|
|
// AlertCallback is called when an alert is added to history
|
|
// This enables external systems to track alerts (e.g., pattern detection)
|
|
type AlertCallback func(alert Alert)
|
|
|
|
// HistoryManager manages persistent alert history
|
|
type HistoryManager struct {
|
|
mu sync.RWMutex
|
|
saveMu sync.Mutex // Serializes disk writes to prevent save race condition
|
|
dataDir string
|
|
historyFile string
|
|
backupFile string
|
|
history []HistoryEntry
|
|
saveInterval time.Duration
|
|
stopChan chan struct{}
|
|
saveTicker *time.Ticker
|
|
callbacks []AlertCallback // Called when alerts are added
|
|
}
|
|
|
|
// NewHistoryManager creates a new history manager
|
|
func NewHistoryManager(dataDir string) *HistoryManager {
|
|
if dataDir == "" {
|
|
dataDir = utils.GetDataDir()
|
|
}
|
|
|
|
hm := &HistoryManager{
|
|
dataDir: dataDir,
|
|
historyFile: filepath.Join(dataDir, HistoryFileName),
|
|
backupFile: filepath.Join(dataDir, HistoryBackupFileName),
|
|
history: make([]HistoryEntry, 0),
|
|
saveInterval: 5 * time.Minute,
|
|
stopChan: make(chan struct{}),
|
|
}
|
|
|
|
// Ensure data directory exists
|
|
if err := os.MkdirAll(dataDir, 0755); err != nil {
|
|
log.Error().Err(err).Str("dir", dataDir).Msg("Failed to create data directory")
|
|
}
|
|
|
|
// Load existing history
|
|
if err := hm.loadHistory(); err != nil {
|
|
log.Error().Err(err).Msg("Failed to load alert history")
|
|
}
|
|
|
|
// Start periodic save routine
|
|
hm.startPeriodicSave()
|
|
|
|
// Start cleanup routine
|
|
go hm.cleanupRoutine()
|
|
|
|
return hm
|
|
}
|
|
|
|
// OnAlert registers a callback to be called when alerts are added
|
|
func (hm *HistoryManager) OnAlert(cb AlertCallback) {
|
|
hm.mu.Lock()
|
|
defer hm.mu.Unlock()
|
|
hm.callbacks = append(hm.callbacks, cb)
|
|
}
|
|
|
|
// AddAlert adds an alert to history
|
|
func (hm *HistoryManager) AddAlert(alert Alert) {
|
|
hm.mu.Lock()
|
|
|
|
entry := HistoryEntry{
|
|
Alert: *alert.Clone(),
|
|
Timestamp: time.Now(),
|
|
}
|
|
|
|
hm.history = append(hm.history, entry)
|
|
callbacks := hm.callbacks
|
|
hm.mu.Unlock()
|
|
|
|
log.Debug().Str("alertID", alert.ID).Msg("Added alert to history")
|
|
|
|
// Call callbacks outside the lock
|
|
for _, cb := range callbacks {
|
|
cb(alert)
|
|
}
|
|
}
|
|
|
|
// UpdateAlertLastSeen updates the LastSeen timestamp on the most recent
|
|
// history entry matching the given alert ID. This is called when an alert is
|
|
// resolved so that the stored history reflects the true duration of the alert,
|
|
// not just the snapshot captured at creation time.
|
|
func (hm *HistoryManager) UpdateAlertLastSeen(alertID string, lastSeen time.Time) {
|
|
hm.mu.Lock()
|
|
defer hm.mu.Unlock()
|
|
|
|
// Iterate from newest to oldest to find the most recent entry for this alert
|
|
for i := len(hm.history) - 1; i >= 0; i-- {
|
|
if hm.history[i].Alert.ID == alertID {
|
|
hm.history[i].Alert.LastSeen = lastSeen
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// GetHistory returns alert history within the specified time range
|
|
func (hm *HistoryManager) GetHistory(since time.Time, limit int) []Alert {
|
|
hm.mu.RLock()
|
|
defer hm.mu.RUnlock()
|
|
|
|
var results []Alert
|
|
count := 0
|
|
|
|
// Iterate from newest to oldest
|
|
for i := len(hm.history) - 1; i >= 0 && (limit <= 0 || count < limit); i-- {
|
|
entry := hm.history[i]
|
|
if entry.Timestamp.After(since) {
|
|
results = append(results, entry.Alert)
|
|
count++
|
|
}
|
|
}
|
|
|
|
return results
|
|
}
|
|
|
|
// GetAllHistory returns all alert history (up to limit)
|
|
func (hm *HistoryManager) GetAllHistory(limit int) []Alert {
|
|
hm.mu.RLock()
|
|
defer hm.mu.RUnlock()
|
|
|
|
if limit <= 0 || limit > len(hm.history) {
|
|
limit = len(hm.history)
|
|
}
|
|
|
|
results := make([]Alert, 0, limit)
|
|
start := len(hm.history) - limit
|
|
|
|
for i := len(hm.history) - 1; i >= start; i-- {
|
|
results = append(results, hm.history[i].Alert)
|
|
}
|
|
|
|
return results
|
|
}
|
|
|
|
// loadHistory loads history from disk
|
|
func (hm *HistoryManager) loadHistory() error {
|
|
// Try loading from main file first
|
|
data, err := os.ReadFile(hm.historyFile)
|
|
if err != nil {
|
|
if !os.IsNotExist(err) {
|
|
log.Warn().Err(err).Str("file", hm.historyFile).Msg("Failed to read history file")
|
|
}
|
|
|
|
// Try backup file
|
|
data, err = os.ReadFile(hm.backupFile)
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
// Both files don't exist - this is normal on first startup
|
|
log.Debug().Msg("No alert history files found, starting fresh")
|
|
return nil
|
|
}
|
|
// Check if it's a permission error
|
|
if os.IsPermission(err) {
|
|
log.Warn().Err(err).Str("file", hm.backupFile).Msg("Permission denied reading backup history file - check file ownership")
|
|
return nil // Continue without history rather than failing
|
|
}
|
|
return fmt.Errorf("failed to load backup history: %w", err)
|
|
}
|
|
log.Info().Msg("Loaded alert history from backup file")
|
|
}
|
|
|
|
var history []HistoryEntry
|
|
if err := json.Unmarshal(data, &history); err != nil {
|
|
return fmt.Errorf("failed to unmarshal history: %w", err)
|
|
}
|
|
|
|
hm.history = history
|
|
log.Info().Int("count", len(history)).Msg("Loaded alert history")
|
|
|
|
// Clean old entries immediately
|
|
hm.cleanOldEntries()
|
|
|
|
return nil
|
|
}
|
|
|
|
// saveHistory saves history to disk with retry logic
|
|
func (hm *HistoryManager) saveHistory() error {
|
|
return hm.saveHistoryWithRetry(3)
|
|
}
|
|
|
|
// saveHistoryWithRetry saves history with exponential backoff retry
|
|
func (hm *HistoryManager) saveHistoryWithRetry(maxRetries int) error {
|
|
// Serialize all disk writes to prevent concurrent saves from overwriting each other
|
|
hm.saveMu.Lock()
|
|
defer hm.saveMu.Unlock()
|
|
|
|
hm.mu.RLock()
|
|
snapshot := make([]HistoryEntry, len(hm.history))
|
|
copy(snapshot, hm.history)
|
|
hm.mu.RUnlock()
|
|
|
|
data, err := json.Marshal(snapshot)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to marshal history: %w", err)
|
|
}
|
|
|
|
historyFile := hm.historyFile
|
|
backupFile := hm.backupFile
|
|
|
|
// Create backup of existing file once before any write attempts.
|
|
// This ensures we don't lose data if all retries fail.
|
|
backupCreated := false
|
|
if _, err := os.Stat(historyFile); err == nil {
|
|
if err := os.Rename(historyFile, backupFile); err != nil {
|
|
log.Warn().Err(err).Msg("Failed to create backup file")
|
|
} else {
|
|
backupCreated = true
|
|
}
|
|
}
|
|
|
|
var lastErr error
|
|
for attempt := 1; attempt <= maxRetries; attempt++ {
|
|
// Write new file
|
|
if err := os.WriteFile(historyFile, data, 0644); err != nil {
|
|
lastErr = err
|
|
log.Warn().
|
|
Err(err).
|
|
Int("attempt", attempt).
|
|
Int("maxRetries", maxRetries).
|
|
Msg("Failed to write history file, will retry")
|
|
|
|
// Exponential backoff: 100ms, 200ms, 400ms
|
|
if attempt < maxRetries {
|
|
backoff := time.Duration(100*(1<<uint(attempt-1))) * time.Millisecond
|
|
time.Sleep(backoff)
|
|
}
|
|
continue
|
|
}
|
|
|
|
// Success - remove backup file now that we've successfully written
|
|
if backupCreated {
|
|
_ = os.Remove(backupFile)
|
|
}
|
|
log.Debug().Int("entries", len(snapshot)).Msg("Saved alert history")
|
|
return nil
|
|
}
|
|
|
|
// All retries failed - restore backup if we have one
|
|
if backupCreated {
|
|
if restoreErr := os.Rename(backupFile, historyFile); restoreErr != nil {
|
|
log.Error().Err(restoreErr).Msg("Failed to restore backup after all write attempts failed")
|
|
} else {
|
|
log.Info().Msg("Restored backup after history save failure")
|
|
}
|
|
}
|
|
|
|
return fmt.Errorf("failed to write history file after %d attempts: %w", maxRetries, lastErr)
|
|
}
|
|
|
|
// startPeriodicSave starts the periodic save routine
|
|
func (hm *HistoryManager) startPeriodicSave() {
|
|
hm.saveTicker = time.NewTicker(hm.saveInterval)
|
|
|
|
go func() {
|
|
for {
|
|
select {
|
|
case <-hm.saveTicker.C:
|
|
if err := hm.saveHistory(); err != nil {
|
|
log.Error().Err(err).Msg("Failed to save alert history")
|
|
}
|
|
case <-hm.stopChan:
|
|
return
|
|
}
|
|
}
|
|
}()
|
|
}
|
|
|
|
// cleanupRoutine runs periodically to clean old entries
|
|
func (hm *HistoryManager) cleanupRoutine() {
|
|
// Run cleanup daily
|
|
ticker := time.NewTicker(24 * time.Hour)
|
|
defer ticker.Stop()
|
|
|
|
// Also run cleanup on startup after a delay
|
|
// Also run cleanup on startup after a delay
|
|
select {
|
|
case <-time.After(1 * time.Minute):
|
|
hm.cleanOldEntries()
|
|
case <-hm.stopChan:
|
|
return
|
|
}
|
|
|
|
for {
|
|
select {
|
|
case <-ticker.C:
|
|
hm.cleanOldEntries()
|
|
case <-hm.stopChan:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// cleanOldEntries removes entries older than MaxHistoryDays
|
|
func (hm *HistoryManager) cleanOldEntries() {
|
|
hm.mu.Lock()
|
|
defer hm.mu.Unlock()
|
|
|
|
cutoff := time.Now().AddDate(0, 0, -MaxHistoryDays)
|
|
newHistory := make([]HistoryEntry, 0, len(hm.history))
|
|
|
|
removed := 0
|
|
for _, entry := range hm.history {
|
|
if entry.Timestamp.After(cutoff) {
|
|
newHistory = append(newHistory, entry)
|
|
} else {
|
|
removed++
|
|
}
|
|
}
|
|
|
|
if removed > 0 {
|
|
hm.history = newHistory
|
|
log.Info().
|
|
Int("removed", removed).
|
|
Int("remaining", len(newHistory)).
|
|
Msg("Cleaned old alert history entries")
|
|
}
|
|
}
|
|
|
|
// RemoveAlert removes a specific alert from history by ID
|
|
func (hm *HistoryManager) RemoveAlert(alertID string) {
|
|
hm.mu.Lock()
|
|
defer hm.mu.Unlock()
|
|
|
|
newHistory := make([]HistoryEntry, 0, len(hm.history))
|
|
removed := false
|
|
|
|
for _, entry := range hm.history {
|
|
if entry.Alert.ID != alertID {
|
|
newHistory = append(newHistory, entry)
|
|
} else {
|
|
removed = true
|
|
}
|
|
}
|
|
|
|
if removed {
|
|
hm.history = newHistory
|
|
log.Debug().Str("alertID", alertID).Msg("Removed alert from history")
|
|
}
|
|
}
|
|
|
|
// ClearAllHistory clears all alert history
|
|
func (hm *HistoryManager) ClearAllHistory() error {
|
|
hm.mu.Lock()
|
|
defer hm.mu.Unlock()
|
|
|
|
// Clear the in-memory history
|
|
hm.history = make([]HistoryEntry, 0)
|
|
|
|
// Remove the history files
|
|
_ = os.Remove(hm.historyFile)
|
|
_ = os.Remove(hm.backupFile)
|
|
|
|
log.Info().Msg("Cleared all alert history")
|
|
return nil
|
|
}
|
|
|
|
// Stop stops the history manager
|
|
func (hm *HistoryManager) Stop() {
|
|
close(hm.stopChan)
|
|
if hm.saveTicker != nil {
|
|
hm.saveTicker.Stop()
|
|
}
|
|
|
|
// Save one final time
|
|
if err := hm.saveHistory(); err != nil {
|
|
log.Error().Err(err).Msg("Failed to save alert history on shutdown")
|
|
}
|
|
}
|
|
|
|
// GetStats returns statistics about the alert history
|
|
func (hm *HistoryManager) GetStats() map[string]interface{} {
|
|
hm.mu.RLock()
|
|
defer hm.mu.RUnlock()
|
|
|
|
oldest := time.Now()
|
|
newest := time.Time{}
|
|
|
|
if len(hm.history) > 0 {
|
|
oldest = hm.history[0].Timestamp
|
|
newest = hm.history[len(hm.history)-1].Timestamp
|
|
}
|
|
|
|
return map[string]interface{}{
|
|
"totalEntries": len(hm.history),
|
|
"oldestEntry": oldest,
|
|
"newestEntry": newest,
|
|
"dataDir": hm.dataDir,
|
|
"fileSize": hm.getFileSize(),
|
|
}
|
|
}
|
|
|
|
// getFileSize returns the size of the history file
|
|
func (hm *HistoryManager) getFileSize() int64 {
|
|
info, err := os.Stat(hm.historyFile)
|
|
if err != nil {
|
|
return 0
|
|
}
|
|
return info.Size()
|
|
}
|