mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-05-22 03:02:35 +00:00
640 lines
17 KiB
Go
640 lines
17 KiB
Go
package alerts
|
|
|
|
import (
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/securityutil"
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/utils"
|
|
"github.com/rs/zerolog/log"
|
|
)
|
|
|
|
const (
|
|
// MaxHistoryDays is the maximum number of days to keep alert history
|
|
MaxHistoryDays = 30
|
|
// HistoryFileName is the name of the history file
|
|
HistoryFileName = "alert-history.json"
|
|
// HistoryBackupFileName is the name of the backup history file
|
|
HistoryBackupFileName = "alert-history.backup.json"
|
|
)
|
|
|
|
// HistoryEntry represents a historical alert entry
|
|
type HistoryEntry struct {
|
|
Alert Alert `json:"alert"`
|
|
Timestamp time.Time `json:"timestamp"`
|
|
}
|
|
|
|
// HistoryStats represents typed alert history statistics.
|
|
type HistoryStats struct {
|
|
TotalEntries int
|
|
OldestEntry time.Time
|
|
NewestEntry time.Time
|
|
DataDir string
|
|
FileSize int64
|
|
}
|
|
|
|
// AlertCallback is called when an alert is added to history
|
|
// This enables external systems to track alerts (e.g., pattern detection)
|
|
type AlertCallback func(alert Alert)
|
|
|
|
// HistoryManager manages persistent alert history
|
|
type HistoryManager struct {
|
|
mu sync.RWMutex
|
|
saveMu sync.Mutex // Serializes disk writes to prevent save race condition
|
|
stopOnce sync.Once
|
|
dataDir string
|
|
historyFile string
|
|
backupFile string
|
|
history []HistoryEntry
|
|
saveInterval time.Duration
|
|
stopChan chan struct{}
|
|
saveTicker *time.Ticker
|
|
callbacks []AlertCallback // Called when alerts are added
|
|
}
|
|
|
|
func historyIdentityKey(alert *Alert) string {
|
|
if alert == nil {
|
|
return ""
|
|
}
|
|
backfillCanonicalIdentity(alert)
|
|
if alert.CanonicalState != "" {
|
|
return alert.CanonicalState
|
|
}
|
|
return alert.ID
|
|
}
|
|
|
|
func resolveHistoryStoragePaths(dataDir string) (resolvedDataDir string, historyFile string, backupFile string, err error) {
|
|
resolvedDataDir, err = securityutil.NormalizeStorageDir(dataDir)
|
|
if err != nil {
|
|
return "", "", "", fmt.Errorf("resolve history data directory: %w", err)
|
|
}
|
|
|
|
historyFile, err = securityutil.JoinStorageLeaf(resolvedDataDir, HistoryFileName)
|
|
if err != nil {
|
|
return "", "", "", fmt.Errorf("resolve history file path: %w", err)
|
|
}
|
|
backupFile, err = securityutil.JoinStorageLeaf(resolvedDataDir, HistoryBackupFileName)
|
|
if err != nil {
|
|
return "", "", "", fmt.Errorf("resolve history backup file path: %w", err)
|
|
}
|
|
|
|
return resolvedDataDir, historyFile, backupFile, nil
|
|
}
|
|
|
|
// NewHistoryManager creates a new history manager
|
|
func NewHistoryManager(dataDir string) *HistoryManager {
|
|
resolvedDataDir, historyFile, backupFile, err := resolveHistoryStoragePaths(utils.ResolveDataDir(dataDir))
|
|
if err != nil {
|
|
panic(fmt.Sprintf("invalid alert history storage paths for %q: %v", dataDir, err))
|
|
}
|
|
|
|
hm := &HistoryManager{
|
|
dataDir: resolvedDataDir,
|
|
historyFile: historyFile,
|
|
backupFile: backupFile,
|
|
history: make([]HistoryEntry, 0),
|
|
saveInterval: 5 * time.Minute,
|
|
stopChan: make(chan struct{}),
|
|
}
|
|
|
|
// Ensure data directory exists
|
|
if err := os.MkdirAll(resolvedDataDir, alertsDirPerm); err != nil {
|
|
log.Error().Err(err).Str("dir", resolvedDataDir).Msg("Failed to create data directory")
|
|
} else if err := os.Chmod(resolvedDataDir, alertsDirPerm); err != nil {
|
|
log.Warn().Err(err).Str("dir", resolvedDataDir).Msg("Failed to harden history directory permissions")
|
|
}
|
|
|
|
// Load existing history
|
|
if err := hm.loadHistory(); err != nil {
|
|
log.Error().Err(err).Msg("failed to load alert history")
|
|
}
|
|
|
|
// Start periodic save routine
|
|
hm.startPeriodicSave()
|
|
|
|
// Start cleanup routine
|
|
go hm.cleanupRoutine()
|
|
|
|
return hm
|
|
}
|
|
|
|
// OnAlert registers a callback to be called when alerts are added
|
|
func (hm *HistoryManager) OnAlert(cb AlertCallback) {
|
|
hm.mu.Lock()
|
|
defer hm.mu.Unlock()
|
|
hm.callbacks = append(hm.callbacks, cb)
|
|
}
|
|
|
|
// AddAlert adds an alert to history
|
|
func (hm *HistoryManager) AddAlert(alert Alert) {
|
|
hm.mu.Lock()
|
|
|
|
entry := HistoryEntry{
|
|
Alert: *alert.Clone(),
|
|
Timestamp: time.Now(),
|
|
}
|
|
|
|
hm.history = append(hm.history, entry)
|
|
callbacks := append([]AlertCallback(nil), hm.callbacks...)
|
|
hm.mu.Unlock()
|
|
|
|
log.Debug().Str("alertID", alert.ID).Msg("added alert to history")
|
|
|
|
// Call callbacks outside the lock
|
|
for _, cb := range callbacks {
|
|
cb(alert)
|
|
}
|
|
}
|
|
|
|
// UpdateAlertLastSeen updates the LastSeen timestamp on the most recent
|
|
// history entry matching the given alert ID. This is called when an alert is
|
|
// resolved so that the stored history reflects the true duration of the alert,
|
|
// not just the snapshot captured at creation time.
|
|
func (hm *HistoryManager) UpdateAlertLastSeen(alertID string, lastSeen time.Time) {
|
|
hm.UpdateAlertLastSeenForAlert(&Alert{ID: alertID}, lastSeen)
|
|
}
|
|
|
|
// UpdateAlertLastSeenForAlert updates the LastSeen timestamp on the most recent
|
|
// history entry matching the given alert identity. Canonical alerts match by
|
|
// canonical state first, then fall back to legacy alert ID for compatibility.
|
|
func (hm *HistoryManager) UpdateAlertLastSeenForAlert(alert *Alert, lastSeen time.Time) {
|
|
if alert == nil {
|
|
return
|
|
}
|
|
|
|
matchKey := historyIdentityKey(alert)
|
|
matchID := alert.ID
|
|
|
|
hm.mu.Lock()
|
|
defer hm.mu.Unlock()
|
|
|
|
// Iterate from newest to oldest to find the most recent entry for this alert.
|
|
for i := len(hm.history) - 1; i >= 0; i-- {
|
|
entry := hm.history[i].Alert.Clone()
|
|
entryKey := historyIdentityKey(entry)
|
|
if matchKey != "" && entryKey == matchKey {
|
|
hm.history[i].Alert.LastSeen = lastSeen
|
|
return
|
|
}
|
|
if matchID != "" && hm.history[i].Alert.ID == matchID {
|
|
hm.history[i].Alert.LastSeen = lastSeen
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// MigrateActiveAlert updates the most recent history entry for an in-flight
|
|
// alert when its canonical runtime identity changes, such as a guest metric
|
|
// alert moving from one node-scoped resource key to another after a VM move.
|
|
func (hm *HistoryManager) MigrateActiveAlert(oldTrackingKey string, updated Alert) {
|
|
if oldTrackingKey == "" {
|
|
return
|
|
}
|
|
|
|
updatedAlert := updated.Clone()
|
|
if updatedAlert == nil {
|
|
return
|
|
}
|
|
backfillCanonicalIdentity(updatedAlert)
|
|
|
|
hm.mu.Lock()
|
|
defer hm.mu.Unlock()
|
|
|
|
for i := len(hm.history) - 1; i >= 0; i-- {
|
|
entry := hm.history[i].Alert.Clone()
|
|
if historyIdentityKey(entry) == oldTrackingKey || hm.history[i].Alert.ID == oldTrackingKey {
|
|
hm.history[i].Alert = *updatedAlert
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// GetHistory returns alert history within the specified time range
|
|
func (hm *HistoryManager) GetHistory(since time.Time, limit int) []Alert {
|
|
hm.mu.RLock()
|
|
defer hm.mu.RUnlock()
|
|
|
|
var results []Alert
|
|
count := 0
|
|
|
|
// Iterate from newest to oldest
|
|
for i := len(hm.history) - 1; i >= 0 && (limit <= 0 || count < limit); i-- {
|
|
entry := hm.history[i]
|
|
if entry.Timestamp.After(since) {
|
|
results = append(results, entry.Alert)
|
|
count++
|
|
}
|
|
}
|
|
|
|
return results
|
|
}
|
|
|
|
// GetAllHistory returns all alert history (up to limit)
|
|
func (hm *HistoryManager) GetAllHistory(limit int) []Alert {
|
|
hm.mu.RLock()
|
|
defer hm.mu.RUnlock()
|
|
|
|
if limit <= 0 || limit > len(hm.history) {
|
|
limit = len(hm.history)
|
|
}
|
|
|
|
results := make([]Alert, 0, limit)
|
|
start := len(hm.history) - limit
|
|
|
|
for i := len(hm.history) - 1; i >= start; i-- {
|
|
results = append(results, hm.history[i].Alert)
|
|
}
|
|
|
|
return results
|
|
}
|
|
|
|
// loadHistory loads history from disk
|
|
func (hm *HistoryManager) loadHistory() error {
|
|
// Try loading from main file first
|
|
data, err := readLimitedRegularFile(hm.historyFile, maxAlertHistoryFileSizeBytes)
|
|
if err != nil {
|
|
if !os.IsNotExist(err) {
|
|
log.Warn().Err(err).Str("file", hm.historyFile).Msg("Failed to read history file")
|
|
}
|
|
|
|
// Try backup file
|
|
data, err = readLimitedRegularFile(hm.backupFile, maxAlertHistoryFileSizeBytes)
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
// Both files don't exist - this is normal on first startup
|
|
log.Debug().Msg("No alert history files found, starting fresh")
|
|
return nil
|
|
}
|
|
if os.IsPermission(err) {
|
|
log.Warn().
|
|
Err(err).
|
|
Str("file", hm.backupFile).
|
|
Msg("Permission denied reading backup history file - check file ownership")
|
|
return nil
|
|
}
|
|
return fmt.Errorf("failed to read history backup file %q: %w", hm.backupFile, err)
|
|
}
|
|
log.Info().Msg("loaded alert history from backup file")
|
|
}
|
|
|
|
var history []HistoryEntry
|
|
if err := json.Unmarshal(data, &history); err != nil {
|
|
return fmt.Errorf("failed to unmarshal history: %w", err)
|
|
}
|
|
|
|
hm.history = history
|
|
log.Info().Int("count", len(history)).Msg("Loaded alert history")
|
|
|
|
// Clean old entries immediately
|
|
hm.cleanOldEntries()
|
|
return nil
|
|
}
|
|
|
|
// saveHistory saves history to disk with retry logic
|
|
func (hm *HistoryManager) saveHistory() error {
|
|
return hm.saveHistoryWithRetry(3)
|
|
}
|
|
|
|
// saveHistoryWithRetry saves history with exponential backoff retry
|
|
func (hm *HistoryManager) saveHistoryWithRetry(maxRetries int) error {
|
|
if maxRetries < 1 {
|
|
maxRetries = 1
|
|
}
|
|
|
|
// Serialize all disk writes to prevent concurrent saves from overwriting each other
|
|
hm.saveMu.Lock()
|
|
defer hm.saveMu.Unlock()
|
|
|
|
hm.mu.RLock()
|
|
snapshot := make([]HistoryEntry, len(hm.history))
|
|
copy(snapshot, hm.history)
|
|
hm.mu.RUnlock()
|
|
|
|
data, err := json.Marshal(snapshot)
|
|
if err != nil {
|
|
return fmt.Errorf("historyManager.saveHistoryWithRetry: marshal history snapshot: %w", err)
|
|
}
|
|
|
|
historyFile := hm.historyFile
|
|
backupFile := hm.backupFile
|
|
|
|
// Create backup of existing file once before any write attempts.
|
|
// This ensures we don't lose data if all retries fail.
|
|
backupCreated := false
|
|
if _, err := os.Stat(historyFile); err == nil {
|
|
if err := os.Remove(backupFile); err != nil && !os.IsNotExist(err) {
|
|
log.Warn().Err(err).Str("file", backupFile).Msg("Failed to remove existing backup file before save")
|
|
}
|
|
if err := os.Rename(historyFile, backupFile); err != nil {
|
|
log.Warn().
|
|
Err(err).
|
|
Str("source", historyFile).
|
|
Str("backup", backupFile).
|
|
Msg("Failed to create backup file")
|
|
} else {
|
|
backupCreated = true
|
|
}
|
|
}
|
|
|
|
var lastErr error
|
|
for attempt := 1; attempt <= maxRetries; attempt++ {
|
|
tempFile := fmt.Sprintf("%s.tmp-%d-%d", historyFile, os.Getpid(), time.Now().UnixNano())
|
|
|
|
if err := writeFileSynced(tempFile, data, 0644); err != nil {
|
|
lastErr = fmt.Errorf("failed to write temp history file: %w", err)
|
|
_ = os.Remove(tempFile)
|
|
log.Warn().
|
|
Err(lastErr).
|
|
Int("attempt", attempt).
|
|
Int("maxRetries", maxRetries).
|
|
Msg("failed to write history file, will retry")
|
|
|
|
// Exponential backoff: 100ms, 200ms, 400ms
|
|
if attempt < maxRetries {
|
|
backoff := time.Duration(100*(1<<uint(attempt-1))) * time.Millisecond
|
|
time.Sleep(backoff)
|
|
}
|
|
continue
|
|
}
|
|
|
|
if err := os.Rename(tempFile, historyFile); err != nil {
|
|
lastErr = fmt.Errorf("failed to replace history file with temp file: %w", err)
|
|
_ = os.Remove(tempFile)
|
|
log.Warn().
|
|
Err(lastErr).
|
|
Int("attempt", attempt).
|
|
Int("maxRetries", maxRetries).
|
|
Msg("Failed to write history file, will retry")
|
|
|
|
// Exponential backoff: 100ms, 200ms, 400ms
|
|
if attempt < maxRetries {
|
|
backoff := time.Duration(100*(1<<uint(attempt-1))) * time.Millisecond
|
|
time.Sleep(backoff)
|
|
}
|
|
continue
|
|
}
|
|
if err := os.Chmod(historyFile, alertsFilePerm); err != nil {
|
|
lastErr = err
|
|
log.Warn().
|
|
Err(err).
|
|
Int("attempt", attempt).
|
|
Int("maxRetries", maxRetries).
|
|
Msg("Failed to harden history file permissions, will retry")
|
|
|
|
if attempt < maxRetries {
|
|
backoff := time.Duration(100*(1<<uint(attempt-1))) * time.Millisecond
|
|
time.Sleep(backoff)
|
|
}
|
|
continue
|
|
}
|
|
|
|
if err := syncDir(filepath.Dir(historyFile)); err != nil {
|
|
log.Warn().Err(err).Str("dir", filepath.Dir(historyFile)).Msg("Failed to sync history directory")
|
|
}
|
|
|
|
// Success - remove backup file now that we've successfully written
|
|
if backupCreated {
|
|
if err := os.Remove(backupFile); err != nil && !os.IsNotExist(err) {
|
|
log.Warn().Err(err).Str("file", backupFile).Msg("failed to remove backup file after successful save")
|
|
}
|
|
}
|
|
log.Debug().Int("entries", len(snapshot)).Msg("saved alert history")
|
|
return nil
|
|
}
|
|
|
|
if lastErr == nil {
|
|
lastErr = fmt.Errorf("historyManager.saveHistoryWithRetry: no write attempts executed for %s", historyFile)
|
|
}
|
|
|
|
// All retries failed - restore backup if we have one
|
|
if backupCreated {
|
|
if restoreErr := os.Rename(backupFile, historyFile); restoreErr != nil {
|
|
restoreErr = fmt.Errorf("restore backup file %s to %s: %w", backupFile, historyFile, restoreErr)
|
|
log.Error().Err(restoreErr).Msg("failed to restore backup after all write attempts failed")
|
|
} else {
|
|
if err := syncDir(filepath.Dir(historyFile)); err != nil {
|
|
log.Warn().Err(err).Str("dir", filepath.Dir(historyFile)).Msg("Failed to sync history directory after restore")
|
|
}
|
|
log.Info().Msg("restored backup after history save failure")
|
|
}
|
|
log.Info().Msg("Restored backup after history save failure")
|
|
}
|
|
|
|
return fmt.Errorf("historyManager.saveHistoryWithRetry: failed to write history file after %d attempts: %w", maxRetries, lastErr)
|
|
}
|
|
|
|
func writeFileSynced(path string, data []byte, perm os.FileMode) error {
|
|
file, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, perm)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if _, err := file.Write(data); err != nil {
|
|
_ = file.Close()
|
|
return err
|
|
}
|
|
|
|
if err := file.Sync(); err != nil {
|
|
_ = file.Close()
|
|
return err
|
|
}
|
|
|
|
return file.Close()
|
|
}
|
|
|
|
func syncDir(dirPath string) error {
|
|
dir, err := os.Open(dirPath)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer dir.Close()
|
|
|
|
return dir.Sync()
|
|
}
|
|
|
|
// startPeriodicSave starts the periodic save routine
|
|
func (hm *HistoryManager) startPeriodicSave() {
|
|
hm.saveTicker = time.NewTicker(hm.saveInterval)
|
|
|
|
go func() {
|
|
for {
|
|
select {
|
|
case <-hm.saveTicker.C:
|
|
if err := hm.saveHistory(); err != nil {
|
|
log.Error().Err(err).Msg("failed to save alert history")
|
|
}
|
|
case <-hm.stopChan:
|
|
return
|
|
}
|
|
}
|
|
}()
|
|
}
|
|
|
|
// cleanupRoutine runs periodically to clean old entries
|
|
func (hm *HistoryManager) cleanupRoutine() {
|
|
// Run cleanup daily
|
|
ticker := time.NewTicker(24 * time.Hour)
|
|
defer ticker.Stop()
|
|
|
|
// Also run cleanup on startup after a delay
|
|
startupDelay := time.NewTimer(1 * time.Minute)
|
|
defer func() {
|
|
if !startupDelay.Stop() {
|
|
select {
|
|
case <-startupDelay.C:
|
|
default:
|
|
}
|
|
}
|
|
}()
|
|
|
|
select {
|
|
case <-startupDelay.C:
|
|
hm.cleanOldEntries()
|
|
case <-hm.stopChan:
|
|
return
|
|
}
|
|
|
|
for {
|
|
select {
|
|
case <-ticker.C:
|
|
hm.cleanOldEntries()
|
|
case <-hm.stopChan:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// cleanOldEntries removes entries older than MaxHistoryDays
|
|
func (hm *HistoryManager) cleanOldEntries() {
|
|
hm.mu.Lock()
|
|
defer hm.mu.Unlock()
|
|
|
|
cutoff := time.Now().AddDate(0, 0, -MaxHistoryDays)
|
|
newHistory := make([]HistoryEntry, 0, len(hm.history))
|
|
|
|
removed := 0
|
|
for _, entry := range hm.history {
|
|
if entry.Timestamp.After(cutoff) {
|
|
newHistory = append(newHistory, entry)
|
|
} else {
|
|
removed++
|
|
}
|
|
}
|
|
|
|
if removed > 0 {
|
|
hm.history = newHistory
|
|
log.Info().
|
|
Int("removed", removed).
|
|
Int("remaining", len(newHistory)).
|
|
Msg("cleaned old alert history entries")
|
|
}
|
|
}
|
|
|
|
// RemoveAlert removes a specific alert from history by ID
|
|
func (hm *HistoryManager) RemoveAlert(alertID string) {
|
|
hm.mu.Lock()
|
|
defer hm.mu.Unlock()
|
|
|
|
newHistory := make([]HistoryEntry, 0, len(hm.history))
|
|
removed := false
|
|
|
|
for _, entry := range hm.history {
|
|
if entry.Alert.ID != alertID {
|
|
newHistory = append(newHistory, entry)
|
|
} else {
|
|
removed = true
|
|
}
|
|
}
|
|
|
|
if removed {
|
|
hm.history = newHistory
|
|
log.Debug().Str("alertID", alertID).Msg("removed alert from history")
|
|
}
|
|
}
|
|
|
|
// ClearAllHistory clears all alert history
|
|
func (hm *HistoryManager) ClearAllHistory() error {
|
|
hm.mu.Lock()
|
|
defer hm.mu.Unlock()
|
|
|
|
// Clear the in-memory history
|
|
hm.history = make([]HistoryEntry, 0)
|
|
|
|
// Remove the history files
|
|
var removeErrs []error
|
|
if err := os.Remove(hm.historyFile); err != nil && !os.IsNotExist(err) {
|
|
removeErrs = append(removeErrs, fmt.Errorf("remove history file %s: %w", hm.historyFile, err))
|
|
}
|
|
if err := os.Remove(hm.backupFile); err != nil && !os.IsNotExist(err) {
|
|
removeErrs = append(removeErrs, fmt.Errorf("remove backup file %s: %w", hm.backupFile, err))
|
|
}
|
|
if len(removeErrs) > 0 {
|
|
return fmt.Errorf("clear alert history files: %w", errors.Join(removeErrs...))
|
|
}
|
|
|
|
log.Info().Msg("cleared all alert history")
|
|
return nil
|
|
}
|
|
|
|
// Stop stops the history manager
|
|
func (hm *HistoryManager) Stop() {
|
|
hm.stopOnce.Do(func() {
|
|
close(hm.stopChan)
|
|
if hm.saveTicker != nil {
|
|
hm.saveTicker.Stop()
|
|
}
|
|
|
|
// Save one final time
|
|
if err := hm.saveHistory(); err != nil {
|
|
log.Error().Err(err).Msg("Failed to save alert history on shutdown")
|
|
}
|
|
})
|
|
}
|
|
|
|
// GetStats returns statistics about the alert history
|
|
func (hm *HistoryManager) GetStats() map[string]any {
|
|
stats := hm.Stats()
|
|
return map[string]any{
|
|
"totalEntries": stats.TotalEntries,
|
|
"oldestEntry": stats.OldestEntry,
|
|
"newestEntry": stats.NewestEntry,
|
|
"dataDir": stats.DataDir,
|
|
"fileSize": stats.FileSize,
|
|
}
|
|
}
|
|
|
|
// Stats returns typed statistics about the alert history.
|
|
func (hm *HistoryManager) Stats() HistoryStats {
|
|
hm.mu.RLock()
|
|
defer hm.mu.RUnlock()
|
|
|
|
oldest := time.Now()
|
|
newest := time.Time{}
|
|
|
|
if len(hm.history) > 0 {
|
|
oldest = hm.history[0].Timestamp
|
|
newest = hm.history[len(hm.history)-1].Timestamp
|
|
}
|
|
|
|
return HistoryStats{
|
|
TotalEntries: len(hm.history),
|
|
OldestEntry: oldest,
|
|
NewestEntry: newest,
|
|
DataDir: hm.dataDir,
|
|
FileSize: hm.getFileSize(),
|
|
}
|
|
}
|
|
|
|
// getFileSize returns the size of the history file
|
|
func (hm *HistoryManager) getFileSize() int64 {
|
|
info, err := os.Stat(hm.historyFile)
|
|
if err != nil {
|
|
return 0
|
|
}
|
|
return info.Size()
|
|
}
|