// Package patterns provides failure pattern detection for predictive intelligence.
// It analyzes historical data to identify recurring issues and predict future failures.
package patterns

import (
	"encoding/json"
	"fmt"
	"math"
	"os"
	"path/filepath"
	"sort"
	"sync"
	"time"

	"github.com/rs/zerolog/log"
)

// EventType represents the type of event being tracked.
type EventType string

const (
	EventHighMemory   EventType = "high_memory"   // Memory exceeded threshold
	EventHighCPU      EventType = "high_cpu"      // CPU exceeded threshold
	EventDiskFull     EventType = "disk_full"     // Disk space critical
	EventOOM          EventType = "oom"           // Out of memory kill
	EventRestart      EventType = "restart"       // Resource restarted
	EventUnresponsive EventType = "unresponsive"  // Resource became unresponsive
	EventBackupFailed EventType = "backup_failed" // Backup job failed
)

// HistoricalEvent represents a recorded event.
type HistoricalEvent struct {
	ID          string        `json:"id"`
	ResourceID  string        `json:"resource_id"`
	EventType   EventType     `json:"event_type"`
	Timestamp   time.Time     `json:"timestamp"`
	Description string        `json:"description,omitempty"`
	Resolved    bool          `json:"resolved"`
	ResolvedAt  time.Time     `json:"resolved_at,omitempty"`
	Duration    time.Duration `json:"duration,omitempty"` // How long it lasted
}

// Pattern represents a detected recurring pattern.
type Pattern struct {
	ResourceID      string        `json:"resource_id"`
	EventType       EventType     `json:"event_type"`
	Occurrences     int           `json:"occurrences"`      // Number of times the event occurred
	AverageInterval time.Duration `json:"average_interval"` // Average time between occurrences
	StdDevInterval  time.Duration `json:"stddev_interval"`  // Standard deviation of the intervals
	LastOccurrence  time.Time     `json:"last_occurrence"`
	NextPredicted   time.Time     `json:"next_predicted"`             // When we expect it to happen again
	Confidence      float64       `json:"confidence"`                 // 0-1, based on consistency
	AverageDuration time.Duration `json:"average_duration,omitempty"` // How long events typically last
}

// FailurePrediction represents a predicted future failure.
type FailurePrediction struct {
	ResourceID  string    `json:"resource_id"`
	EventType   EventType `json:"event_type"`
	PredictedAt time.Time `json:"predicted_at"`
	DaysUntil   float64   `json:"days_until"`
	Confidence  float64   `json:"confidence"`
	Basis       string    `json:"basis"` // Human-readable explanation
	Pattern     *Pattern  `json:"pattern,omitempty"`
}

// Detector tracks historical events and detects patterns.
type Detector struct {
	mu       sync.RWMutex
	events   []HistoricalEvent
	patterns map[string]*Pattern // resourceID:eventType -> pattern

	// Configuration
	maxEvents       int
	minOccurrences  int           // Minimum occurrences to form a pattern
	patternWindow   time.Duration // How far back to look for patterns
	predictionLimit time.Duration // How far ahead to predict

	// Persistence
	dataDir string
}

// DetectorConfig configures the pattern detector.
type DetectorConfig struct {
	MaxEvents       int
	MinOccurrences  int           // Default: 3
	PatternWindow   time.Duration // Default: 90 days
	PredictionLimit time.Duration // Default: 30 days
	DataDir         string
}

// DefaultConfig returns the default detector configuration.
func DefaultConfig() DetectorConfig {
	return DetectorConfig{
		MaxEvents:       5000,
		MinOccurrences:  3,
		PatternWindow:   90 * 24 * time.Hour,
		PredictionLimit: 30 * 24 * time.Hour,
	}
}

// NewDetector creates a new pattern detector.
func NewDetector(cfg DetectorConfig) *Detector {
	if cfg.MaxEvents <= 0 {
		cfg.MaxEvents = 5000
	}
	if cfg.MinOccurrences <= 0 {
		cfg.MinOccurrences = 3
	}
	if cfg.PatternWindow <= 0 {
		cfg.PatternWindow = 90 * 24 * time.Hour
	}
	if cfg.PredictionLimit <= 0 {
		cfg.PredictionLimit = 30 * 24 * time.Hour
	}

	d := &Detector{
		events:          make([]HistoricalEvent, 0),
		patterns:        make(map[string]*Pattern),
		maxEvents:       cfg.MaxEvents,
		minOccurrences:  cfg.MinOccurrences,
		patternWindow:   cfg.PatternWindow,
		predictionLimit: cfg.PredictionLimit,
		dataDir:         cfg.DataDir,
	}

	// Load existing data.
	if cfg.DataDir != "" {
		if err := d.loadFromDisk(); err != nil {
			log.Warn().Err(err).Msg("Failed to load pattern history from disk")
		} else if len(d.events) > 0 {
			log.Info().Int("events", len(d.events)).Int("patterns", len(d.patterns)).
				Msg("Loaded pattern history from disk")
		}
	}

	return d
}

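// Construction and recording, as a hedged usage sketch (the data directory,
// resource ID, and alert type below are illustrative values, not anything
// this package mandates):
//
//	d := NewDetector(DetectorConfig{DataDir: "/var/lib/pulse"}) // other fields take defaults
//	d.RecordFromAlert("vm-101", "memory_critical", time.Now())
//	for _, p := range d.GetPredictions() {
//		fmt.Println(p.Basis)
//	}
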
// RecordEvent records a new event for pattern analysis.
func (d *Detector) RecordEvent(event HistoricalEvent) {
	d.mu.Lock()
	defer d.mu.Unlock()

	if event.ID == "" {
		event.ID = generateEventID()
	}
	if event.Timestamp.IsZero() {
		event.Timestamp = time.Now()
	}

	d.events = append(d.events, event)
	d.trimEvents()

	// Recompute the pattern for this resource/event type.
	key := patternKey(event.ResourceID, event.EventType)
	pattern := d.computePattern(event.ResourceID, event.EventType)
	if pattern == nil {
		delete(d.patterns, key)
	} else {
		d.patterns[key] = pattern
	}

	// Persist asynchronously.
	go func() {
		if err := d.saveToDisk(); err != nil {
			log.Warn().Err(err).Msg("Failed to save pattern history")
		}
	}()
}

// RecordFromAlert records an event from an alert.
func (d *Detector) RecordFromAlert(resourceID string, alertType string, timestamp time.Time) {
	eventType := mapAlertToEventType(alertType)
	if eventType == "" {
		return // Not a trackable event type
	}

	d.RecordEvent(HistoricalEvent{
		ResourceID:  resourceID,
		EventType:   eventType,
		Timestamp:   timestamp,
		Description: alertType,
	})
}

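// For instance, an alert of type "memory_critical" is recorded as an
// EventHighMemory event (see mapAlertToEventType below), while an alert type
// the switch does not recognize is silently dropped.
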
// GetPredictions returns failure predictions for all tracked resources.
func (d *Detector) GetPredictions() []FailurePrediction {
	d.mu.RLock()
	defer d.mu.RUnlock()

	var predictions []FailurePrediction
	now := time.Now()

	for _, pattern := range d.patterns {
		if pattern == nil {
			continue
		}
		// Only predict if the pattern has sufficient confidence.
		if pattern.Confidence < 0.3 || pattern.Occurrences < d.minOccurrences {
			continue
		}

		// Check that the prediction falls within our limit.
		if pattern.NextPredicted.Before(now) || pattern.NextPredicted.After(now.Add(d.predictionLimit)) {
			continue
		}

		daysUntil := pattern.NextPredicted.Sub(now).Hours() / 24

		predictions = append(predictions, FailurePrediction{
			ResourceID:  pattern.ResourceID,
			EventType:   pattern.EventType,
			PredictedAt: pattern.NextPredicted,
			DaysUntil:   daysUntil,
			Confidence:  pattern.Confidence,
			Basis:       formatPatternBasis(pattern),
			Pattern:     pattern,
		})
	}

	// Sort by days until (soonest first).
	sort.Slice(predictions, func(i, j int) bool {
		return predictions[i].DaysUntil < predictions[j].DaysUntil
	})

	return predictions
}

// GetPredictionsForResource returns failure predictions for a specific resource.
func (d *Detector) GetPredictionsForResource(resourceID string) []FailurePrediction {
	all := d.GetPredictions()
	var result []FailurePrediction
	for _, p := range all {
		if p.ResourceID == resourceID {
			result = append(result, p)
		}
	}
	return result
}

// GetPatterns returns all detected patterns.
func (d *Detector) GetPatterns() map[string]*Pattern {
	d.mu.RLock()
	defer d.mu.RUnlock()

	result := make(map[string]*Pattern)
	for k, v := range d.patterns {
		if v == nil {
			continue
		}
		result[k] = v
	}
	return result
}

// computePattern analyzes events to find patterns for a resource/event type.
func (d *Detector) computePattern(resourceID string, eventType EventType) *Pattern {
	cutoff := time.Now().Add(-d.patternWindow)

	// Get all events for this resource/type within the window.
	var events []HistoricalEvent
	for _, e := range d.events {
		if e.ResourceID == resourceID && e.EventType == eventType && e.Timestamp.After(cutoff) {
			events = append(events, e)
		}
	}

	if len(events) < d.minOccurrences {
		return nil
	}

	// Sort by timestamp.
	sort.Slice(events, func(i, j int) bool {
		return events[i].Timestamp.Before(events[j].Timestamp)
	})

	// Calculate intervals between consecutive events.
	var intervals []time.Duration
	var durations []time.Duration

	for i := 1; i < len(events); i++ {
		interval := events[i].Timestamp.Sub(events[i-1].Timestamp)
		intervals = append(intervals, interval)

		if events[i-1].Duration > 0 {
			durations = append(durations, events[i-1].Duration)
		}
	}

	if len(intervals) == 0 {
		return nil
	}

	// Calculate average and stddev of the intervals.
	avgInterval := averageDuration(intervals)
	stddevInterval := stddevDuration(intervals, avgInterval)

	// Calculate confidence based on consistency:
	// if stddev is low relative to the mean, the pattern is more reliable.
	consistency := 1.0
	if avgInterval > 0 {
		cv := float64(stddevInterval) / float64(avgInterval) // Coefficient of variation
		consistency = 1.0 - math.Min(cv, 1.0)                // Higher consistency = lower CV
	}

	// Adjust confidence based on the number of occurrences.
	occurrenceBonus := math.Min(float64(len(events))/10.0, 0.3)
	confidence := consistency*0.7 + occurrenceBonus

	// Predict the next occurrence.
	lastEvent := events[len(events)-1]
	nextPredicted := lastEvent.Timestamp.Add(avgInterval)

	// Calculate the average duration if available.
	var avgDuration time.Duration
	if len(durations) > 0 {
		avgDuration = averageDuration(durations)
	}

	return &Pattern{
		ResourceID:      resourceID,
		EventType:       eventType,
		Occurrences:     len(events),
		AverageInterval: avgInterval,
		StdDevInterval:  stddevInterval,
		LastOccurrence:  lastEvent.Timestamp,
		NextPredicted:   nextPredicted,
		Confidence:      confidence,
		AverageDuration: avgDuration,
	}
}

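// Worked example of the confidence score above (illustrative numbers): four
// events spaced 7d, 7d, and 8d apart give an average interval of ~7.33d and a
// sample standard deviation of ~0.58d, so cv ≈ 0.08 and consistency ≈ 0.92;
// the occurrence bonus is min(4/10, 0.3) = 0.3, for a confidence of roughly
// 0.92*0.7 + 0.3 ≈ 0.94.
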
// trimEvents drops events older than the pattern window and caps the slice
// at maxEvents, keeping the most recent entries.
func (d *Detector) trimEvents() {
	cutoff := time.Now().Add(-d.patternWindow)
	kept := d.events[:0]
	for _, e := range d.events {
		if e.Timestamp.After(cutoff) {
			kept = append(kept, e)
		}
	}
	d.events = kept

	if len(d.events) > d.maxEvents {
		d.events = d.events[len(d.events)-d.maxEvents:]
	}
}

// saveToDisk persists events and patterns.
func (d *Detector) saveToDisk() error {
	if d.dataDir == "" {
		return nil
	}

	// Marshal while holding the read lock: trimEvents compacts the events
	// slice in place, so snapshotting the slice header and unlocking before
	// marshaling would race with concurrent writers.
	d.mu.RLock()
	data := struct {
		Events   []HistoricalEvent   `json:"events"`
		Patterns map[string]*Pattern `json:"patterns"`
	}{
		Events:   d.events,
		Patterns: d.patterns,
	}
	jsonData, err := json.MarshalIndent(data, "", "  ")
	d.mu.RUnlock()
	if err != nil {
		return err
	}

	path := filepath.Join(d.dataDir, "ai_patterns.json")
	tmpPath := path + ".tmp"
	// Write to a temp file and rename so readers never see a partial file.
	if err := os.WriteFile(tmpPath, jsonData, 0600); err != nil {
		return err
	}

	return os.Rename(tmpPath, path)
}

// loadFromDisk loads events and patterns.
func (d *Detector) loadFromDisk() error {
	if d.dataDir == "" {
		return nil
	}

	path := filepath.Join(d.dataDir, "ai_patterns.json")
	if st, err := os.Stat(path); err == nil {
		const maxOnDiskBytes = 10 << 20 // 10 MiB safety cap
		if st.Size() > maxOnDiskBytes {
			return fmt.Errorf("pattern history file too large (%d bytes)", st.Size())
		}
	}
	jsonData, err := os.ReadFile(path)
	if err != nil {
		if os.IsNotExist(err) {
			return nil
		}
		return err
	}

	var data struct {
		Events   []HistoricalEvent   `json:"events"`
		Patterns map[string]*Pattern `json:"patterns"`
	}

	if err := json.Unmarshal(jsonData, &data); err != nil {
		return err
	}

	d.events = data.Events
	d.patterns = make(map[string]*Pattern, len(data.Patterns))
	for k, v := range data.Patterns {
		d.patterns[k] = v
	}

	d.trimEvents()
	cutoff := time.Now().Add(-d.patternWindow)
	for k, v := range d.patterns {
		if v == nil {
			delete(d.patterns, k)
			continue
		}
		if v.Occurrences < d.minOccurrences || v.LastOccurrence.Before(cutoff) {
			delete(d.patterns, k)
		}
	}

	return nil
}

// FormatForContext formats predictions for AI consumption.
func (d *Detector) FormatForContext(resourceID string) string {
	var predictions []FailurePrediction
	if resourceID != "" {
		predictions = d.GetPredictionsForResource(resourceID)
	} else {
		predictions = d.GetPredictions()
	}

	if len(predictions) == 0 {
		return ""
	}

	result := "\n## ⏰ Failure Predictions\n"
	result += "Based on historical patterns:\n"

	for _, p := range predictions {
		if len(result) > 2000 { // Limit context size
			result += "\n... and more\n"
			break
		}
		result += "- " + p.Basis + "\n"
	}

	return result
}

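// Example of the rendered block (values invented for illustration):
//
//	## ⏰ Failure Predictions
//	Based on historical patterns:
//	- high memory usage typically occurs every ~7 days (next expected in ~2 days)
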
// Helper functions

var eventCounter int64

// generateEventID builds a timestamp-based ID. It relies on the caller
// holding d.mu, which serializes access to eventCounter.
func generateEventID() string {
	eventCounter++
	return time.Now().Format("20060102150405") + "-" + intToStr(int(eventCounter%1000))
}

// intToStr converts a non-negative integer to its decimal string form
// (a dependency-free stand-in for strconv.Itoa).
func intToStr(n int) string {
	if n == 0 {
		return "0"
	}
	var result string
	for n > 0 {
		result = string(rune('0'+n%10)) + result
		n /= 10
	}
	return result
}

// patternKey builds the patterns map key for a resource/event-type pair.
func patternKey(resourceID string, eventType EventType) string {
	return resourceID + ":" + string(eventType)
}

// mapAlertToEventType translates an alert type string into a trackable
// EventType, returning "" for alert types that are not tracked.
func mapAlertToEventType(alertType string) EventType {
	switch alertType {
	case "memory_warning", "memory_critical":
		return EventHighMemory
	case "cpu_warning", "cpu_critical":
		return EventHighCPU
	case "disk_warning", "disk_critical":
		return EventDiskFull
	case "oom", "out_of_memory":
		return EventOOM
	case "restart", "restarted":
		return EventRestart
	case "unresponsive", "unreachable":
		return EventUnresponsive
	case "backup_failed":
		return EventBackupFailed
	default:
		return ""
	}
}

// averageDuration returns the arithmetic mean of durations (0 for an empty slice).
func averageDuration(durations []time.Duration) time.Duration {
	if len(durations) == 0 {
		return 0
	}
	var sum int64
	for _, d := range durations {
		sum += int64(d)
	}
	return time.Duration(sum / int64(len(durations)))
}

// stddevDuration returns the sample standard deviation of durations around
// mean (0 when there are fewer than two samples).
func stddevDuration(durations []time.Duration, mean time.Duration) time.Duration {
	if len(durations) < 2 {
		return 0
	}
	var sumSquares float64
	for _, d := range durations {
		diff := float64(d - mean)
		sumSquares += diff * diff
	}
	variance := sumSquares / float64(len(durations)-1)
	return time.Duration(math.Sqrt(variance))
}

// formatPatternBasis renders a human-readable explanation of a pattern.
func formatPatternBasis(p *Pattern) string {
	daysInterval := p.AverageInterval.Hours() / 24
	daysSinceLast := time.Since(p.LastOccurrence).Hours() / 24
	daysUntilNext := time.Until(p.NextPredicted).Hours() / 24

	eventName := string(p.EventType)
	switch p.EventType {
	case EventHighMemory:
		eventName = "high memory usage"
	case EventHighCPU:
		eventName = "high CPU usage"
	case EventDiskFull:
		eventName = "disk space critical"
	case EventOOM:
		eventName = "OOM events"
	case EventRestart:
		eventName = "restarts"
	case EventUnresponsive:
		eventName = "unresponsive periods"
	case EventBackupFailed:
		eventName = "backup failures"
	}

	if daysUntilNext < 0 {
		return eventName + " typically occurs every ~" + formatDays(daysInterval) +
			" (last: " + formatDays(daysSinceLast) + " ago, overdue)"
	}

	return eventName + " typically occurs every ~" + formatDays(daysInterval) +
		" (next expected in ~" + formatDays(daysUntilNext) + ")"
}

// formatDays renders a fractional day count as a rough human-readable string.
func formatDays(days float64) string {
	if days < 1 {
		hours := days * 24
		if hours < 1 {
			return "less than an hour"
		}
		return intToStr(int(hours)) + " hours"
	}
	if days < 2 {
		return "1 day"
	}
	return intToStr(int(days)) + " days"
}