mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-04-28 19:41:17 +00:00
539 lines
18 KiB
Go
539 lines
18 KiB
Go
package ai
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/rcourtman/pulse-go-rewrite/internal/alerts"
|
|
"github.com/rs/zerolog/log"
|
|
)
|
|
|
|
// AlertTriggeredAnalyzer handles AI analysis triggered by firing alerts
|
|
// This provides token-efficient, real-time AI insights on specific resources
|
|
type AlertTriggeredAnalyzer struct {
|
|
mu sync.RWMutex
|
|
|
|
patrolService *PatrolService
|
|
stateProvider StateProvider
|
|
enabled bool
|
|
|
|
// Cooldown to prevent analyzing the same resource repeatedly
|
|
lastAnalyzed map[string]time.Time
|
|
cooldown time.Duration
|
|
|
|
// Track pending analyses to deduplicate concurrent alerts
|
|
pending map[string]bool
|
|
|
|
// Cleanup goroutine management
|
|
cleanupTicker *time.Ticker
|
|
stopCh chan struct{}
|
|
}
|
|
|
|
// NewAlertTriggeredAnalyzer creates a new alert-triggered analyzer
|
|
func NewAlertTriggeredAnalyzer(patrolService *PatrolService, stateProvider StateProvider) *AlertTriggeredAnalyzer {
|
|
return &AlertTriggeredAnalyzer{
|
|
patrolService: patrolService,
|
|
stateProvider: stateProvider,
|
|
enabled: false,
|
|
lastAnalyzed: make(map[string]time.Time),
|
|
cooldown: 5 * time.Minute, // Don't re-analyze the same resource within 5 minutes
|
|
pending: make(map[string]bool),
|
|
stopCh: make(chan struct{}),
|
|
}
|
|
}
|
|
|
|
// Start begins the background cleanup goroutine
|
|
func (a *AlertTriggeredAnalyzer) Start() {
|
|
a.mu.Lock()
|
|
defer a.mu.Unlock()
|
|
|
|
if a.cleanupTicker != nil {
|
|
return // Already started
|
|
}
|
|
|
|
ticker := time.NewTicker(30 * time.Minute)
|
|
a.cleanupTicker = ticker
|
|
stopCh := a.stopCh
|
|
|
|
go func() {
|
|
for {
|
|
select {
|
|
case <-ticker.C:
|
|
a.CleanupOldCooldowns()
|
|
case <-stopCh:
|
|
return
|
|
}
|
|
}
|
|
}()
|
|
log.Debug().Msg("Alert-triggered analyzer cleanup goroutine started")
|
|
}
|
|
|
|
// Stop stops the background cleanup goroutine
|
|
func (a *AlertTriggeredAnalyzer) Stop() {
|
|
a.mu.Lock()
|
|
defer a.mu.Unlock()
|
|
|
|
if a.cleanupTicker != nil {
|
|
a.cleanupTicker.Stop()
|
|
a.cleanupTicker = nil
|
|
}
|
|
select {
|
|
case <-a.stopCh:
|
|
// Already closed
|
|
default:
|
|
close(a.stopCh)
|
|
}
|
|
a.stopCh = make(chan struct{}) // Reset for potential restart
|
|
log.Debug().Msg("Alert-triggered analyzer cleanup goroutine stopped")
|
|
}
|
|
|
|
// SetEnabled enables or disables alert-triggered analysis
|
|
func (a *AlertTriggeredAnalyzer) SetEnabled(enabled bool) {
|
|
a.mu.Lock()
|
|
defer a.mu.Unlock()
|
|
a.enabled = enabled
|
|
log.Info().Bool("enabled", enabled).Msg("Alert-triggered AI analysis setting updated")
|
|
}
|
|
|
|
// IsEnabled returns whether alert-triggered analysis is enabled
|
|
func (a *AlertTriggeredAnalyzer) IsEnabled() bool {
|
|
a.mu.RLock()
|
|
defer a.mu.RUnlock()
|
|
return a.enabled
|
|
}
|
|
|
|
// OnAlertFired is called when an alert fires - triggers AI analysis of the affected resource
|
|
func (a *AlertTriggeredAnalyzer) OnAlertFired(alert *alerts.Alert) {
|
|
if alert == nil {
|
|
return
|
|
}
|
|
|
|
a.mu.Lock()
|
|
if !a.enabled {
|
|
a.mu.Unlock()
|
|
return
|
|
}
|
|
|
|
// Create a resource key for deduplication
|
|
resourceKey := a.resourceKeyFromAlert(alert)
|
|
if resourceKey == "" {
|
|
a.mu.Unlock()
|
|
log.Debug().
|
|
Str("alertID", alert.ID).
|
|
Str("type", alert.Type).
|
|
Msg("Cannot determine resource key for alert, skipping AI analysis")
|
|
return
|
|
}
|
|
|
|
// Check cooldown
|
|
if lastTime, exists := a.lastAnalyzed[resourceKey]; exists {
|
|
if time.Since(lastTime) < a.cooldown {
|
|
a.mu.Unlock()
|
|
log.Debug().
|
|
Str("resourceKey", resourceKey).
|
|
Str("alertID", alert.ID).
|
|
Dur("cooldownRemaining", a.cooldown-time.Since(lastTime)).
|
|
Msg("Resource recently analyzed, skipping due to cooldown")
|
|
return
|
|
}
|
|
}
|
|
|
|
// Check for pending analysis
|
|
if a.pending[resourceKey] {
|
|
a.mu.Unlock()
|
|
log.Debug().
|
|
Str("resourceKey", resourceKey).
|
|
Str("alertID", alert.ID).
|
|
Msg("Analysis already pending for resource, skipping duplicate")
|
|
return
|
|
}
|
|
|
|
// Mark as pending
|
|
a.pending[resourceKey] = true
|
|
a.mu.Unlock()
|
|
|
|
// Run analysis in background
|
|
go a.analyzeResource(alert, resourceKey)
|
|
}
|
|
|
|
// analyzeResource performs AI analysis on the resource associated with an alert
|
|
func (a *AlertTriggeredAnalyzer) analyzeResource(alert *alerts.Alert, resourceKey string) {
|
|
defer func() {
|
|
a.mu.Lock()
|
|
delete(a.pending, resourceKey)
|
|
a.lastAnalyzed[resourceKey] = time.Now()
|
|
a.mu.Unlock()
|
|
}()
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
|
defer cancel()
|
|
|
|
log.Info().
|
|
Str("alertID", alert.ID).
|
|
Str("type", alert.Type).
|
|
Str("resource", alert.ResourceName).
|
|
Str("resourceKey", resourceKey).
|
|
Float64("value", alert.Value).
|
|
Float64("threshold", alert.Threshold).
|
|
Msg("Starting AI analysis triggered by alert")
|
|
|
|
startTime := time.Now()
|
|
|
|
// Determine what type of resource this is and analyze it
|
|
findings := a.analyzeResourceByAlert(ctx, alert)
|
|
|
|
duration := time.Since(startTime)
|
|
|
|
if len(findings) > 0 {
|
|
log.Info().
|
|
Str("alertID", alert.ID).
|
|
Str("resourceKey", resourceKey).
|
|
Int("findingsCount", len(findings)).
|
|
Dur("duration", duration).
|
|
Msg("Alert-triggered AI analysis completed with findings")
|
|
|
|
// Add findings through the patrol pipeline to keep behavior consistent
|
|
if a.patrolService != nil {
|
|
for _, finding := range findings {
|
|
// Link finding to the triggering alert
|
|
finding.AlertID = alert.ID
|
|
a.patrolService.recordFinding(finding)
|
|
}
|
|
}
|
|
} else {
|
|
log.Debug().
|
|
Str("alertID", alert.ID).
|
|
Str("resourceKey", resourceKey).
|
|
Dur("duration", duration).
|
|
Msg("Alert-triggered AI analysis completed with no additional findings")
|
|
}
|
|
|
|
if a.patrolService != nil && a.patrolService.aiService != nil {
|
|
summary := "Alert-triggered Patrol analysis completed"
|
|
if len(findings) > 0 {
|
|
summary = fmt.Sprintf("Alert-triggered Patrol analysis found %d findings", len(findings))
|
|
}
|
|
a.patrolService.aiService.RecordIncidentAnalysis(alert.ID, summary, map[string]interface{}{
|
|
"findings": len(findings),
|
|
"duration": duration.String(),
|
|
})
|
|
}
|
|
}
|
|
|
|
// analyzeResourceByAlert determines the resource type from the alert and analyzes it.
|
|
// Only docker-container-update alerts produce findings; all other alert types are
|
|
// handled by LLM-based patrol via TriggerScopedPatrol.
|
|
func (a *AlertTriggeredAnalyzer) analyzeResourceByAlert(ctx context.Context, alert *alerts.Alert) []*Finding {
|
|
if a.patrolService == nil {
|
|
return nil
|
|
}
|
|
|
|
alertType := strings.ToLower(alert.Type)
|
|
if alertType == "docker-container-update" {
|
|
return a.analyzeUpdateAlertFromAlert(ctx, alert)
|
|
}
|
|
|
|
// All other alert types are handled by LLM-based patrol via TriggerScopedPatrol
|
|
return nil
|
|
}
|
|
|
|
// analyzeUpdateAlertFromAlert provides AI-powered update risk assessment for container updates
|
|
// This is a Pro feature that helps users prioritize and schedule updates intelligently
|
|
func (a *AlertTriggeredAnalyzer) analyzeUpdateAlertFromAlert(_ context.Context, alert *alerts.Alert) []*Finding {
|
|
// Handle nil inputs gracefully
|
|
if alert == nil {
|
|
return nil
|
|
}
|
|
|
|
// Extract metadata from the update alert with safe type assertions
|
|
var containerName, imageName, hostName string
|
|
var pendingHours int
|
|
|
|
if alert.Metadata != nil {
|
|
containerName, _ = alert.Metadata["containerName"].(string)
|
|
imageName, _ = alert.Metadata["image"].(string)
|
|
hostName, _ = alert.Metadata["hostName"].(string)
|
|
// Handle both int and float64 (JSON unmarshaling can produce either)
|
|
if v, ok := alert.Metadata["pendingHours"].(int); ok {
|
|
pendingHours = v
|
|
} else if v, ok := alert.Metadata["pendingHours"].(float64); ok {
|
|
pendingHours = int(v)
|
|
}
|
|
}
|
|
|
|
// Fallback to alert fields if metadata is incomplete
|
|
if containerName == "" {
|
|
containerName = alert.ResourceName
|
|
}
|
|
if containerName == "" {
|
|
containerName = "unknown container"
|
|
}
|
|
if imageName == "" {
|
|
imageName = "unknown"
|
|
}
|
|
if hostName == "" {
|
|
hostName = alert.Node
|
|
}
|
|
|
|
// Analyze the update and generate recommendations
|
|
severity, category, urgencyReason, recommendation := a.classifyContainerUpdate(containerName, imageName, pendingHours)
|
|
|
|
title := fmt.Sprintf("Update available for %s", containerName)
|
|
if urgencyReason != "" {
|
|
title = fmt.Sprintf("Update available for %s (%s)", containerName, urgencyReason)
|
|
}
|
|
|
|
// Build evidence string, omitting empty fields
|
|
var evidenceParts []string
|
|
if imageName != "unknown" {
|
|
evidenceParts = append(evidenceParts, "Image: "+imageName)
|
|
}
|
|
if hostName != "" {
|
|
evidenceParts = append(evidenceParts, "Host: "+hostName)
|
|
}
|
|
if pendingHours > 0 {
|
|
evidenceParts = append(evidenceParts, fmt.Sprintf("Pending: %d hours", pendingHours))
|
|
}
|
|
if urgencyReason != "" {
|
|
evidenceParts = append(evidenceParts, "Type: "+urgencyReason)
|
|
}
|
|
evidence := strings.Join(evidenceParts, " | ")
|
|
|
|
description := alert.Message
|
|
if description == "" {
|
|
description = fmt.Sprintf("Image update available for container '%s'", containerName)
|
|
}
|
|
|
|
finding := &Finding{
|
|
ID: fmt.Sprintf("update-analysis-%s", alert.ResourceID),
|
|
Key: fmt.Sprintf("update-analysis:%s", alert.ResourceID),
|
|
Severity: severity,
|
|
Category: category,
|
|
ResourceID: alert.ResourceID,
|
|
ResourceName: containerName,
|
|
ResourceType: "Docker Container",
|
|
Title: title,
|
|
Description: description,
|
|
Recommendation: recommendation,
|
|
Evidence: evidence,
|
|
DetectedAt: time.Now(),
|
|
LastSeenAt: time.Now(),
|
|
AlertID: alert.ID,
|
|
}
|
|
|
|
log.Info().
|
|
Str("container", containerName).
|
|
Str("image", imageName).
|
|
Str("severity", string(severity)).
|
|
Str("urgency", urgencyReason).
|
|
Int("pendingHours", pendingHours).
|
|
Msg("AI update risk assessment generated for container")
|
|
|
|
return []*Finding{finding}
|
|
}
|
|
|
|
// classifyContainerUpdate determines the urgency and recommendations for a container update
|
|
// based on the image name and how long the update has been pending
|
|
func (a *AlertTriggeredAnalyzer) classifyContainerUpdate(containerName, imageName string, pendingHours int) (FindingSeverity, FindingCategory, string, string) {
|
|
severity := FindingSeverityWatch
|
|
category := FindingCategoryReliability
|
|
urgencyReason := ""
|
|
recommendation := ""
|
|
|
|
imageNameLower := strings.ToLower(imageName)
|
|
|
|
switch {
|
|
// Security-critical: Authentication & Identity services - update ASAP
|
|
case strings.Contains(imageNameLower, "keycloak") ||
|
|
strings.Contains(imageNameLower, "authelia") ||
|
|
strings.Contains(imageNameLower, "authentik") ||
|
|
strings.Contains(imageNameLower, "oauth") ||
|
|
strings.Contains(imageNameLower, "dex") ||
|
|
strings.Contains(imageNameLower, "vault"):
|
|
severity = FindingSeverityWarning
|
|
category = FindingCategorySecurity
|
|
urgencyReason = "auth/identity service"
|
|
recommendation = fmt.Sprintf("Container '%s' is an %s which handles authentication. "+
|
|
"Security updates are critical. Review changelog immediately and update within 24 hours if security-related. "+
|
|
"Schedule during low-auth-activity periods to minimize locked-out users.", containerName, urgencyReason)
|
|
|
|
// Security-critical: Reverse proxies and web servers - update promptly
|
|
case strings.Contains(imageNameLower, "nginx") ||
|
|
strings.Contains(imageNameLower, "apache") ||
|
|
strings.Contains(imageNameLower, "httpd") ||
|
|
strings.Contains(imageNameLower, "traefik") ||
|
|
strings.Contains(imageNameLower, "haproxy") ||
|
|
strings.Contains(imageNameLower, "caddy") ||
|
|
strings.Contains(imageNameLower, "envoy"):
|
|
severity = FindingSeverityWarning
|
|
category = FindingCategorySecurity
|
|
urgencyReason = "reverse proxy/web server"
|
|
recommendation = fmt.Sprintf("Container '%s' is a %s which is often internet-facing. "+
|
|
"Review the changelog for security fixes and consider updating within 24-48 hours. "+
|
|
"Schedule during low-traffic periods.", containerName, urgencyReason)
|
|
|
|
// Database - careful updates needed, verify backup first
|
|
case strings.Contains(imageNameLower, "postgres") ||
|
|
strings.Contains(imageNameLower, "mysql") ||
|
|
strings.Contains(imageNameLower, "mariadb") ||
|
|
strings.Contains(imageNameLower, "mongo") ||
|
|
strings.Contains(imageNameLower, "redis") ||
|
|
strings.Contains(imageNameLower, "memcached") ||
|
|
strings.Contains(imageNameLower, "cassandra") ||
|
|
strings.Contains(imageNameLower, "cockroach") ||
|
|
strings.Contains(imageNameLower, "influxdb") ||
|
|
strings.Contains(imageNameLower, "timescale") ||
|
|
strings.Contains(imageNameLower, "clickhouse"):
|
|
severity = FindingSeverityWatch
|
|
category = FindingCategoryReliability
|
|
urgencyReason = "database"
|
|
recommendation = fmt.Sprintf("Container '%s' is a %s. "+
|
|
"Before updating: 1) Verify backup is current and tested, 2) Review changelog for breaking changes, "+
|
|
"3) Check for major version incompatibilities. Schedule during maintenance window with low activity.",
|
|
containerName, urgencyReason)
|
|
|
|
// Message queues - handle with care, drain queues first
|
|
case strings.Contains(imageNameLower, "rabbitmq") ||
|
|
strings.Contains(imageNameLower, "kafka") ||
|
|
strings.Contains(imageNameLower, "nats") ||
|
|
strings.Contains(imageNameLower, "mosquitto") ||
|
|
strings.Contains(imageNameLower, "activemq") ||
|
|
strings.Contains(imageNameLower, "zeromq"):
|
|
severity = FindingSeverityWatch
|
|
category = FindingCategoryReliability
|
|
urgencyReason = "message queue"
|
|
recommendation = fmt.Sprintf("Container '%s' is a %s. "+
|
|
"Ensure all consumers are healthy and queues can be drained before updating. "+
|
|
"Consider updating during low message volume periods. Check for protocol compatibility.",
|
|
containerName, urgencyReason)
|
|
|
|
// CI/CD and automation - schedule during non-deployment windows
|
|
case strings.Contains(imageNameLower, "jenkins") ||
|
|
strings.Contains(imageNameLower, "gitlab") ||
|
|
strings.Contains(imageNameLower, "gitea") ||
|
|
strings.Contains(imageNameLower, "drone") ||
|
|
strings.Contains(imageNameLower, "argocd") ||
|
|
strings.Contains(imageNameLower, "flux"):
|
|
severity = FindingSeverityWatch
|
|
category = FindingCategoryReliability
|
|
urgencyReason = "CI/CD system"
|
|
recommendation = fmt.Sprintf("Container '%s' is a %s. "+
|
|
"Update during periods when no critical deployments are scheduled. "+
|
|
"Verify all pipelines complete successfully after update.",
|
|
containerName, urgencyReason)
|
|
|
|
// Storage/backup services - critical, but careful updates needed
|
|
case strings.Contains(imageNameLower, "minio") ||
|
|
strings.Contains(imageNameLower, "nextcloud") ||
|
|
strings.Contains(imageNameLower, "seafile") ||
|
|
strings.Contains(imageNameLower, "restic") ||
|
|
strings.Contains(imageNameLower, "duplicati"):
|
|
severity = FindingSeverityWatch
|
|
category = FindingCategoryBackup
|
|
urgencyReason = "storage/backup service"
|
|
recommendation = fmt.Sprintf("Container '%s' is a %s. "+
|
|
"Verify no active uploads/syncs and ensure recent backup exists before updating. "+
|
|
"Check release notes for data migration requirements.",
|
|
containerName, urgencyReason)
|
|
|
|
// Monitoring/observability - generally safe to batch
|
|
case strings.Contains(imageNameLower, "prometheus") ||
|
|
strings.Contains(imageNameLower, "grafana") ||
|
|
strings.Contains(imageNameLower, "loki") ||
|
|
strings.Contains(imageNameLower, "jaeger") ||
|
|
strings.Contains(imageNameLower, "alertmanager") ||
|
|
strings.Contains(imageNameLower, "victoria") ||
|
|
strings.Contains(imageNameLower, "tempo") ||
|
|
strings.Contains(imageNameLower, "mimir"):
|
|
severity = FindingSeverityInfo
|
|
category = FindingCategoryReliability
|
|
urgencyReason = "monitoring/observability"
|
|
recommendation = fmt.Sprintf("Container '%s' is a %s tool. "+
|
|
"Generally safe to update anytime. Brief monitoring gaps during restart are usually acceptable. "+
|
|
"Check for dashboard compatibility if changing major versions.",
|
|
containerName, urgencyReason)
|
|
|
|
// Home automation - update during inactive periods
|
|
case strings.Contains(imageNameLower, "homeassistant") ||
|
|
strings.Contains(imageNameLower, "home-assistant") ||
|
|
strings.Contains(imageNameLower, "nodered") ||
|
|
strings.Contains(imageNameLower, "node-red") ||
|
|
strings.Contains(imageNameLower, "mosquitto"):
|
|
severity = FindingSeverityWatch
|
|
category = FindingCategoryReliability
|
|
urgencyReason = "home automation"
|
|
recommendation = fmt.Sprintf("Container '%s' is a %s service. "+
|
|
"Update when household members are awake and can verify automations work correctly. "+
|
|
"Check for integration compatibility with major version updates.",
|
|
containerName, urgencyReason)
|
|
|
|
// Media services - low priority, update at convenience
|
|
case strings.Contains(imageNameLower, "plex") ||
|
|
strings.Contains(imageNameLower, "jellyfin") ||
|
|
strings.Contains(imageNameLower, "emby") ||
|
|
strings.Contains(imageNameLower, "sonarr") ||
|
|
strings.Contains(imageNameLower, "radarr") ||
|
|
strings.Contains(imageNameLower, "lidarr"):
|
|
severity = FindingSeverityInfo
|
|
category = FindingCategoryReliability
|
|
urgencyReason = "media service"
|
|
recommendation = fmt.Sprintf("Container '%s' is a %s. "+
|
|
"Low priority - update at your convenience when no one is watching. "+
|
|
"Feature updates are common; security updates are rare but worth checking release notes.",
|
|
containerName, urgencyReason)
|
|
|
|
// Default case for unknown containers
|
|
default:
|
|
severity = FindingSeverityWatch
|
|
category = FindingCategoryReliability
|
|
recommendation = fmt.Sprintf("Container '%s' has an image update available. "+
|
|
"Review the changelog at the image registry before updating. "+
|
|
"Consider testing in a non-production environment first.", containerName)
|
|
}
|
|
|
|
// Add time-based urgency escalation
|
|
if pendingHours > 336 { // > 14 days
|
|
severity = FindingSeverityCritical
|
|
recommendation += fmt.Sprintf(" ⚠️ OVERDUE: This update has been pending for %d days. "+
|
|
"Prioritize immediately to avoid potential security vulnerabilities.", pendingHours/24)
|
|
} else if pendingHours > 168 { // > 7 days
|
|
if severity == FindingSeverityInfo || severity == FindingSeverityWatch {
|
|
severity = FindingSeverityWarning
|
|
}
|
|
recommendation += fmt.Sprintf(" This update has been pending for %d days. "+
|
|
"Consider prioritizing to avoid security risk accumulation.", pendingHours/24)
|
|
}
|
|
|
|
return severity, category, urgencyReason, recommendation
|
|
}
|
|
|
|
// resourceKeyFromAlert creates a unique key for the resource in an alert
|
|
func (a *AlertTriggeredAnalyzer) resourceKeyFromAlert(alert *alerts.Alert) string {
|
|
if alert.ResourceID != "" {
|
|
return alert.ResourceID
|
|
}
|
|
if alert.ResourceName != "" && alert.Instance != "" {
|
|
return fmt.Sprintf("%s/%s", alert.Instance, alert.ResourceName)
|
|
}
|
|
if alert.ResourceName != "" {
|
|
return alert.ResourceName
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// CleanupOldCooldowns removes expired cooldown entries to prevent memory growth
|
|
func (a *AlertTriggeredAnalyzer) CleanupOldCooldowns() {
|
|
a.mu.Lock()
|
|
defer a.mu.Unlock()
|
|
|
|
now := time.Now()
|
|
for key, lastTime := range a.lastAnalyzed {
|
|
// Remove entries older than 1 hour
|
|
if now.Sub(lastTime) > time.Hour {
|
|
delete(a.lastAnalyzed, key)
|
|
}
|
|
}
|
|
}
|