diff --git a/README.md b/README.md index 9267d39a4..3389fcd4b 100644 --- a/README.md +++ b/README.md @@ -287,41 +287,44 @@ Configure persistent alert policies in **Settings → Alerts → Custom Rules**: **Use for:** Long-term alert policies like "all database VMs should alert at 90%" -#### Proxmox Tags (Temporary Overrides) -Apply operational overrides directly in Proxmox using VM/CT tags: +#### Proxmox Tags (Direct VM Control) +Control alerts directly on VMs/containers using Proxmox tags - perfect for both permanent and temporary needs: | Tag | Purpose | Use Case | |-----|---------|----------| -| `pulse-no-alerts` | Maintenance mode | Suppress all alerts during maintenance windows | -| `pulse-monitor-only` | Silent monitoring | Development/staging VMs you want to watch but not get paged for | -| `pulse-relaxed` | Temporary tolerance | Expected high load periods (backups, batch jobs) | +| `pulse-no-alerts` | Disable all alerts | VMs that shouldn't alert (dev, testing, or special workloads) | +| `pulse-monitor-only` | Monitor without notifications | See metrics in UI but don't get paged | +| `pulse-relaxed` | Higher thresholds (95%/98%) | Services that naturally run hot (databases, media servers) | -**How tags work:** -- Tags are operational overrides that take priority over custom rules -- Changes apply within 30-60 seconds (no restart needed) -- `pulse-relaxed` sets fixed thresholds: 95% CPU/RAM, 98% disk -- Multiple tags can be combined (e.g., relaxed + monitor-only) +**When to use tags vs custom rules:** +- **Use Tags**: When you want to control a specific VM directly ("this VM is special") +- **Use Custom Rules**: When you want patterns/policies ("all VMs named *-dev should...") -**Examples:** +**Common permanent uses:** ```bash -# Maintenance window - suppress all alerts +# TrueNAS/Samba servers with aggressive caching +pvesh set /nodes/pve/lxc/100/config -tags 'truenas,pulse-relaxed' + +# Development VMs that shouldn't page anyone +pvesh set /nodes/pve/qemu/200/config -tags 'dev,pulse-no-alerts' + +# Staging environment - monitor but don't notify +pvesh set /nodes/pve/lxc/300/config -tags 'staging,pulse-monitor-only' +``` + +**Temporary uses:** +```bash +# Maintenance window pvesh set /nodes/pve/lxc/100/config -tags 'prod,pulse-no-alerts' - -# Development VM - monitor but don't notify -pvesh set /nodes/pve/qemu/200/config -tags 'dev,pulse-monitor-only' - -# Backup server during backup window - relax thresholds -pvesh set /nodes/pve/lxc/300/config -tags 'backup-server,pulse-relaxed' - -# Remove tag after maintenance +# After maintenance, remove the pulse tag pvesh set /nodes/pve/lxc/100/config -tags 'prod' ``` -**Best practices:** -- Use Custom Rules for "what should normally happen" -- Use Tags for "temporary exceptions to normal" -- Remove tags when temporary need ends -- Document tag usage in your runbooks +**Key advantages of tags:** +- No UI navigation needed - manage directly in Proxmox +- Tags stay with the VM (survive Pulse reinstalls/migrations) +- Clear visibility in Proxmox which VMs have special alert handling +- Changes apply within 30-60 seconds ### HTTPS/TLS Configuration Enable HTTPS by setting these environment variables: diff --git a/internal/alerts/alerts.go b/internal/alerts/alerts.go index 612e517f3..86fdf7cda 100644 --- a/internal/alerts/alerts.go +++ b/internal/alerts/alerts.go @@ -433,16 +433,16 @@ func (m *Manager) CheckGuest(guest interface{}, instanceName string) { return } - // Check for Pulse-specific tags (operational overrides - highest priority) - // Tags are meant for temporary operational control, complementing (not replacing) custom rules + // Check for Pulse-specific tags (direct VM control - highest priority) + // Tags provide direct per-VM alert control without UI configuration var suppressAlerts, monitorOnly, useRelaxedThresholds bool for _, tag := range tags { switch tag { - case "pulse-no-alerts": // Maintenance mode - completely suppress all alerts + case "pulse-no-alerts": // Completely suppress all alerts for this VM suppressAlerts = true - case "pulse-monitor-only": // Operational mode - show in UI but suppress notifications + case "pulse-monitor-only": // Show in UI but suppress notifications monitorOnly = true - case "pulse-relaxed": // Temporary override - relax thresholds during expected high load + case "pulse-relaxed": // Use relaxed thresholds (95% CPU/RAM, 98% disk) useRelaxedThresholds = true } } @@ -456,7 +456,7 @@ func (m *Manager) CheckGuest(guest interface{}, instanceName string) { log.Info(). Str("alertID", alertID). Str("guest", name). - Msg("Cleared alert - guest has pulse-no-alerts tag (maintenance mode)") + Msg("Cleared alert - guest has pulse-no-alerts tag") } } m.mu.Unlock() @@ -503,7 +503,7 @@ func (m *Manager) CheckGuest(guest interface{}, instanceName string) { } log.Info(). Str("guest", name). - Msg("Applied pulse-relaxed tag override (95% CPU/RAM, 98% disk) - temporary operational override") + Msg("Applied pulse-relaxed tag (95% CPU/RAM, 98% disk)") } // Check each metric