Pulse/internal/ai/demo.go
rcourtman 27f1a11acb feat: add AI Intelligence system with investigation and forecasting
Major new AI capabilities for infrastructure monitoring:

Investigation System:
- Autonomous finding investigation with configurable autonomy levels
- Investigation orchestrator with rate limiting and guardrails
- Safety checks for read-only mode enforcement
- Chat-based investigation with approval workflows

Forecasting & Remediation:
- Trend forecasting for resource capacity planning
- Remediation engine for generating fix proposals
- Circuit breaker for AI operation protection

Unified Findings:
- Unified store bridging alerts and AI findings
- Correlation and root cause analysis
- Incident coordinator with metrics recording

New Frontend:
- AI Intelligence page with patrol controls
- Investigation drawer for finding details
- Unified findings panel with actions

Supporting Infrastructure:
- Learning store for user preference tracking
- Proxmox event ingestion and correlation
- Enhanced patrol with investigation triggers
2026-01-24 22:41:43 +00:00

471 lines
18 KiB
Go

package ai
import (
"fmt"
"os"
"strings"
"time"
"github.com/rs/zerolog/log"
)
// IsDemoMode reports whether mock/demo mode is enabled.
// It reads the PULSE_MOCK_MODE environment variable (the one used by the mock
// data system) and treats any case variation of "true" as enabled. An unset or
// empty variable means demo mode is off.
func IsDemoMode() bool {
	flag, _ := os.LookupEnv("PULSE_MOCK_MODE")
	return strings.EqualFold(flag, "true")
}
// mockResourcePatterns lists the name fragments that identify mock/demo
// resources. Entries are grouped by the kind of mock resource they describe.
var mockResourcePatterns = []string{
	// Mock PVE nodes.
	"pve1",
	"pve2",
	"pve3",
	"pve4",
	"pve5",
	"pve6",
	"pve7",

	// Generic mock prefixes.
	"mock-cluster",
	"mock-",

	// Mock host agent names.
	"Ceres",
	"Atlas",
	"Nova",
	"Orion",
	"Vega",
	"Rigel",

	// Mock Docker/K8s names.
	"docker-host-",
	"k8s-cluster-",

	// Demo prefixes.
	"demo-",
}
// IsMockResource reports whether any of resourceID, resourceName, or node
// contains one of the mockResourcePatterns as a case-sensitive substring.
// It is used to keep mock resources out of heuristic analysis when demo mode
// is off. In demo mode it always returns false, because mock resources are
// exactly what should be shown.
func IsMockResource(resourceID, resourceName, node string) bool {
	if IsDemoMode() {
		return false
	}

	// Empty identifiers are skipped and never count as a match.
	looksMock := func(candidate string) bool {
		if candidate == "" {
			return false
		}
		for i := range mockResourcePatterns {
			if strings.Contains(candidate, mockResourcePatterns[i]) {
				return true
			}
		}
		return false
	}

	return looksMock(resourceID) || looksMock(resourceName) || looksMock(node)
}
// InjectDemoFindings adds a fixed set of sample findings to the patrol
// service's findings store and then injects demo patrol run history.
// It is meant for demo instances, so AI features can be showcased without
// making real AI API calls. The call is a no-op when the receiver or its
// findings store is nil.
func (p *PatrolService) InjectDemoFindings() {
	if p == nil || p.findings == nil {
		return
	}

	log.Info().Msg("Demo mode: Injecting mock AI patrol findings")

	// All timestamps are expressed relative to a single reference instant.
	reference := time.Now()
	ago := func(d time.Duration) time.Time { return reference.Add(-d) }

	// Recommendations are markdown documents; build them one line at a time.
	markdown := func(lines ...string) string { return strings.Join(lines, "\n") }

	const source = "patrol"

	samples := []*Finding{
		// Critical: storage pool almost full.
		{
			ID:           "demo-storage-critical",
			Key:          "storage:local-zfs:capacity",
			Severity:     FindingSeverityCritical,
			Category:     FindingCategoryCapacity,
			Title:        "ZFS pool 'local-zfs' is 94% full",
			Description:  "Storage pool local-zfs on pve1 has only 47GB remaining out of 750GB. At current growth rate, pool will be full in approximately 5 days.",
			ResourceID:   "storage/pve1/local-zfs",
			ResourceName: "local-zfs",
			ResourceType: "storage",
			Node:         "pve1",
			Recommendation: markdown(
				"**Immediate actions:**",
				"1. Identify large files: `zfs list -o name,used,refer -t all | sort -k2 -h | tail -20`",
				"2. Check for orphaned VM disks: `pvesm list local-zfs | grep -v 'vm-'`",
				"3. Remove old snapshots: `zfs list -t snapshot -o name,used | sort -k2 -h`",
				"**Long-term:**",
				"- Add additional storage or migrate VMs to other pools",
				"- Enable ZFS compression if not already enabled",
			),
			DetectedAt:  ago(2 * time.Hour),
			LastSeenAt:  ago(5 * time.Minute),
			TimesRaised: 3,
			Source:      source,
		},
		// Warning: VM memory pressure.
		{
			ID:           "demo-memory-warning",
			Key:          "guest:vm-102:memory",
			Severity:     FindingSeverityWarning,
			Category:     FindingCategoryPerformance,
			Title:        "VM 'jellyfin' memory usage at 91%",
			Description:  "jellyfin (VM 102) on pve2 is consistently using 14.5GB of its 16GB allocated memory. Swapping may occur under load, degrading performance.",
			ResourceID:   "qemu/102",
			ResourceName: "jellyfin",
			ResourceType: "qemu",
			Node:         "pve2",
			Recommendation: markdown(
				"**Options to consider:**",
				"1. Increase VM memory allocation to 20GB if host has capacity",
				"2. Check for memory leaks in Jellyfin: restart the service",
				"3. Limit transcoding to reduce memory pressure",
				"4. Review Jellyfin cache settings in the dashboard",
			),
			DetectedAt:  ago(6 * time.Hour),
			LastSeenAt:  ago(10 * time.Minute),
			TimesRaised: 5,
			Source:      source,
		},
		// Warning: container backup overdue.
		{
			ID:           "demo-backup-warning",
			Key:          "guest:vm-105:backup",
			Severity:     FindingSeverityWarning,
			Category:     FindingCategoryBackup,
			Title:        "Container 'postgres' hasn't been backed up in 8 days",
			Description:  "The postgres container (CT 105) last successful backup was 8 days ago. Your backup schedule targets daily backups.",
			ResourceID:   "lxc/105",
			ResourceName: "postgres",
			ResourceType: "lxc",
			Node:         "pve1",
			Recommendation: markdown(
				"**Investigate:**",
				"1. Check PBS backup job status: `pvesh get /nodes/pve1/tasks --typefilter vzdump`",
				"2. Verify PBS datastore connectivity",
				"3. Check for backup job errors in the Proxmox datacenter backup view",
				"**Manual backup:**",
				"`vzdump 105 --storage pbs --mode snapshot`",
			),
			DetectedAt:  ago(24 * time.Hour),
			LastSeenAt:  ago(15 * time.Minute),
			TimesRaised: 2,
			Source:      source,
		},
		// Warning: sustained node CPU load.
		{
			ID:           "demo-cpu-warning",
			Key:          "node:pve3:cpu",
			Severity:     FindingSeverityWarning,
			Category:     FindingCategoryPerformance,
			Title:        "Node 'pve3' sustained high CPU (87%)",
			Description:  "Node pve3 has maintained CPU usage above 85% for the past 2 hours. This may indicate over-provisioning or a runaway process.",
			ResourceID:   "node/pve3",
			ResourceName: "pve3",
			ResourceType: "node",
			Node:         "pve3",
			Recommendation: markdown(
				"**Diagnose:**",
				"1. Check top processes: `ssh pve3 'top -bn1 | head -20'`",
				"2. Identify VM CPU usage: `pvesh get /nodes/pve3/qemu --output-format json | jq '.[] | {name, cpu}'`",
				"**Consider:**",
				"- Live-migrate a VM to another node: `qm migrate <vmid> pve1 --online`",
				"- Set CPU limits on high-usage VMs",
			),
			DetectedAt:  ago(2 * time.Hour),
			LastSeenAt:  ago(8 * time.Minute),
			TimesRaised: 4,
			Source:      source,
		},
		// Warning: Docker container restart loop.
		// NOTE(review): the key names "portainer" while every other field
		// describes uptime-kuma - confirm whether that is intentional.
		{
			ID:           "demo-docker-warning",
			Key:          "docker:portainer:container-restart",
			Severity:     FindingSeverityWarning,
			Category:     FindingCategoryReliability,
			Title:        "Docker container 'uptime-kuma' restarting frequently",
			Description:  "The uptime-kuma container on docker-host-1 has restarted 7 times in the past 24 hours. This may indicate configuration issues or resource constraints.",
			ResourceID:   "docker/docker-host-1/uptime-kuma",
			ResourceName: "uptime-kuma",
			ResourceType: "docker_container",
			Node:         "docker-host-1",
			Recommendation: markdown(
				"**Check logs:**",
				"`docker logs uptime-kuma --tail 100`",
				"**Common causes:**",
				"- OOM kills: check `docker stats uptime-kuma`",
				"- Configuration errors in environment variables",
				"- Database corruption (check data volume)",
			),
			DetectedAt:  ago(12 * time.Hour),
			LastSeenAt:  ago(20 * time.Minute),
			TimesRaised: 3,
			Source:      source,
		},
	}

	// Register every sample with the findings store, in declaration order.
	for i := range samples {
		p.findings.Add(samples[i])
	}

	// Pair the findings with a plausible patrol run history.
	p.injectDemoRunHistory()

	log.Info().Int("findings_count", len(samples)).Msg("Demo mode: Mock findings injected")
}
// injectDemoRunHistory adds 13 patrol run records to the run history store:
// one run that started 15 minutes ago, plus one run every 6 hours going back
// 72 hours. It does nothing when the run history store is nil.
//
// Record IDs are derived from Unix timestamps relative to the current time,
// so a later call produces records with different IDs. Nothing here clears
// earlier records.
func (p *PatrolService) injectDemoRunHistory() {
	if p.runHistoryStore == nil {
		return
	}

	const (
		scheduledRuns     = 12
		latestRunDuration = 75 * time.Second
	)

	now := time.Now()

	// Return a fresh slice on every call so records never share a backing array.
	steadyStateIDs := func() []string {
		return []string{"demo-storage-critical", "9e1eb083b7109506", "demo-memory-warning", "demo-backup-warning", "demo-cpu-warning"}
	}

	history := make([]PatrolRunRecord, 0, scheduledRuns+1)

	// Most recent run: started 15 minutes ago and took 75 seconds.
	latestStart := now.Add(-15 * time.Minute)
	history = append(history, PatrolRunRecord{
		ID:               fmt.Sprintf("demo-run-%d", now.Unix()),
		StartedAt:        latestStart,
		CompletedAt:      latestStart.Add(latestRunDuration),
		Duration:         latestRunDuration,
		Type:             "patrol",
		ResourcesChecked: 47,
		NodesChecked:     5,
		GuestsChecked:    32,
		DockerChecked:    8,
		StorageChecked:   6,
		ExistingFindings: 5,
		FindingsSummary:  "2 critical, 3 warnings",
		FindingIDs:       steadyStateIDs(),
		Status:           "issues_found",
		InputTokens:      4250,
		OutputTokens:     890,
	})

	// Scheduled runs: run i started i*6 hours ago.
	for i := 1; i <= scheduledRuns; i++ {
		rec := PatrolRunRecord{
			Type:             "patrol",
			ResourcesChecked: 47,
			NodesChecked:     5,
			GuestsChecked:    32,
			DockerChecked:    8,
			StorageChecked:   6,
			InputTokens:      4000 + (i * 10),
			OutputTokens:     500 + (i * 5),
		}

		rec.StartedAt = now.Add(-time.Duration(i*6) * time.Hour)
		// Vary the duration slightly from run to run.
		rec.Duration = time.Duration(40+(i%30)) * time.Second
		rec.CompletedAt = rec.StartedAt.Add(rec.Duration)
		rec.ID = fmt.Sprintf("demo-run-%d", rec.StartedAt.Unix())

		// Older runs report fewer issues, so the history reads as a
		// situation that developed over time.
		switch {
		case i <= 4: // 6-24h ago: steady state of issues
			rec.FindingsSummary = "2 critical, 3 warnings"
			rec.Status = "issues_found"
			rec.ExistingFindings = 5
			rec.FindingIDs = steadyStateIDs()
		case i == 5: // 30h ago: the critical issue first appeared
			rec.FindingsSummary = "1 new critical, 3 warnings"
			rec.Status = "issues_found"
			rec.NewFindings = 1
			rec.ExistingFindings = 3
			rec.FindingIDs = []string{"demo-storage-critical", "demo-memory-warning", "demo-backup-warning", "demo-cpu-warning"}
		case i <= 10: // 36-60h ago: warnings only
			rec.FindingsSummary = "3 warnings"
			rec.Status = "issues_found"
			rec.ExistingFindings = 3
			rec.FindingIDs = []string{"demo-memory-warning", "demo-backup-warning", "demo-cpu-warning"}
		default: // 66-72h ago: clean state
			rec.FindingsSummary = "No issues found"
			rec.Status = "healthy"
			rec.ResolvedFindings = 1
		}

		history = append(history, rec)
	}

	for i := range history {
		p.runHistoryStore.Add(history[i])
	}
}
// GenerateDemoAIResponse returns a canned AI response for demo mode, so the
// demo server can showcase the AI Assistant without a real API key.
//
// The response is chosen by case-insensitive keyword matching on prompt. The
// first matching topic wins, in this order: patrol analysis, disk/storage,
// memory, backup, CPU/performance, greeting/help, then a generic fallback.
//
// Most keywords match as substrings, so plurals and compounds such as "disks"
// or "vcpu" still match. The short keywords "hi", "ram" and "full" match only
// as whole words, because as substrings they fire on unrelated text ("this",
// "program", "successfully") and route ordinary prompts to the wrong topic.
//
// The returned ExecuteResponse always uses model "demo-model" with fixed token
// counts of 150 input and 400 output.
func GenerateDemoAIResponse(prompt string) *ExecuteResponse {
	promptLower := strings.ToLower(prompt)

	// Index the whole words of the prompt. A word is a maximal run of ASCII
	// letters and digits; every other character acts as a separator.
	tokens := strings.FieldsFunc(promptLower, func(r rune) bool {
		isLetter := r >= 'a' && r <= 'z'
		isDigit := r >= '0' && r <= '9'
		return !isLetter && !isDigit
	})
	wordSet := make(map[string]struct{}, len(tokens))
	for _, tok := range tokens {
		wordSet[tok] = struct{}{}
	}

	// hasWord matches a keyword against whole words only.
	hasWord := func(word string) bool {
		_, ok := wordSet[word]
		return ok
	}
	// mentions matches a keyword anywhere in the prompt.
	mentions := func(fragment string) bool {
		return strings.Contains(promptLower, fragment)
	}

	// Determine response based on prompt content
	var response string
	switch {
	// Detect Patrol Analysis Request. The reply uses the [FINDING] block
	// format (presumably what the patrol response parser expects).
	case mentions("analyze") && mentions("infrastructure"):
		response = `Based on the infrastructure data provided, I have identified the following issues:
[FINDING]
SEVERITY: critical
CATEGORY: capacity
KEY: storage:local-zfs:capacity
RESOURCE_ID: storage/pve1/local-zfs
RESOURCE_NAME: local-zfs
RESOURCE_TYPE: storage
NODE: pve1
TITLE: ZFS pool 'local-zfs' is 94% full
DESCRIPTION: Storage pool local-zfs on pve1 has critical capacity usage. This endangers VM stability and snapshot creation.
RECOMMENDATION: **Immediate actions:**
1. Identify large files: ` + "`zfs list -o name,used,refer -t all | sort -k2 -h`" + `
2. Check for orphaned VM disks
3. Remove old snapshots
EVIDENCE: Used: 94% (703GB/750GB)
[/FINDING]
[FINDING]
SEVERITY: warning
CATEGORY: performance
KEY: vm:vm-102:memory
RESOURCE_ID: qemu/102
RESOURCE_NAME: vm-database
RESOURCE_TYPE: vm
NODE: pve1
TITLE: High memory pressure on Database VM
DESCRIPTION: VM 'vm-database' is consistently using >90% RAM with significant swap activity.
RECOMMENDATION: Increase memory allocation to 16GB or enable ballooning.
EVIDENCE: Memory: 92% (7.4GB/8GB), Swap: 30%
[/FINDING]
[FINDING]
SEVERITY: warning
CATEGORY: security
KEY: host:pve2:ssh
RESOURCE_ID: node/pve2
RESOURCE_NAME: pve2
RESOURCE_TYPE: node
NODE: pve2
TITLE: Root SSH login enabled
DESCRIPTION: Root SSH login is enabled on node pve2, which is a security risk.
RECOMMENDATION: Disable root login in /etc/ssh/sshd_config and use key-based authentication.
EVIDENCE: PermitRootLogin yes found in config
[/FINDING]
`
	case mentions("disk") || mentions("storage") || hasWord("full"):
		response = "## Disk Usage Analysis\n\n" +
			"Based on the current metrics, I can see elevated disk usage. Here are my recommendations:\n\n" +
			"### Immediate Actions\n" +
			"1. **Check large files**: Run `du -sh /* | sort -rh | head -20` to find the largest directories\n" +
			"2. **Review logs**: Old logs often consume significant space. Check `/var/log` and consider log rotation\n" +
			"3. **Docker cleanup**: If using Docker, run `docker system prune -a` to remove unused images\n\n" +
			"### Long-term Solutions\n" +
			"- Set up automated log rotation with logrotate\n" +
			"- Configure alerts at 80% to catch issues before they become critical\n" +
			"- Consider expanding storage if usage is consistently high\n\n" +
			"Would you like me to help investigate any specific directory?"
	case mentions("memory") || hasWord("ram") || mentions("oom"):
		response = "## Memory Analysis\n\n" +
			"I can help analyze memory usage patterns. Here's what I recommend:\n\n" +
			"### Quick Diagnostics\n" +
			"1. **Current usage**: Check top consumers with `ps aux --sort=-%mem | head -15`\n" +
			"2. **Memory pressure**: Review `/proc/meminfo` for swap usage\n" +
			"3. **OOM events**: Check `dmesg | grep -i oom` for recent kills\n\n" +
			"### Optimization Tips\n" +
			"- Consider increasing VM memory allocation if the host has capacity\n" +
			"- Review application memory limits (especially for Java apps with -Xmx)\n" +
			"- Enable memory ballooning for better cluster-wide memory utilization\n\n" +
			"This is a **demo instance** - in production, I can run these commands directly on your nodes."
	case mentions("backup") || mentions("pbs"):
		response = "## Backup Status Review\n\n" +
			"Backups are critical for data protection. Here's my analysis:\n\n" +
			"### Recommended Checks\n" +
			"1. **PBS connectivity**: Verify the Proxmox Backup Server is reachable\n" +
			"2. **Job schedules**: Review backup job configurations in Datacenter → Backup\n" +
			"3. **Storage capacity**: Ensure PBS datastore has sufficient space for new backups\n\n" +
			"### Best Practices\n" +
			"- Schedule daily backups during low-usage periods\n" +
			"- Keep at least 7 daily + 4 weekly retention\n" +
			"- Test restores periodically to verify backup integrity\n\n" +
			"Would you like me to help configure backup schedules for specific VMs?"
	case mentions("cpu") || mentions("load") || mentions("slow"):
		response = "## CPU/Performance Analysis\n\n" +
			"High CPU can indicate various issues. Let me help diagnose:\n\n" +
			"### Diagnostic Steps\n" +
			"1. **Top processes**: `top -bn1 | head -20` shows current CPU consumers\n" +
			"2. **Load average**: Check if load > number of CPU cores\n" +
			"3. **Per-VM usage**: Review individual guest CPU allocation\n\n" +
			"### Common Causes\n" +
			"- Overprovisioned guests (total vCPUs > physical cores)\n" +
			"- Runaway processes within VMs\n" +
			"- Background tasks like backups or replication\n\n" +
			"### Quick Wins\n" +
			"- Consider live-migrating busy VMs to less loaded nodes\n" +
			"- Set CPU limits on non-critical guests\n" +
			"- Schedule heavy tasks during off-peak hours"
	case mentions("hello") || hasWord("hi") || mentions("help"):
		response = "## Hello! 👋\n\n" +
			"I'm the **Pulse Assistant**, here to help you manage your Proxmox infrastructure.\n\n" +
			"### What I Can Help With\n" +
			"- **Troubleshooting**: Diagnose disk, memory, CPU, and network issues\n" +
			"- **Backups**: Review backup status and configure schedules\n" +
			"- **Optimization**: Identify resource bottlenecks and optimization opportunities\n" +
			"- **Commands**: Execute maintenance commands on your nodes (with your approval)\n\n" +
			"### Try Asking\n" +
			"- \"Why is my disk filling up?\"\n" +
			"- \"Help me fix the backup failure on vm-102\"\n" +
			"- \"Check memory usage on pve1\"\n\n" +
			"*Note: This is a demo instance - command execution is disabled, but you can see how Pulse Assistant analysis works!*"
	default:
		response = "## Analysis\n\n" +
			"I can help you with that! In a production environment with Pulse Assistant configured, I would:\n\n" +
			"1. **Analyze** the current state of your infrastructure\n" +
			"2. **Identify** potential issues or optimization opportunities\n" +
			"3. **Recommend** specific actions with commands you can run\n" +
			"4. **Execute** approved commands directly on your nodes\n\n" +
			"### This Demo Shows\n" +
			"- How Pulse Assistant analysis works in Pulse\n" +
			"- The types of insights and recommendations you'll receive\n" +
			"- Command approvals with manual confirmation\n\n" +
			"To enable full Pulse Assistant capabilities in your own Pulse installation:\n" +
			"1. Go to **Settings → Pulse Assistant**\n" +
			"2. Add your API key (Anthropic, OpenAI, DeepSeek, or Ollama)\n" +
			"3. Enable Pulse Assistant features\n\n" +
			"*Visit [pulserelay.pro](https://pulserelay.pro) to get Pulse Pro!*"
	}

	return &ExecuteResponse{
		Content:      response,
		Model:        "demo-model",
		InputTokens:  150,
		OutputTokens: 400,
	}
}
// GenerateDemoAIStream behaves like GenerateDemoAIResponse but also delivers
// the content to callback in small pieces, to simulate a streaming model.
//
// callback receives one "content" event per chunk, in order, followed by a
// single "done" event. Each chunk holds up to 10 characters and is always cut
// on a UTF-8 character boundary, so a multi-byte character (the canned
// responses contain "→" and an emoji) is never split across two events. A
// short pause follows each chunk to imitate generation speed.
//
// A nil callback skips streaming entirely. The full response is returned in
// every case and the error is always nil.
func GenerateDemoAIStream(prompt string, callback StreamCallback) (*ExecuteResponse, error) {
	resp := GenerateDemoAIResponse(prompt)
	if callback == nil {
		// Nothing to stream to; avoid calling a nil function.
		return resp, nil
	}

	const (
		chunkRunes = 10
		chunkDelay = 10 * time.Millisecond
	)

	remaining := resp.Content
	for len(remaining) > 0 {
		// Find the byte offset at which the (chunkRunes+1)-th character
		// starts. Ranging over a string yields the start offset of each
		// character, so the cut never lands inside one.
		cut := len(remaining)
		seen := 0
		for offset := range remaining {
			if seen == chunkRunes {
				cut = offset
				break
			}
			seen++
		}

		callback(StreamEvent{
			Type: "content",
			Data: remaining[:cut],
		})
		remaining = remaining[cut:]

		// Tiny sleep to simulate generation speed
		time.Sleep(chunkDelay)
	}

	callback(StreamEvent{
		Type: "done",
	})
	return resp, nil
}