mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-04-28 11:30:15 +00:00
Major new AI capabilities for infrastructure monitoring:

Investigation System:
- Autonomous finding investigation with configurable autonomy levels
- Investigation orchestrator with rate limiting and guardrails
- Safety checks for read-only mode enforcement
- Chat-based investigation with approval workflows

Forecasting & Remediation:
- Trend forecasting for resource capacity planning
- Remediation engine for generating fix proposals
- Circuit breaker for AI operation protection

Unified Findings:
- Unified store bridging alerts and AI findings
- Correlation and root cause analysis
- Incident coordinator with metrics recording

New Frontend:
- AI Intelligence page with patrol controls
- Investigation drawer for finding details
- Unified findings panel with actions

Supporting Infrastructure:
- Learning store for user preference tracking
- Proxmox event ingestion and correlation
- Enhanced patrol with investigation triggers
471 lines
18 KiB
Go
471 lines
18 KiB
Go
package ai
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/rs/zerolog/log"
|
|
)
|
|
|
|
// IsDemoMode reports whether mock/demo mode is enabled.
//
// The answer comes from the PULSE_MOCK_MODE environment variable (the same
// variable the mock data system uses). The variable is read on every call and
// any capitalisation of "true" enables demo mode; every other value, including
// an unset variable, disables it.
func IsDemoMode() bool {
	mockMode := os.Getenv("PULSE_MOCK_MODE")
	return strings.EqualFold(mockMode, "true")
}
|
|
|
|
// mockResourcePatterns lists the substrings that mark a resource ID, name, or
// node as mock/demo data. Matching is done by IsMockResource using a
// case-sensitive substring test.
var mockResourcePatterns = []string{
	// Mock PVE nodes.
	"pve1",
	"pve2",
	"pve3",
	"pve4",
	"pve5",
	"pve6",
	"pve7",

	// Generic mock prefixes.
	"mock-cluster",
	"mock-",

	// Mock host agent names.
	"Ceres",
	"Atlas",
	"Nova",
	"Orion",
	"Vega",
	"Rigel",

	// Mock Docker/K8s names.
	"docker-host-",
	"k8s-cluster-",

	// Demo prefixes.
	"demo-",
}
|
|
|
|
// IsMockResource returns true if the resource name/ID appears to be mock data
|
|
// This is used to filter out mock resources from heuristic analysis when not in demo mode
|
|
func IsMockResource(resourceID, resourceName, node string) bool {
|
|
// If we're in demo mode, don't filter anything - we want mock resources
|
|
if IsDemoMode() {
|
|
return false
|
|
}
|
|
|
|
// Check against mock patterns
|
|
toCheck := []string{resourceID, resourceName, node}
|
|
for _, value := range toCheck {
|
|
if value == "" {
|
|
continue
|
|
}
|
|
for _, pattern := range mockResourcePatterns {
|
|
if strings.Contains(value, pattern) {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// InjectDemoFindings populates the patrol service with realistic mock findings
|
|
// This is used for demo instances to showcase AI features without actual AI API calls
|
|
func (p *PatrolService) InjectDemoFindings() {
|
|
if p == nil || p.findings == nil {
|
|
return
|
|
}
|
|
|
|
log.Info().Msg("Demo mode: Injecting mock AI patrol findings")
|
|
|
|
now := time.Now()
|
|
|
|
// Create realistic demo findings
|
|
demoFindings := []*Finding{
|
|
{
|
|
ID: "demo-storage-critical",
|
|
Key: "storage:local-zfs:capacity",
|
|
Severity: FindingSeverityCritical,
|
|
Category: FindingCategoryCapacity,
|
|
Title: "ZFS pool 'local-zfs' is 94% full",
|
|
Description: "Storage pool local-zfs on pve1 has only 47GB remaining out of 750GB. At current growth rate, pool will be full in approximately 5 days.",
|
|
ResourceID: "storage/pve1/local-zfs",
|
|
ResourceName: "local-zfs",
|
|
ResourceType: "storage",
|
|
Node: "pve1",
|
|
Recommendation: `**Immediate actions:**
|
|
1. Identify large files: ` + "`zfs list -o name,used,refer -t all | sort -k2 -h | tail -20`" + `
|
|
2. Check for orphaned VM disks: ` + "`pvesm list local-zfs | grep -v 'vm-'`" + `
|
|
3. Remove old snapshots: ` + "`zfs list -t snapshot -o name,used | sort -k2 -h`" + `
|
|
|
|
**Long-term:**
|
|
- Add additional storage or migrate VMs to other pools
|
|
- Enable ZFS compression if not already enabled`,
|
|
DetectedAt: now.Add(-2 * time.Hour),
|
|
LastSeenAt: now.Add(-5 * time.Minute),
|
|
TimesRaised: 3,
|
|
Source: "patrol",
|
|
},
|
|
{
|
|
ID: "demo-memory-warning",
|
|
Key: "guest:vm-102:memory",
|
|
Severity: FindingSeverityWarning,
|
|
Category: FindingCategoryPerformance,
|
|
Title: "VM 'jellyfin' memory usage at 91%",
|
|
Description: "jellyfin (VM 102) on pve2 is consistently using 14.5GB of its 16GB allocated memory. Swapping may occur under load, degrading performance.",
|
|
ResourceID: "qemu/102",
|
|
ResourceName: "jellyfin",
|
|
ResourceType: "qemu",
|
|
Node: "pve2",
|
|
Recommendation: `**Options to consider:**
|
|
1. Increase VM memory allocation to 20GB if host has capacity
|
|
2. Check for memory leaks in Jellyfin: restart the service
|
|
3. Limit transcoding to reduce memory pressure
|
|
4. Review Jellyfin cache settings in the dashboard`,
|
|
DetectedAt: now.Add(-6 * time.Hour),
|
|
LastSeenAt: now.Add(-10 * time.Minute),
|
|
TimesRaised: 5,
|
|
Source: "patrol",
|
|
},
|
|
{
|
|
ID: "demo-backup-warning",
|
|
Key: "guest:vm-105:backup",
|
|
Severity: FindingSeverityWarning,
|
|
Category: FindingCategoryBackup,
|
|
Title: "Container 'postgres' hasn't been backed up in 8 days",
|
|
Description: "The postgres container (CT 105) last successful backup was 8 days ago. Your backup schedule targets daily backups.",
|
|
ResourceID: "lxc/105",
|
|
ResourceName: "postgres",
|
|
ResourceType: "lxc",
|
|
Node: "pve1",
|
|
Recommendation: `**Investigate:**
|
|
1. Check PBS backup job status: ` + "`pvesh get /nodes/pve1/tasks --typefilter vzdump`" + `
|
|
2. Verify PBS datastore connectivity
|
|
3. Check for backup job errors in the Proxmox datacenter backup view
|
|
|
|
**Manual backup:**
|
|
` + "`vzdump 105 --storage pbs --mode snapshot`",
|
|
DetectedAt: now.Add(-24 * time.Hour),
|
|
LastSeenAt: now.Add(-15 * time.Minute),
|
|
TimesRaised: 2,
|
|
Source: "patrol",
|
|
},
|
|
{
|
|
ID: "demo-cpu-warning",
|
|
Key: "node:pve3:cpu",
|
|
Severity: FindingSeverityWarning,
|
|
Category: FindingCategoryPerformance,
|
|
Title: "Node 'pve3' sustained high CPU (87%)",
|
|
Description: "Node pve3 has maintained CPU usage above 85% for the past 2 hours. This may indicate over-provisioning or a runaway process.",
|
|
ResourceID: "node/pve3",
|
|
ResourceName: "pve3",
|
|
ResourceType: "node",
|
|
Node: "pve3",
|
|
Recommendation: `**Diagnose:**
|
|
1. Check top processes: ` + "`ssh pve3 'top -bn1 | head -20'`" + `
|
|
2. Identify VM CPU usage: ` + "`pvesh get /nodes/pve3/qemu --output-format json | jq '.[] | {name, cpu}'`" + `
|
|
|
|
**Consider:**
|
|
- Live-migrate a VM to another node: ` + "`qm migrate <vmid> pve1 --online`" + `
|
|
- Set CPU limits on high-usage VMs`,
|
|
DetectedAt: now.Add(-2 * time.Hour),
|
|
LastSeenAt: now.Add(-8 * time.Minute),
|
|
TimesRaised: 4,
|
|
Source: "patrol",
|
|
},
|
|
{
|
|
ID: "demo-docker-warning",
|
|
Key: "docker:portainer:container-restart",
|
|
Severity: FindingSeverityWarning,
|
|
Category: FindingCategoryReliability,
|
|
Title: "Docker container 'uptime-kuma' restarting frequently",
|
|
Description: "The uptime-kuma container on docker-host-1 has restarted 7 times in the past 24 hours. This may indicate configuration issues or resource constraints.",
|
|
ResourceID: "docker/docker-host-1/uptime-kuma",
|
|
ResourceName: "uptime-kuma",
|
|
ResourceType: "docker_container",
|
|
Node: "docker-host-1",
|
|
Recommendation: `**Check logs:**
|
|
` + "`docker logs uptime-kuma --tail 100`" + `
|
|
|
|
**Common causes:**
|
|
- OOM kills: check ` + "`docker stats uptime-kuma`" + `
|
|
- Configuration errors in environment variables
|
|
- Database corruption (check data volume)`,
|
|
DetectedAt: now.Add(-12 * time.Hour),
|
|
LastSeenAt: now.Add(-20 * time.Minute),
|
|
TimesRaised: 3,
|
|
Source: "patrol",
|
|
},
|
|
}
|
|
|
|
// Add findings to the store
|
|
for _, f := range demoFindings {
|
|
p.findings.Add(f)
|
|
}
|
|
|
|
// Also add some demo patrol run history
|
|
p.injectDemoRunHistory()
|
|
|
|
log.Info().Int("findings_count", len(demoFindings)).Msg("Demo mode: Mock findings injected")
|
|
}
|
|
|
|
// injectDemoRunHistory adds realistic patrol run history for the demo
|
|
func (p *PatrolService) injectDemoRunHistory() {
|
|
if p.runHistoryStore == nil {
|
|
return
|
|
}
|
|
|
|
now := time.Now()
|
|
|
|
// Clear existing history first to avoid duplicates on restart
|
|
// (Assuming we can't easily clear, we'll just generate new IDs based on time to be idempotent-ish)
|
|
|
|
// Create a realistic schedule: every 6 hours for the last 3 days
|
|
var demoRuns []PatrolRunRecord
|
|
|
|
// 1. Most recent run (just happened)
|
|
demoRuns = append(demoRuns, PatrolRunRecord{
|
|
ID: fmt.Sprintf("demo-run-%d", now.Unix()),
|
|
StartedAt: now.Add(-15 * time.Minute),
|
|
CompletedAt: now.Add(-14*time.Minute + 15*time.Second),
|
|
Duration: 75 * time.Second,
|
|
Type: "patrol",
|
|
ResourcesChecked: 47,
|
|
NodesChecked: 5,
|
|
GuestsChecked: 32,
|
|
DockerChecked: 8,
|
|
StorageChecked: 6,
|
|
NewFindings: 0,
|
|
ExistingFindings: 5,
|
|
ResolvedFindings: 0,
|
|
FindingsSummary: "2 critical, 3 warnings",
|
|
FindingIDs: []string{"demo-storage-critical", "9e1eb083b7109506", "demo-memory-warning", "demo-backup-warning", "demo-cpu-warning"},
|
|
Status: "issues_found",
|
|
InputTokens: 4250,
|
|
OutputTokens: 890,
|
|
})
|
|
|
|
// 2. Scheduled runs (every 6 hours)
|
|
for i := 1; i <= 12; i++ {
|
|
offset := time.Duration(i*6) * time.Hour
|
|
startTime := now.Add(-offset)
|
|
|
|
// Vary the duration slightly
|
|
duration := time.Duration(40+(i%30)) * time.Second
|
|
|
|
// Outcomes vary over time
|
|
var summary string
|
|
var status string
|
|
var newFindings, existingFindings, resolvedFindings int
|
|
var findingIDs []string
|
|
|
|
if i <= 4 { // Last 24h - steady state of issues
|
|
summary = "2 critical, 3 warnings"
|
|
status = "issues_found"
|
|
existingFindings = 5
|
|
findingIDs = []string{"demo-storage-critical", "9e1eb083b7109506", "demo-memory-warning", "demo-backup-warning", "demo-cpu-warning"}
|
|
} else if i == 5 { // 30h ago - one issue appeared
|
|
summary = "1 new critical, 3 warnings"
|
|
status = "issues_found"
|
|
newFindings = 1
|
|
existingFindings = 3
|
|
findingIDs = []string{"demo-storage-critical", "demo-memory-warning", "demo-backup-warning", "demo-cpu-warning"}
|
|
} else if i <= 10 { // 2-3 days ago - fewer issues
|
|
summary = "3 warnings"
|
|
status = "issues_found"
|
|
existingFindings = 3
|
|
findingIDs = []string{"demo-memory-warning", "demo-backup-warning", "demo-cpu-warning"}
|
|
} else { // > 3 days ago - clean state
|
|
summary = "No issues found"
|
|
status = "healthy"
|
|
resolvedFindings = 1
|
|
}
|
|
|
|
demoRuns = append(demoRuns, PatrolRunRecord{
|
|
ID: fmt.Sprintf("demo-run-%d", startTime.Unix()),
|
|
StartedAt: startTime,
|
|
CompletedAt: startTime.Add(duration),
|
|
Duration: duration,
|
|
Type: "patrol",
|
|
ResourcesChecked: 47,
|
|
NodesChecked: 5,
|
|
GuestsChecked: 32,
|
|
DockerChecked: 8,
|
|
StorageChecked: 6,
|
|
NewFindings: newFindings,
|
|
ExistingFindings: existingFindings,
|
|
ResolvedFindings: resolvedFindings,
|
|
FindingsSummary: summary,
|
|
FindingIDs: findingIDs,
|
|
Status: status,
|
|
InputTokens: 4000 + (i * 10),
|
|
OutputTokens: 500 + (i * 5),
|
|
})
|
|
}
|
|
|
|
for _, run := range demoRuns {
|
|
p.runHistoryStore.Add(run)
|
|
}
|
|
}
|
|
|
|
// GenerateDemoAIResponse returns a realistic mock AI response for demo mode
|
|
// This allows the demo server to showcase the AI Assistant without a real API key
|
|
func GenerateDemoAIResponse(prompt string) *ExecuteResponse {
|
|
promptLower := strings.ToLower(prompt)
|
|
|
|
// Determine response based on prompt content
|
|
var response string
|
|
|
|
switch {
|
|
// Detect Patrol Analysis Request
|
|
case strings.Contains(promptLower, "analyze") && strings.Contains(promptLower, "infrastructure"):
|
|
response = `Based on the infrastructure data provided, I have identified the following issues:
|
|
|
|
[FINDING]
|
|
SEVERITY: critical
|
|
CATEGORY: capacity
|
|
KEY: storage:local-zfs:capacity
|
|
RESOURCE_ID: storage/pve1/local-zfs
|
|
RESOURCE_NAME: local-zfs
|
|
RESOURCE_TYPE: storage
|
|
NODE: pve1
|
|
TITLE: ZFS pool 'local-zfs' is 94% full
|
|
DESCRIPTION: Storage pool local-zfs on pve1 has critical capacity usage. This endangers VM stability and snapshot creation.
|
|
RECOMMENDATION: **Immediate actions:**
|
|
1. Identify large files: ` + "`zfs list -o name,used,refer -t all | sort -k2 -h`" + `
|
|
2. Check for orphaned VM disks
|
|
3. Remove old snapshots
|
|
EVIDENCE: Used: 94% (703GB/750GB)
|
|
[/FINDING]
|
|
|
|
[FINDING]
|
|
SEVERITY: warning
|
|
CATEGORY: performance
|
|
KEY: vm:vm-102:memory
|
|
RESOURCE_ID: qemu/102
|
|
RESOURCE_NAME: vm-database
|
|
RESOURCE_TYPE: vm
|
|
NODE: pve1
|
|
TITLE: High memory pressure on Database VM
|
|
DESCRIPTION: VM 'vm-database' is consistently using >90% RAM with significant swap activity.
|
|
RECOMMENDATION: Increase memory allocation to 16GB or enable ballooning.
|
|
EVIDENCE: Memory: 92% (7.4GB/8GB), Swap: 30%
|
|
[/FINDING]
|
|
|
|
[FINDING]
|
|
SEVERITY: warning
|
|
CATEGORY: security
|
|
KEY: host:pve2:ssh
|
|
RESOURCE_ID: node/pve2
|
|
RESOURCE_NAME: pve2
|
|
RESOURCE_TYPE: node
|
|
NODE: pve2
|
|
TITLE: Root SSH login enabled
|
|
DESCRIPTION: Root SSH login is enabled on node pve2, which is a security risk.
|
|
RECOMMENDATION: Disable root login in /etc/ssh/sshd_config and use key-based authentication.
|
|
EVIDENCE: PermitRootLogin yes found in config
|
|
[/FINDING]
|
|
`
|
|
|
|
case strings.Contains(promptLower, "disk") || strings.Contains(promptLower, "storage") || strings.Contains(promptLower, "full"):
|
|
response = "## Disk Usage Analysis\n\n" +
|
|
"Based on the current metrics, I can see elevated disk usage. Here are my recommendations:\n\n" +
|
|
"### Immediate Actions\n" +
|
|
"1. **Check large files**: Run `du -sh /* | sort -rh | head -20` to find the largest directories\n" +
|
|
"2. **Review logs**: Old logs often consume significant space. Check `/var/log` and consider log rotation\n" +
|
|
"3. **Docker cleanup**: If using Docker, run `docker system prune -a` to remove unused images\n\n" +
|
|
"### Long-term Solutions\n" +
|
|
"- Set up automated log rotation with logrotate\n" +
|
|
"- Configure alerts at 80% to catch issues before they become critical\n" +
|
|
"- Consider expanding storage if usage is consistently high\n\n" +
|
|
"Would you like me to help investigate any specific directory?"
|
|
|
|
case strings.Contains(promptLower, "memory") || strings.Contains(promptLower, "ram") || strings.Contains(promptLower, "oom"):
|
|
response = "## Memory Analysis\n\n" +
|
|
"I can help analyze memory usage patterns. Here's what I recommend:\n\n" +
|
|
"### Quick Diagnostics\n" +
|
|
"1. **Current usage**: Check top consumers with `ps aux --sort=-%mem | head -15`\n" +
|
|
"2. **Memory pressure**: Review `/proc/meminfo` for swap usage\n" +
|
|
"3. **OOM events**: Check `dmesg | grep -i oom` for recent kills\n\n" +
|
|
"### Optimization Tips\n" +
|
|
"- Consider increasing VM memory allocation if the host has capacity\n" +
|
|
"- Review application memory limits (especially for Java apps with -Xmx)\n" +
|
|
"- Enable memory ballooning for better cluster-wide memory utilization\n\n" +
|
|
"This is a **demo instance** - in production, I can run these commands directly on your nodes."
|
|
|
|
case strings.Contains(promptLower, "backup") || strings.Contains(promptLower, "pbs"):
|
|
response = "## Backup Status Review\n\n" +
|
|
"Backups are critical for data protection. Here's my analysis:\n\n" +
|
|
"### Recommended Checks\n" +
|
|
"1. **PBS connectivity**: Verify the Proxmox Backup Server is reachable\n" +
|
|
"2. **Job schedules**: Review backup job configurations in Datacenter → Backup\n" +
|
|
"3. **Storage capacity**: Ensure PBS datastore has sufficient space for new backups\n\n" +
|
|
"### Best Practices\n" +
|
|
"- Schedule daily backups during low-usage periods\n" +
|
|
"- Keep at least 7 daily + 4 weekly retention\n" +
|
|
"- Test restores periodically to verify backup integrity\n\n" +
|
|
"Would you like me to help configure backup schedules for specific VMs?"
|
|
|
|
case strings.Contains(promptLower, "cpu") || strings.Contains(promptLower, "load") || strings.Contains(promptLower, "slow"):
|
|
response = "## CPU/Performance Analysis\n\n" +
|
|
"High CPU can indicate various issues. Let me help diagnose:\n\n" +
|
|
"### Diagnostic Steps\n" +
|
|
"1. **Top processes**: `top -bn1 | head -20` shows current CPU consumers\n" +
|
|
"2. **Load average**: Check if load > number of CPU cores\n" +
|
|
"3. **Per-VM usage**: Review individual guest CPU allocation\n\n" +
|
|
"### Common Causes\n" +
|
|
"- Overprovisioned guests (total vCPUs > physical cores)\n" +
|
|
"- Runaway processes within VMs\n" +
|
|
"- Background tasks like backups or replication\n\n" +
|
|
"### Quick Wins\n" +
|
|
"- Consider live-migrating busy VMs to less loaded nodes\n" +
|
|
"- Set CPU limits on non-critical guests\n" +
|
|
"- Schedule heavy tasks during off-peak hours"
|
|
|
|
case strings.Contains(promptLower, "hello") || strings.Contains(promptLower, "hi") || strings.Contains(promptLower, "help"):
|
|
response = "## Hello! 👋\n\n" +
|
|
"I'm the **Pulse Assistant**, here to help you manage your Proxmox infrastructure.\n\n" +
|
|
"### What I Can Help With\n" +
|
|
"- **Troubleshooting**: Diagnose disk, memory, CPU, and network issues\n" +
|
|
"- **Backups**: Review backup status and configure schedules\n" +
|
|
"- **Optimization**: Identify resource bottlenecks and optimization opportunities\n" +
|
|
"- **Commands**: Execute maintenance commands on your nodes (with your approval)\n\n" +
|
|
"### Try Asking\n" +
|
|
"- \"Why is my disk filling up?\"\n" +
|
|
"- \"Help me fix the backup failure on vm-102\"\n" +
|
|
"- \"Check memory usage on pve1\"\n\n" +
|
|
"*Note: This is a demo instance - command execution is disabled, but you can see how Pulse Assistant analysis works!*"
|
|
|
|
default:
|
|
response = "## Analysis\n\n" +
|
|
"I can help you with that! In a production environment with Pulse Assistant configured, I would:\n\n" +
|
|
"1. **Analyze** the current state of your infrastructure\n" +
|
|
"2. **Identify** potential issues or optimization opportunities\n" +
|
|
"3. **Recommend** specific actions with commands you can run\n" +
|
|
"4. **Execute** approved commands directly on your nodes\n\n" +
|
|
"### This Demo Shows\n" +
|
|
"- How Pulse Assistant analysis works in Pulse\n" +
|
|
"- The types of insights and recommendations you'll receive\n" +
|
|
"- Command approvals with manual confirmation\n\n" +
|
|
"To enable full Pulse Assistant capabilities in your own Pulse installation:\n" +
|
|
"1. Go to **Settings → Pulse Assistant**\n" +
|
|
"2. Add your API key (Anthropic, OpenAI, DeepSeek, or Ollama)\n" +
|
|
"3. Enable Pulse Assistant features\n\n" +
|
|
"*Visit [pulserelay.pro](https://pulserelay.pro) to get Pulse Pro!*"
|
|
}
|
|
|
|
return &ExecuteResponse{
|
|
Content: response,
|
|
Model: "demo-model",
|
|
InputTokens: 150,
|
|
OutputTokens: 400,
|
|
}
|
|
}
|
|
|
|
// GenerateDemoAIStream acts like GenerateDemoAIResponse but streams content via callback
|
|
func GenerateDemoAIStream(prompt string, callback StreamCallback) (*ExecuteResponse, error) {
|
|
resp := GenerateDemoAIResponse(prompt)
|
|
|
|
// Simulate streaming by sending chunks
|
|
chunkSize := 10
|
|
content := resp.Content
|
|
|
|
for i := 0; i < len(content); i += chunkSize {
|
|
end := i + chunkSize
|
|
if end > len(content) {
|
|
end = len(content)
|
|
}
|
|
|
|
callback(StreamEvent{
|
|
Type: "content",
|
|
Data: content[i:end],
|
|
})
|
|
|
|
// Tiny sleep to simulate generation speed
|
|
time.Sleep(10 * time.Millisecond)
|
|
}
|
|
|
|
callback(StreamEvent{
|
|
Type: "done",
|
|
})
|
|
|
|
return resp, nil
|
|
}
|