Pulse/internal/api/diagnostics.go
rcourtman cff4226531 Pass stored fingerprint into PVE diagnostic test client
The /api/diagnostics handler builds its own test client per PVE node
to run a live connectivity probe. The PBS branch already passed
node.Fingerprint into the test client config, but the PVE branch did
not. With VerifySSL=true and a self-signed Proxmox cert (the standard
configuration), tlsutil.CreateHTTPClientWithTimeout falls into
default-secure mode and validates against the system CA chain, which
fails the handshake even when the actual poller — which DOES pass
the fingerprint — is connecting fine.

The result was that /api/diagnostics reported delly + pi as
"Failed to connect to Proxmox API" while /api/resources was happily
ingesting all 27 workloads from the same hosts. Mirror the PBS
branch by passing node.Fingerprint into the PVE testCfg so the
diagnostic probe uses the same TLS verification path as the runtime
poller.

Add a regression test that spins up an httptest TLS server, captures
its leaf cert SHA-256, configures a PVE instance with VerifySSL=true
and that fingerprint, and asserts computeDiagnostics reports
Connected=true. The pre-fix code fails this with a "tls: bad
certificate" handshake error.
2026-05-08 20:58:32 +01:00

1853 lines
58 KiB
Go

package api
import (
"context"
"encoding/json"
"errors"
"fmt"
"net/http"
"os"
"os/user"
"runtime"
"sort"
"strconv"
"strings"
"sync"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/rcourtman/pulse-go-rewrite/internal/config"
"github.com/rcourtman/pulse-go-rewrite/internal/monitoring"
"github.com/rcourtman/pulse-go-rewrite/internal/unifiedresources"
"github.com/rcourtman/pulse-go-rewrite/internal/updates"
"github.com/rcourtman/pulse-go-rewrite/pkg/pbs"
"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
"github.com/rs/zerolog/log"
"golang.org/x/crypto/ssh"
)
// DiagnosticsInfo contains comprehensive diagnostic information.
// It is the value JSON-encoded by writeDiagnosticsResponse. Optional sections
// are pointers tagged omitempty, so a nil section is left out of the output.
type DiagnosticsInfo struct {
// Version and Runtime come from updates.GetCurrentVersion; computeDiagnostics
// falls back to "unknown" and "go" when that lookup fails.
Version string `json:"version"`
Runtime string `json:"runtime"`
// Uptime is the number of seconds elapsed since the monitor's start time.
Uptime float64 `json:"uptime"`
// Nodes holds one entry per configured PVE instance.
Nodes []NodeDiagnostic `json:"nodes"`
// PBS holds one entry per configured PBS instance.
PBS []PBSDiagnostic `json:"pbs"`
// System describes the Go runtime and host as seen by this process.
System SystemDiagnostic `json:"system"`
MetricsStore *MetricsStoreDiagnostic `json:"metricsStore,omitempty"`
Discovery *DiscoveryDiagnostic `json:"discovery,omitempty"`
APITokens *APITokenDiagnostic `json:"apiTokens,omitempty"`
DockerAgents *DockerAgentDiagnostic `json:"dockerAgents,omitempty"`
Alerts *AlertsDiagnostic `json:"alerts,omitempty"`
AIChat *AIChatDiagnostic `json:"aiChat,omitempty"`
Errors []string `json:"errors"`
// NodeSnapshots captures the raw memory payload and derived usage Pulse last observed per node.
NodeSnapshots []monitoring.NodeMemorySnapshot `json:"nodeSnapshots"`
// GuestSnapshots captures recent per-guest memory breakdowns (VM/LXC) with the raw Proxmox fields.
GuestSnapshots []monitoring.GuestMemorySnapshot `json:"guestSnapshots"`
// MemorySources summarizes how many nodes currently rely on each memory source per instance.
MemorySources []MemorySourceStat `json:"memorySources"`
// MemorySourceBreakdown captures node and guest source quality, trust, and fallback reasons.
MemorySourceBreakdown []MemorySourceBreakdown `json:"memorySourceBreakdown"`
}
// EmptyDiagnosticsInfo returns a zero-valued DiagnosticsInfo after passing it
// through NormalizeCollections, so its slice fields are empty rather than nil.
func EmptyDiagnosticsInfo() DiagnosticsInfo {
	var empty DiagnosticsInfo
	return empty.NormalizeCollections()
}
// NormalizeCollections returns a copy of d in which every nil slice field is
// replaced by an empty slice and every nested section is normalized as well,
// so list fields encode as [] instead of null.
//
// Nodes and MemorySourceBreakdown are rebuilt into fresh slices instead of
// being rewritten in place: d is a value copy, but its slices still share
// their backing arrays with the caller (for example the cached entry that
// handleDiagnostics hands to concurrent requests), so in-place writes would
// mutate data the caller still owns. Pointer sections are replaced by
// pointers to normalized copies; the original pointees are not reassigned.
func (d DiagnosticsInfo) NormalizeCollections() DiagnosticsInfo {
	if d.Nodes == nil {
		d.Nodes = []NodeDiagnostic{}
	}
	if d.PBS == nil {
		d.PBS = []PBSDiagnostic{}
	}
	if d.Errors == nil {
		d.Errors = []string{}
	}
	if d.NodeSnapshots == nil {
		d.NodeSnapshots = []monitoring.NodeMemorySnapshot{}
	}
	if d.GuestSnapshots == nil {
		d.GuestSnapshots = []monitoring.GuestMemorySnapshot{}
	}
	if d.MemorySources == nil {
		d.MemorySources = []MemorySourceStat{}
	}
	if d.MemorySourceBreakdown == nil {
		d.MemorySourceBreakdown = []MemorySourceBreakdown{}
	}

	// Build new element slices so the caller's backing arrays stay untouched.
	if len(d.Nodes) > 0 {
		nodes := make([]NodeDiagnostic, len(d.Nodes))
		for i := range d.Nodes {
			nodes[i] = d.Nodes[i].NormalizeCollections()
		}
		d.Nodes = nodes
	}
	if len(d.MemorySourceBreakdown) > 0 {
		breakdown := make([]MemorySourceBreakdown, len(d.MemorySourceBreakdown))
		for i := range d.MemorySourceBreakdown {
			breakdown[i] = d.MemorySourceBreakdown[i].NormalizeCollections()
		}
		d.MemorySourceBreakdown = breakdown
	}

	if d.MetricsStore != nil {
		normalized := d.MetricsStore.NormalizeCollections()
		d.MetricsStore = &normalized
	}
	if d.Discovery != nil {
		normalized := d.Discovery.NormalizeCollections()
		d.Discovery = &normalized
	}
	if d.APITokens != nil {
		normalized := d.APITokens.NormalizeCollections()
		d.APITokens = &normalized
	}
	if d.DockerAgents != nil {
		normalized := d.DockerAgents.NormalizeCollections()
		d.DockerAgents = &normalized
	}
	if d.Alerts != nil {
		normalized := d.Alerts.NormalizeCollections()
		d.Alerts = &normalized
	}
	if d.AIChat != nil {
		normalized := d.AIChat.NormalizeCollections()
		d.AIChat = &normalized
	}
	return d
}
// DiscoveryDiagnostic summarizes discovery configuration and recent activity.
type DiscoveryDiagnostic struct {
	Enabled             bool                   `json:"enabled"`
	ConfiguredSubnet    string                 `json:"configuredSubnet,omitempty"`
	ActiveSubnet        string                 `json:"activeSubnet,omitempty"`
	EnvironmentOverride string                 `json:"environmentOverride,omitempty"`
	SubnetAllowlist     []string               `json:"subnetAllowlist"`
	SubnetBlocklist     []string               `json:"subnetBlocklist"`
	Scanning            bool                   `json:"scanning"`
	ScanInterval        string                 `json:"scanInterval,omitempty"`
	LastScanStartedAt   string                 `json:"lastScanStartedAt,omitempty"`
	LastResultTimestamp string                 `json:"lastResultTimestamp,omitempty"`
	LastResultServers   int                    `json:"lastResultServers,omitempty"`
	LastResultErrors    int                    `json:"lastResultErrors,omitempty"`
	History             []DiscoveryHistoryItem `json:"history"`
}

// NormalizeCollections returns a copy of d whose nil slice fields
// (SubnetAllowlist, SubnetBlocklist, History) are replaced by empty slices.
func (d DiscoveryDiagnostic) NormalizeCollections() DiscoveryDiagnostic {
	// d is a local copy, so taking the address of its fields only affects the
	// value that is returned.
	for _, list := range []*[]string{&d.SubnetAllowlist, &d.SubnetBlocklist} {
		if *list == nil {
			*list = []string{}
		}
	}
	if d.History == nil {
		d.History = []DiscoveryHistoryItem{}
	}
	return d
}

// DiscoveryHistoryItem summarizes the outcome of a recent discovery scan.
type DiscoveryHistoryItem struct {
	StartedAt       string `json:"startedAt"`
	CompletedAt     string `json:"completedAt"`
	Duration        string `json:"duration"`
	DurationMs      int64  `json:"durationMs"`
	Subnet          string `json:"subnet"`
	ServerCount     int    `json:"serverCount"`
	ErrorCount      int    `json:"errorCount"`
	BlocklistLength int    `json:"blocklistLength"`
	Status          string `json:"status"`
}
// MemorySourceStat aggregates memory-source usage per instance.
// buildMemorySourceDiagnostics emits one value per (Instance, Source) pair
// found in the node snapshots.
type MemorySourceStat struct {
Instance string `json:"instance"`
Source string `json:"source"`
// NodeCount is the number of node snapshots that reported this source.
NodeCount int `json:"nodeCount"`
// LastUpdated is the newest snapshot retrieval time, formatted as RFC 3339 in UTC.
LastUpdated string `json:"lastUpdated"`
Fallback bool `json:"fallback"`
Trust string `json:"trust,omitempty"`
}
// MemorySourceBreakdown captures diagnostics for node and guest memory source selection.
type MemorySourceBreakdown struct {
	Instance        string   `json:"instance"`
	Scope           string   `json:"scope"`
	Source          string   `json:"source"`
	Count           int      `json:"count"`
	LastUpdated     string   `json:"lastUpdated"`
	Fallback        bool     `json:"fallback"`
	Trust           string   `json:"trust"`
	FallbackReasons []string `json:"fallbackReasons"`
}

// NormalizeCollections returns a copy of b with a nil FallbackReasons slice
// replaced by an empty one.
func (b MemorySourceBreakdown) NormalizeCollections() MemorySourceBreakdown {
	if b.FallbackReasons != nil {
		return b
	}
	b.FallbackReasons = []string{}
	return b
}
// MetricsStoreDiagnostic summarizes metrics store health and data availability.
type MetricsStoreDiagnostic struct {
	Enabled     bool     `json:"enabled"`
	Status      string   `json:"status"`
	DBSize      int64    `json:"dbSize,omitempty"`
	RawCount    int64    `json:"rawCount,omitempty"`
	MinuteCount int64    `json:"minuteCount,omitempty"`
	HourlyCount int64    `json:"hourlyCount,omitempty"`
	DailyCount  int64    `json:"dailyCount,omitempty"`
	TotalPoints int64    `json:"totalPoints,omitempty"`
	BufferSize  int      `json:"bufferSize,omitempty"`
	Notes       []string `json:"notes"`
	Error       string   `json:"error,omitempty"`
}

// NormalizeCollections returns a copy of d with a nil Notes slice replaced by
// an empty one.
func (d MetricsStoreDiagnostic) NormalizeCollections() MetricsStoreDiagnostic {
	if d.Notes != nil {
		return d
	}
	d.Notes = []string{}
	return d
}
// isFallbackMemorySource reports the result of monitoring.MemorySourceIsFallback
// for source.
func isFallbackMemorySource(source string) bool {
	fallback := monitoring.MemorySourceIsFallback(source)
	return fallback
}
// classifyMemorySourceTrust returns the trust label that
// monitoring.MemorySourceTrust assigns to source.
func classifyMemorySourceTrust(source string) string {
	trust := monitoring.MemorySourceTrust(source)
	return trust
}
// buildMemorySourceDiagnostics aggregates the diagnostic memory snapshots into
// two sorted summaries:
//
//   - per (instance, source) node counts, built from snapshots.Nodes only;
//   - per (instance, scope, source) counts with fallback reasons, built from
//     both snapshots.Nodes (scope "node") and snapshots.Guests (scope "guest").
//
// Sources are canonicalized through monitoring.CanonicalMemorySource and an
// empty canonical name is reported as "unknown". Both returned slices are
// non-nil even when there is nothing to report.
func buildMemorySourceDiagnostics(snapshots monitoring.DiagnosticSnapshotSet) ([]MemorySourceStat, []MemorySourceBreakdown) {
	type nodeBucket struct {
		stat   MemorySourceStat
		newest time.Time
	}
	type breakdownBucket struct {
		stat    MemorySourceBreakdown
		newest  time.Time
		reasons map[string]struct{}
	}

	// canonicalSource maps a raw source label to its canonical name, using
	// "unknown" when canonicalization yields nothing.
	canonicalSource := func(raw string) string {
		if name := monitoring.CanonicalMemorySource(raw); name != "" {
			return name
		}
		return "unknown"
	}

	perNodeSource := make(map[string]*nodeBucket)
	perScopeSource := make(map[string]*breakdownBucket)

	record := func(instance, scope, source string, retrievedAt time.Time, fallbackReason string) {
		source = canonicalSource(source)

		// Prefer the snapshot's own reason; otherwise fall back to the
		// reason associated with the source itself.
		reason := strings.TrimSpace(fallbackReason)
		if reason == "" {
			reason = strings.TrimSpace(monitoring.MemorySourceFallbackReason(source))
		}

		key := fmt.Sprintf("%s|%s|%s", instance, scope, source)
		bucket, seen := perScopeSource[key]
		if !seen {
			trust := classifyMemorySourceTrust(source)
			bucket = &breakdownBucket{
				stat: MemorySourceBreakdown{
					Instance: instance,
					Scope:    scope,
					Source:   source,
					Fallback: isFallbackMemorySource(source),
					Trust:    trust,
				},
				reasons: make(map[string]struct{}),
			}
			perScopeSource[key] = bucket
		}

		bucket.stat.Count++
		if reason != "" {
			bucket.reasons[reason] = struct{}{}
		}
		if retrievedAt.After(bucket.newest) {
			bucket.newest = retrievedAt
		}
	}

	for _, node := range snapshots.Nodes {
		source := canonicalSource(node.MemorySource)

		key := fmt.Sprintf("%s|%s", node.Instance, source)
		bucket, seen := perNodeSource[key]
		if !seen {
			trust := classifyMemorySourceTrust(source)
			bucket = &nodeBucket{
				stat: MemorySourceStat{
					Instance: node.Instance,
					Source:   source,
					Fallback: isFallbackMemorySource(source),
					Trust:    trust,
				},
			}
			perNodeSource[key] = bucket
		}
		bucket.stat.NodeCount++
		if node.RetrievedAt.After(bucket.newest) {
			bucket.newest = node.RetrievedAt
		}

		record(node.Instance, "node", source, node.RetrievedAt, node.FallbackReason)
	}

	for _, guest := range snapshots.Guests {
		record(guest.Instance, "guest", guest.MemorySource, guest.RetrievedAt, guest.FallbackReason)
	}

	// Flatten the node buckets, ordered by instance and then source.
	nodeSourceStats := make([]MemorySourceStat, 0, len(perNodeSource))
	for _, bucket := range perNodeSource {
		if !bucket.newest.IsZero() {
			bucket.stat.LastUpdated = bucket.newest.UTC().Format(time.RFC3339)
		}
		nodeSourceStats = append(nodeSourceStats, bucket.stat)
	}
	sort.Slice(nodeSourceStats, func(i, j int) bool {
		left, right := nodeSourceStats[i], nodeSourceStats[j]
		if left.Instance != right.Instance {
			return left.Instance < right.Instance
		}
		return left.Source < right.Source
	})

	// Flatten the breakdown buckets, ordered by instance, scope, then source.
	breakdownStats := make([]MemorySourceBreakdown, 0, len(perScopeSource))
	for _, bucket := range perScopeSource {
		if !bucket.newest.IsZero() {
			bucket.stat.LastUpdated = bucket.newest.UTC().Format(time.RFC3339)
		}
		if len(bucket.reasons) > 0 {
			reasons := make([]string, 0, len(bucket.reasons))
			for reason := range bucket.reasons {
				reasons = append(reasons, reason)
			}
			sort.Strings(reasons)
			bucket.stat.FallbackReasons = reasons
		}
		breakdownStats = append(breakdownStats, bucket.stat)
	}
	sort.Slice(breakdownStats, func(i, j int) bool {
		left, right := breakdownStats[i], breakdownStats[j]
		switch {
		case left.Instance != right.Instance:
			return left.Instance < right.Instance
		case left.Scope != right.Scope:
			return left.Scope < right.Scope
		default:
			return left.Source < right.Source
		}
	})

	return nodeSourceStats, breakdownStats
}
// buildMetricsStoreDiagnostic reports the state of the monitor's metrics store.
//
// A nil monitor or a nil store yields a disabled, "unavailable" diagnostic
// with an explanatory Error. Otherwise the store statistics are copied over
// and Status is "healthy", "buffering" (nothing persisted yet but points are
// buffered) or "empty" (nothing persisted and nothing buffered).
func buildMetricsStoreDiagnostic(monitor *monitoring.Monitor) *MetricsStoreDiagnostic {
	unavailable := func(reason string) *MetricsStoreDiagnostic {
		return &MetricsStoreDiagnostic{
			Enabled: false,
			Status:  "unavailable",
			Error:   reason,
		}
	}

	if monitor == nil {
		return unavailable("monitor not initialized")
	}
	store := monitor.GetMetricsStore()
	if store == nil {
		return unavailable("metrics store not initialized")
	}

	stats := store.GetStats()
	result := &MetricsStoreDiagnostic{
		Enabled:     true,
		Status:      "healthy",
		DBSize:      stats.DBSize,
		RawCount:    stats.RawCount,
		MinuteCount: stats.MinuteCount,
		HourlyCount: stats.HourlyCount,
		DailyCount:  stats.DailyCount,
		TotalPoints: stats.RawCount + stats.MinuteCount + stats.HourlyCount + stats.DailyCount,
		BufferSize:  stats.BufferSize,
		Notes:       []string{},
	}

	if result.TotalPoints == 0 {
		if stats.BufferSize > 0 {
			result.Status = "buffering"
			result.Notes = append(result.Notes, "Metrics are buffered but not yet flushed")
		} else {
			result.Status = "empty"
			result.Notes = append(result.Notes, "No historical metrics written yet")
		}
	}
	return result
}
// diagnosticsCacheTTL is the maximum age at which handleDiagnostics still
// serves a cached result instead of recomputing it.
const diagnosticsCacheTTL = 45 * time.Second
// cachedDiagnosticsEntry pairs a computed DiagnosticsInfo with the time it was
// stored in diagnosticsCache.
type cachedDiagnosticsEntry struct {
diag DiagnosticsInfo
cachedAt time.Time
}
// Package-level state backing the diagnostics handler: the per-scope result
// cache and the Prometheus collectors that track its behavior.
var (
// diagnosticsMetricsOnce guards the one-time collector registration done in handleDiagnostics.
diagnosticsMetricsOnce sync.Once
// diagnosticsCacheMu protects diagnosticsCache.
diagnosticsCacheMu sync.RWMutex
// diagnosticsCache holds the latest result per scope key (see diagnosticsScopeKey).
diagnosticsCache = map[string]cachedDiagnosticsEntry{}
// diagnosticsCacheHits counts requests answered from diagnosticsCache.
diagnosticsCacheHits = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: "pulse",
Subsystem: "diagnostics",
Name: "cache_hits_total",
Help: "Total number of diagnostics cache hits.",
})
// diagnosticsCacheMisses counts requests that required a fresh computation.
diagnosticsCacheMisses = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: "pulse",
Subsystem: "diagnostics",
Name: "cache_misses_total",
Help: "Total number of diagnostics cache misses.",
})
// diagnosticsRefreshDuration observes how long each fresh computation took, in seconds.
diagnosticsRefreshDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: "pulse",
Subsystem: "diagnostics",
Name: "refresh_duration_seconds",
Help: "Duration of diagnostics refresh operations in seconds.",
Buckets: []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10, 20, 30},
})
)
// diagnosticsScopeKey returns the cache key for the request context: the
// whitespace-trimmed organization ID from GetOrgID, or "__default__" when that
// ID is empty.
func diagnosticsScopeKey(ctx context.Context) string {
	orgID := strings.TrimSpace(GetOrgID(ctx))
	if orgID == "" {
		return "__default__"
	}
	return orgID
}
// NodeDiagnostic contains diagnostic info for a Proxmox node.
// computeDiagnostics produces one value per configured PVE instance by running
// a live connectivity probe against it.
type NodeDiagnostic struct {
// ID and Name are both set to the configured instance name.
ID string `json:"id"`
Name string `json:"name"`
Host string `json:"host"`
// Type is "pve" for entries produced by computeDiagnostics.
Type string `json:"type"`
// AuthMethod is "api_token", "username_password" or "none"; credentials themselves are never included.
AuthMethod string `json:"authMethod"`
// Connected is true when the probe's node listing call succeeded.
Connected bool `json:"connected"`
// Error is a generic, user-facing failure description; details go to the log.
Error string `json:"error,omitempty"`
Details *NodeDetails `json:"details,omitempty"`
LastPoll string `json:"lastPoll,omitempty"`
ClusterInfo *ClusterInfo `json:"clusterInfo,omitempty"`
VMDiskCheck *VMDiskCheckResult `json:"vmDiskCheck,omitempty"`
PhysicalDisks *PhysicalDiskCheck `json:"physicalDisks,omitempty"`
}
// NormalizeCollections returns a copy of d whose VMDiskCheck and PhysicalDisks
// sections, when present, point at normalized copies of the originals.
func (d NodeDiagnostic) NormalizeCollections() NodeDiagnostic {
	if check := d.VMDiskCheck; check != nil {
		copied := check.NormalizeCollections()
		d.VMDiskCheck = &copied
	}
	if disks := d.PhysicalDisks; disks != nil {
		copied := disks.NormalizeCollections()
		d.PhysicalDisks = &copied
	}
	return d
}
// NodeDetails contains node-specific details gathered by the connectivity probe.
type NodeDetails struct {
// NodeCount is the number of nodes returned by the probe's node listing.
NodeCount int `json:"node_count,omitempty"`
// Version is the PVE version reported by the first listed node's status, when available.
Version string `json:"version,omitempty"`
}
// VMDiskCheckResult contains VM disk monitoring diagnostic results
type VMDiskCheckResult struct {
	VMsFound         int                `json:"vmsFound"`
	VMsWithAgent     int                `json:"vmsWithAgent"`
	VMsWithDiskData  int                `json:"vmsWithDiskData"`
	TestVMID         int                `json:"testVMID,omitempty"`
	TestVMName       string             `json:"testVMName,omitempty"`
	TestResult       string             `json:"testResult,omitempty"`
	Permissions      []string           `json:"permissions"`
	Recommendations  []string           `json:"recommendations"`
	ProblematicVMs   []VMDiskIssue      `json:"problematicVMs"`
	FilesystemsFound []FilesystemDetail `json:"filesystemsFound"`
}

// NormalizeCollections returns a copy of r whose nil slice fields are replaced
// by empty slices.
func (r VMDiskCheckResult) NormalizeCollections() VMDiskCheckResult {
	// r is a local copy, so writing through these field pointers only
	// affects the returned value.
	for _, list := range []*[]string{&r.Permissions, &r.Recommendations} {
		if *list == nil {
			*list = []string{}
		}
	}
	if r.ProblematicVMs == nil {
		r.ProblematicVMs = []VMDiskIssue{}
	}
	if r.FilesystemsFound == nil {
		r.FilesystemsFound = []FilesystemDetail{}
	}
	return r
}

// VMDiskIssue describes a single VM flagged by the disk monitoring check.
type VMDiskIssue struct {
	VMID   int    `json:"vmid"`
	Name   string `json:"name"`
	Status string `json:"status"`
	Issue  string `json:"issue"`
}

// FilesystemDetail describes one filesystem reported for a VM, including
// whether it was filtered out and why.
type FilesystemDetail struct {
	Mountpoint string `json:"mountpoint"`
	Type       string `json:"type"`
	Total      uint64 `json:"total"`
	Used       uint64 `json:"used"`
	Filtered   bool   `json:"filtered"`
	Reason     string `json:"reason,omitempty"`
}
// PhysicalDiskCheck contains diagnostic results for physical disk detection
type PhysicalDiskCheck struct {
	NodesChecked    int              `json:"nodesChecked"`
	NodesWithDisks  int              `json:"nodesWithDisks"`
	TotalDisks      int              `json:"totalDisks"`
	NodeResults     []NodeDiskResult `json:"nodeResults"`
	TestResult      string           `json:"testResult,omitempty"`
	Recommendations []string         `json:"recommendations"`
}

// NormalizeCollections returns a copy of c whose nil slices are replaced by
// empty slices and whose NodeResults entries are normalized.
//
// NodeResults is rebuilt into a new slice rather than rewritten in place: c is
// a value copy, but its slice still shares a backing array with the caller, so
// an in-place write would modify the caller's data (and race with concurrent
// readers of a shared value).
func (c PhysicalDiskCheck) NormalizeCollections() PhysicalDiskCheck {
	if c.NodeResults == nil {
		c.NodeResults = []NodeDiskResult{}
	}
	if c.Recommendations == nil {
		c.Recommendations = []string{}
	}
	if len(c.NodeResults) > 0 {
		results := make([]NodeDiskResult, len(c.NodeResults))
		for i := range c.NodeResults {
			results[i] = c.NodeResults[i].NormalizeCollections()
		}
		c.NodeResults = results
	}
	return c
}

// NodeDiskResult holds the physical disk findings for a single node.
type NodeDiskResult struct {
	NodeName    string   `json:"nodeName"`
	DiskCount   int      `json:"diskCount"`
	Error       string   `json:"error,omitempty"`
	DiskDevices []string `json:"diskDevices"`
	APIResponse string   `json:"apiResponse,omitempty"`
}

// NormalizeCollections returns a copy of r with a nil DiskDevices slice
// replaced by an empty one.
func (r NodeDiskResult) NormalizeCollections() NodeDiskResult {
	if r.DiskDevices == nil {
		r.DiskDevices = []string{}
	}
	return r
}
// ClusterInfo contains cluster information
type ClusterInfo struct {
// Nodes is the number of cluster status entries returned by the probe;
// computeDiagnostics sets it to 1 when cluster status is unavailable.
Nodes int `json:"nodes"`
}
// PBSDiagnostic contains diagnostic info for a PBS instance.
// computeDiagnostics produces one value per configured PBS instance.
type PBSDiagnostic struct {
// ID and Name are both set to the configured instance name.
ID string `json:"id"`
Name string `json:"name"`
Host string `json:"host"`
// Connected is true when the probe's version request succeeded.
Connected bool `json:"connected"`
// Error is a generic, user-facing failure description; details go to the log.
Error string `json:"error,omitempty"`
Details *PBSDetails `json:"details,omitempty"`
}
// PBSDetails contains PBS-specific details
type PBSDetails struct {
// Version is the version string returned by the PBS version request.
Version string `json:"version,omitempty"`
}
// SystemDiagnostic contains system-level diagnostic info.
// computeDiagnostics fills it from the Go runtime package.
type SystemDiagnostic struct {
OS string `json:"os"`
Arch string `json:"arch"`
GoVersion string `json:"goVersion"`
NumCPU int `json:"numCPU"`
NumGoroutine int `json:"numGoroutine"`
// MemoryMB is runtime.MemStats.Alloc divided down to whole mebibytes.
MemoryMB uint64 `json:"memoryMB"`
}
// APITokenDiagnostic reports on the state of the multi-token authentication system.
type APITokenDiagnostic struct {
	Enabled             bool              `json:"enabled"`
	TokenCount          int               `json:"tokenCount"`
	RecommendTokenSetup bool              `json:"recommendTokenSetup"`
	UnusedTokenCount    int               `json:"unusedTokenCount,omitempty"`
	Notes               []string          `json:"notes"`
	Tokens              []APITokenSummary `json:"tokens"`
	Usage               []APITokenUsage   `json:"usage"`
}

// NormalizeCollections returns a copy of d whose nil slices are replaced by
// empty slices and whose Usage entries are normalized.
//
// Usage is rebuilt into a new slice rather than rewritten in place: d is a
// value copy, but its slice still shares a backing array with the caller, so
// an in-place write would modify the caller's data (and race with concurrent
// readers of a shared value).
func (d APITokenDiagnostic) NormalizeCollections() APITokenDiagnostic {
	if d.Notes == nil {
		d.Notes = []string{}
	}
	if d.Tokens == nil {
		d.Tokens = []APITokenSummary{}
	}
	if d.Usage == nil {
		d.Usage = []APITokenUsage{}
	}
	if len(d.Usage) > 0 {
		usage := make([]APITokenUsage, len(d.Usage))
		for i := range d.Usage {
			usage[i] = d.Usage[i].NormalizeCollections()
		}
		d.Usage = usage
	}
	return d
}

// APITokenSummary provides sanitized token metadata for diagnostics display.
type APITokenSummary struct {
	ID         string `json:"id"`
	Name       string `json:"name"`
	Hint       string `json:"hint,omitempty"`
	CreatedAt  string `json:"createdAt,omitempty"`
	LastUsedAt string `json:"lastUsedAt,omitempty"`
	Source     string `json:"source,omitempty"`
}

// APITokenUsage summarises how tokens are consumed by connected agents.
type APITokenUsage struct {
	TokenID    string   `json:"tokenId"`
	AgentCount int      `json:"agentCount"`
	Agents     []string `json:"agents"`
}

// NormalizeCollections returns a copy of u with a nil Agents slice replaced by
// an empty one.
func (u APITokenUsage) NormalizeCollections() APITokenUsage {
	if u.Agents == nil {
		u.Agents = []string{}
	}
	return u
}
// DockerAgentDiagnostic summarizes adoption of the Docker agent command system.
type DockerAgentDiagnostic struct {
	AgentsTotal               int                    `json:"agentsTotal"`
	AgentsOnline              int                    `json:"agentsOnline"`
	AgentsReportingVersion    int                    `json:"agentsReportingVersion"`
	AgentsWithTokenBinding    int                    `json:"agentsWithTokenBinding"`
	AgentsWithoutTokenBinding int                    `json:"agentsWithoutTokenBinding"`
	AgentsWithoutVersion      int                    `json:"agentsWithoutVersion,omitempty"`
	AgentsOutdatedVersion     int                    `json:"agentsOutdatedVersion,omitempty"`
	AgentsWithStaleCommand    int                    `json:"agentsWithStaleCommand,omitempty"`
	AgentsPendingUninstall    int                    `json:"agentsPendingUninstall,omitempty"`
	AgentsNeedingAttention    int                    `json:"agentsNeedingAttention"`
	RecommendedAgentVersion   string                 `json:"recommendedAgentVersion,omitempty"`
	Attention                 []DockerAgentAttention `json:"attention"`
	Notes                     []string               `json:"notes"`
}

// NormalizeCollections returns a copy of d whose nil slices are replaced by
// empty slices and whose Attention entries are normalized.
//
// Attention is rebuilt into a new slice rather than rewritten in place: d is a
// value copy, but its slice still shares a backing array with the caller, so
// an in-place write would modify the caller's data (and race with concurrent
// readers of a shared value).
func (d DockerAgentDiagnostic) NormalizeCollections() DockerAgentDiagnostic {
	if d.Attention == nil {
		d.Attention = []DockerAgentAttention{}
	}
	if d.Notes == nil {
		d.Notes = []string{}
	}
	if len(d.Attention) > 0 {
		attention := make([]DockerAgentAttention, len(d.Attention))
		for i := range d.Attention {
			attention[i] = d.Attention[i].NormalizeCollections()
		}
		d.Attention = attention
	}
	return d
}

// DockerAgentAttention captures an individual agent that requires user action.
type DockerAgentAttention struct {
	AgentID      string   `json:"agentId"`
	Name         string   `json:"name"`
	Status       string   `json:"status"`
	AgentVersion string   `json:"agentVersion,omitempty"`
	TokenHint    string   `json:"tokenHint,omitempty"`
	LastSeen     string   `json:"lastSeen,omitempty"`
	Issues       []string `json:"issues"`
}

// NormalizeCollections returns a copy of a with a nil Issues slice replaced by
// an empty one.
func (a DockerAgentAttention) NormalizeCollections() DockerAgentAttention {
	if a.Issues == nil {
		a.Issues = []string{}
	}
	return a
}
// AlertsDiagnostic summarises alert configuration migration state.
type AlertsDiagnostic struct {
	MissingCooldown       bool     `json:"missingCooldown"`
	MissingGroupingWindow bool     `json:"missingGroupingWindow"`
	Notes                 []string `json:"notes"`
}

// NormalizeCollections returns a copy of d with a nil Notes slice replaced by
// an empty one.
func (d AlertsDiagnostic) NormalizeCollections() AlertsDiagnostic {
	if d.Notes != nil {
		return d
	}
	d.Notes = []string{}
	return d
}
// AIChatDiagnostic reports on the AI chat service status.
type AIChatDiagnostic struct {
	Enabled      bool     `json:"enabled"`
	Running      bool     `json:"running"`
	Healthy      bool     `json:"healthy"`
	Port         int      `json:"port,omitempty"`
	URL          string   `json:"url,omitempty"`
	Model        string   `json:"model,omitempty"`
	MCPConnected bool     `json:"mcpConnected"`
	MCPToolCount int      `json:"mcpToolCount,omitempty"`
	Notes        []string `json:"notes"`
}

// NormalizeCollections returns a copy of d with a nil Notes slice replaced by
// an empty one.
func (d AIChatDiagnostic) NormalizeCollections() AIChatDiagnostic {
	if d.Notes != nil {
		return d
	}
	d.Notes = []string{}
	return d
}
// handleDiagnostics returns comprehensive diagnostic information.
//
// Results are cached per scope key (see diagnosticsScopeKey) for
// diagnosticsCacheTTL. On a cache miss the diagnostics are recomputed with a
// 30 second budget derived from the request context, the refresh duration is
// observed, and the result is written to the response.
func (r *Router) handleDiagnostics(w http.ResponseWriter, req *http.Request) {
	diagnosticsMetricsOnce.Do(func() {
		prometheus.MustRegister(diagnosticsCacheHits, diagnosticsCacheMisses, diagnosticsRefreshDuration)
	})

	now := time.Now()
	scopeKey := diagnosticsScopeKey(req.Context())

	diagnosticsCacheMu.RLock()
	cachedEntry, ok := diagnosticsCache[scopeKey]
	diagnosticsCacheMu.RUnlock()
	if ok && !cachedEntry.cachedAt.IsZero() && now.Sub(cachedEntry.cachedAt) <= diagnosticsCacheTTL {
		diagnosticsCacheHits.Inc()
		writeDiagnosticsResponse(w, cachedEntry.diag, cachedEntry.cachedAt)
		return
	}
	diagnosticsCacheMisses.Inc()

	ctx, cancel := context.WithTimeout(req.Context(), 30*time.Second)
	defer cancel()

	start := time.Now()
	fresh := r.computeDiagnostics(ctx)
	diagnosticsRefreshDuration.Observe(time.Since(start).Seconds())

	cachedAt := time.Now()
	// The probes run on a context derived from the request. If the client
	// went away mid-refresh, that context was cancelled and the remaining
	// probes failed for reasons unrelated to the monitored hosts. Caching
	// such a snapshot would serve bogus failures to every caller for the
	// next diagnosticsCacheTTL, so only store results from live requests.
	if req.Context().Err() == nil {
		diagnosticsCacheMu.Lock()
		diagnosticsCache[scopeKey] = cachedDiagnosticsEntry{
			diag:     fresh,
			cachedAt: cachedAt,
		}
		diagnosticsCacheMu.Unlock()
	}

	writeDiagnosticsResponse(w, fresh, cachedAt)
}
// writeDiagnosticsResponse normalizes diag and writes it to w as JSON.
// When cachedAt is non-zero it is exposed, in RFC 3339 UTC form, through the
// X-Diagnostics-Cached-At response header. An encoding failure is logged and
// answered with a 500 error.
func writeDiagnosticsResponse(w http.ResponseWriter, diag DiagnosticsInfo, cachedAt time.Time) {
	payload := diag.NormalizeCollections()

	header := w.Header()
	header.Set("Content-Type", "application/json")
	if !cachedAt.IsZero() {
		header.Set("X-Diagnostics-Cached-At", cachedAt.UTC().Format(time.RFC3339))
	}

	err := json.NewEncoder(w).Encode(payload)
	if err == nil {
		return
	}
	log.Error().Err(err).Msg("Failed to encode diagnostics")
	http.Error(w, "Failed to generate diagnostics", http.StatusInternalServerError)
}
// computeDiagnostics builds a fresh DiagnosticsInfo.
//
// It records version, uptime and runtime statistics, runs a live connectivity
// probe against every configured PVE and PBS instance using ctx, and attaches
// the token, metrics store, Docker agent, alert, AI chat, discovery and memory
// source sections. Probe failures are reported through the per-instance Error
// field with a generic message; the underlying error is only logged.
//
// Both r.monitor and r.config are treated as optional: the helper builders
// called here already accept nil for them, so the direct dereferences in this
// function are guarded to match instead of panicking.
func (r *Router) computeDiagnostics(ctx context.Context) DiagnosticsInfo {
	diag := EmptyDiagnosticsInfo()

	// Version info
	if versionInfo, err := updates.GetCurrentVersion(); err == nil {
		diag.Version = versionInfo.Version
		diag.Runtime = versionInfo.Runtime
	} else {
		diag.Version = "unknown"
		diag.Runtime = "go"
	}

	// Uptime (left at zero when no monitor is attached)
	if r.monitor != nil {
		diag.Uptime = time.Since(r.monitor.GetStartTime()).Seconds()
	}

	// System info
	var memStats runtime.MemStats
	runtime.ReadMemStats(&memStats)
	diag.System = SystemDiagnostic{
		OS:           runtime.GOOS,
		Arch:         runtime.GOARCH,
		GoVersion:    runtime.Version(),
		NumCPU:       runtime.NumCPU(),
		NumGoroutine: runtime.NumGoroutine(),
		MemoryMB:     memStats.Alloc / 1024 / 1024,
	}

	diag.APITokens = buildAPITokenDiagnostic(r.config, r.monitor)
	diag.MetricsStore = buildMetricsStoreDiagnostic(r.monitor)

	if r.config != nil {
		// Test each configured node
		for _, node := range r.config.PVEInstances {
			nodeDiag := NodeDiagnostic{
				ID:   node.Name,
				Name: node.Name,
				Host: node.Host,
				Type: "pve",
			}

			// Determine auth method (sanitized - don't expose actual values)
			if node.TokenName != "" && node.TokenValue != "" {
				nodeDiag.AuthMethod = "api_token"
			} else if node.User != "" && node.Password != "" {
				nodeDiag.AuthMethod = "username_password"
			} else {
				nodeDiag.AuthMethod = "none"
				nodeDiag.Error = "No authentication configured"
			}

			// Test connection. The stored fingerprint is passed through so
			// the probe receives the same TLS settings as the PBS probe below.
			testCfg := proxmox.ClientConfig{
				Host:        node.Host,
				User:        node.User,
				Password:    node.Password,
				TokenName:   node.TokenName,
				TokenValue:  node.TokenValue,
				Fingerprint: node.Fingerprint,
				VerifySSL:   node.VerifySSL,
			}

			client, err := proxmox.NewClient(testCfg)
			if err != nil {
				nodeDiag.Connected = false
				nodeDiag.Error = "Failed to initialize connection"
				log.Error().Err(err).Str("node", node.Name).Msg("Diagnostics: Proxmox client init failed")
			} else {
				nodes, err := client.GetNodes(ctx)
				if err != nil {
					nodeDiag.Connected = false
					nodeDiag.Error = "Failed to connect to Proxmox API"
					log.Error().Err(err).Str("node", node.Name).Msg("Diagnostics: Proxmox API connection failed")
				} else {
					nodeDiag.Connected = true

					if len(nodes) > 0 {
						nodeDiag.Details = &NodeDetails{
							NodeCount: len(nodes),
						}
						// The version is read from the first listed node only.
						if status, err := client.GetNodeStatus(ctx, nodes[0].Node); err == nil && status != nil {
							if status.PVEVersion != "" {
								nodeDiag.Details.Version = status.PVEVersion
							}
						}
					}

					if clusterStatus, err := client.GetClusterStatus(ctx); err == nil {
						nodeDiag.ClusterInfo = &ClusterInfo{Nodes: len(clusterStatus)}
					} else {
						// Keep the underlying error in the log so a real
						// failure can be told apart from a standalone node.
						log.Debug().Err(err).Str("node", node.Name).Msg("Cluster status not available (likely standalone node)")
						nodeDiag.ClusterInfo = &ClusterInfo{Nodes: 1}
					}

					nodeDiag.VMDiskCheck = r.checkVMDiskMonitoring(ctx, client, node.Name)
					nodeDiag.PhysicalDisks = r.checkPhysicalDisks(ctx, client, node.Name)
				}
			}

			diag.Nodes = append(diag.Nodes, nodeDiag)
		}

		// Test PBS instances
		for _, pbsNode := range r.config.PBSInstances {
			pbsDiag := PBSDiagnostic{
				ID:   pbsNode.Name,
				Name: pbsNode.Name,
				Host: pbsNode.Host,
			}

			testCfg := pbs.ClientConfig{
				Host:        pbsNode.Host,
				User:        pbsNode.User,
				Password:    pbsNode.Password,
				TokenName:   pbsNode.TokenName,
				TokenValue:  pbsNode.TokenValue,
				Fingerprint: pbsNode.Fingerprint,
				VerifySSL:   pbsNode.VerifySSL,
			}

			client, err := pbs.NewClient(testCfg)
			if err != nil {
				pbsDiag.Connected = false
				pbsDiag.Error = "Failed to initialize connection"
				log.Error().Err(err).Str("pbs", pbsNode.Name).Msg("Diagnostics: PBS client init failed")
			} else {
				if version, err := client.GetVersion(ctx); err != nil {
					pbsDiag.Connected = false
					pbsDiag.Error = "Connection established but version check failed"
					log.Error().Err(err).Str("pbs", pbsNode.Name).Msg("Diagnostics: PBS version check failed")
				} else {
					pbsDiag.Connected = true
					pbsDiag.Details = &PBSDetails{Version: version.Version}
				}
			}

			diag.PBS = append(diag.PBS, pbsDiag)
		}
	}

	diag.DockerAgents = buildDockerAgentDiagnostic(r.monitor, diag.Version)
	diag.Alerts = buildAlertsDiagnostic(r.monitor)
	diag.AIChat = buildAIChatDiagnostic(r.config, r.aiHandler)
	diag.Discovery = buildDiscoveryDiagnostic(r.config, r.monitor)

	if r.monitor != nil {
		snapshots := r.monitor.GetDiagnosticSnapshots()
		if len(snapshots.Nodes) > 0 {
			diag.NodeSnapshots = snapshots.Nodes
		}
		if len(snapshots.Guests) > 0 {
			diag.GuestSnapshots = snapshots.Guests
		}
		diag.MemorySources, diag.MemorySourceBreakdown = buildMemorySourceDiagnostics(snapshots)
	}

	return diag.NormalizeCollections()
}
// copyStringSlice returns an independent copy of values. The result is never
// nil: a nil or empty input yields an empty slice.
func copyStringSlice(values []string) []string {
	duplicate := make([]string, len(values))
	copy(duplicate, values)
	return duplicate
}
// buildDiscoveryDiagnostic summarizes the discovery configuration held in cfg
// and, when the monitor exposes a discovery service, its live status, last
// cached result and up to ten recent history entries.
//
// It returns nil when cfg is nil. An empty configured subnet is reported as
// "auto". Timestamps are formatted as RFC 3339 in UTC.
func buildDiscoveryDiagnostic(cfg *config.Config, monitor *monitoring.Monitor) *DiscoveryDiagnostic {
	if cfg == nil {
		return nil
	}

	// copyStringSlice never returns nil, so the allow/block lists need no
	// further nil normalization here.
	discovery := &DiscoveryDiagnostic{
		Enabled:             cfg.DiscoveryEnabled,
		ConfiguredSubnet:    strings.TrimSpace(cfg.DiscoverySubnet),
		EnvironmentOverride: strings.TrimSpace(cfg.Discovery.EnvironmentOverride),
		SubnetAllowlist:     copyStringSlice(cfg.Discovery.SubnetAllowlist),
		SubnetBlocklist:     copyStringSlice(cfg.Discovery.SubnetBlocklist),
	}
	if discovery.ConfiguredSubnet == "" {
		discovery.ConfiguredSubnet = "auto"
	}

	if monitor == nil {
		return discovery
	}
	svc := monitor.GetDiscoveryService()
	if svc == nil {
		return discovery
	}

	// The status map is loosely typed; only copy values of the expected type.
	status := svc.GetStatus()
	if val, ok := status["subnet"].(string); ok {
		discovery.ActiveSubnet = val
	}
	if val, ok := status["is_scanning"].(bool); ok {
		discovery.Scanning = val
	}
	if val, ok := status["interval"].(string); ok {
		discovery.ScanInterval = val
	}
	if val, ok := status["last_scan"].(time.Time); ok && !val.IsZero() {
		discovery.LastScanStartedAt = val.UTC().Format(time.RFC3339)
	}

	if result, updated := svc.GetCachedResult(); result != nil {
		discovery.LastResultServers = len(result.Servers)
		discovery.LastResultErrors = len(result.StructuredErrors)
		if !updated.IsZero() {
			discovery.LastResultTimestamp = updated.UTC().Format(time.RFC3339)
		}
	}

	history := svc.GetHistory(10)
	if len(history) > 0 {
		items := make([]DiscoveryHistoryItem, 0, len(history))
		for _, entry := range history {
			items = append(items, DiscoveryHistoryItem{
				StartedAt:       entry.StartedAt().UTC().Format(time.RFC3339),
				CompletedAt:     entry.CompletedAt().UTC().Format(time.RFC3339),
				Duration:        entry.Duration().Truncate(time.Millisecond).String(),
				DurationMs:      entry.Duration().Milliseconds(),
				Subnet:          entry.Subnet(),
				ServerCount:     entry.ServerCount(),
				ErrorCount:      entry.ErrorCount(),
				BlocklistLength: entry.BlocklistLength(),
				Status:          entry.Status(),
			})
		}
		discovery.History = items
	}

	return discovery
}
// buildAPITokenDiagnostic summarizes the configured API tokens and which
// Docker hosts are bound to them.
//
// It returns nil when cfg is nil. Each token is reduced to sanitized metadata
// (ID, name, prefix/suffix hint, timestamps); tokens without a last-used time
// are counted as unused and advisory notes are added accordingly. When a
// monitor is available, Docker hosts reporting a token ID are grouped per
// token, with token IDs and agent names sorted for stable output.
func buildAPITokenDiagnostic(cfg *config.Config, monitor *monitoring.Monitor) *APITokenDiagnostic {
	if cfg == nil {
		return nil
	}

	tokenTotal := len(cfg.APITokens)
	report := &APITokenDiagnostic{
		Enabled:    cfg.HasAPITokens(),
		TokenCount: tokenTotal,
	}

	// addNote appends note unless it is empty or already present.
	addNote := func(note string) {
		if note != "" && !contains(report.Notes, note) {
			report.Notes = append(report.Notes, note)
		}
	}

	report.RecommendTokenSetup = tokenTotal == 0
	if report.RecommendTokenSetup {
		addNote("No API tokens are configured. Open Settings → Security to generate dedicated tokens for each automation or agent.")
	}

	summaries := make([]APITokenSummary, 0, tokenTotal)
	neverUsed := 0
	for _, record := range cfg.APITokens {
		summary := APITokenSummary{
			ID:     record.ID,
			Name:   record.Name,
			Source: "user",
		}

		if !record.CreatedAt.IsZero() {
			summary.CreatedAt = record.CreatedAt.UTC().Format(time.RFC3339)
		}

		if record.LastUsedAt == nil || record.LastUsedAt.IsZero() {
			neverUsed++
		} else {
			summary.LastUsedAt = record.LastUsedAt.UTC().Format(time.RFC3339)
		}

		// Build a display hint from whichever ends of the token are known.
		hasPrefix, hasSuffix := record.Prefix != "", record.Suffix != ""
		switch {
		case hasPrefix && hasSuffix:
			summary.Hint = fmt.Sprintf("%s…%s", record.Prefix, record.Suffix)
		case hasPrefix:
			summary.Hint = record.Prefix + "…"
		case hasSuffix:
			summary.Hint = "…" + record.Suffix
		}

		summaries = append(summaries, summary)
	}
	report.Tokens = summaries
	report.UnusedTokenCount = neverUsed

	if tokenTotal > 0 {
		switch {
		case neverUsed == tokenTotal:
			addNote("Configured API tokens have not been used yet. Update your agents or automations to switch to the new tokens.")
		case neverUsed > 0:
			addNote(fmt.Sprintf("%d API token(s) have never been used. Remove unused tokens or update the corresponding agents.", neverUsed))
		}
	}

	// Group Docker host names by the token ID they report.
	agentsByToken := make(map[string][]string)
	if monitor != nil {
		if readState := monitor.GetUnifiedReadStateOrSnapshot(); readState != nil {
			for _, host := range readState.DockerHosts() {
				name := preferredDockerHostName(host)
				tokenID := strings.TrimSpace(host.TokenID())
				if tokenID == "" {
					continue
				}
				agentsByToken[tokenID] = append(agentsByToken[tokenID], name)
			}
		}
	}
	if len(agentsByToken) == 0 {
		return report
	}

	tokenIDs := make([]string, 0, len(agentsByToken))
	for tokenID := range agentsByToken {
		tokenIDs = append(tokenIDs, tokenID)
	}
	sort.Strings(tokenIDs)

	report.Usage = make([]APITokenUsage, 0, len(tokenIDs))
	for _, tokenID := range tokenIDs {
		agents := agentsByToken[tokenID]
		sort.Strings(agents)
		report.Usage = append(report.Usage, APITokenUsage{
			TokenID:    tokenID,
			AgentCount: len(agents),
			Agents:     agents,
		})
	}
	return report
}
// buildDockerAgentDiagnostic summarizes the Docker / Podman agents exposed by
// the monitor's read state: aggregate counters, a per-agent attention list
// with the problems found for that agent, and de-duplicated notes.
//
// It returns nil when m is nil or no read state is available. serverVersion
// is the version agents are compared against; when it is empty or cannot be
// parsed, agents are not checked for being outdated.
func buildDockerAgentDiagnostic(m *monitoring.Monitor, serverVersion string) *DockerAgentDiagnostic {
	if m == nil {
		return nil
	}
	readState := m.GetUnifiedReadStateOrSnapshot()
	if readState == nil {
		return nil
	}

	hosts := readState.DockerHosts()
	diag := &DockerAgentDiagnostic{
		AgentsTotal:             len(hosts),
		RecommendedAgentVersion: normalizeVersionLabel(serverVersion),
	}

	// addNote records a note once; blank and duplicate notes are dropped.
	addNote := func(note string) {
		if note != "" && !contains(diag.Notes, note) {
			diag.Notes = append(diag.Notes, note)
		}
	}

	if len(hosts) == 0 {
		addNote("No Docker / Podman agents have reported in yet. Use Settings → Infrastructure to install the Docker / Podman agent and unlock remote commands.")
		return diag
	}

	// Prefer the parsed form of the server version for the label when it parses.
	var serverVer *updates.Version
	recommendedLabel := diag.RecommendedAgentVersion
	if serverVersion != "" {
		if parsed, err := updates.ParseVersion(serverVersion); err == nil {
			serverVer = parsed
			recommendedLabel = normalizeVersionLabel(parsed.String())
			diag.RecommendedAgentVersion = recommendedLabel
		}
	}

	now := time.Now().UTC()
	withoutToken := 0
	for _, host := range hosts {
		rawStatus := host.Status()
		status := strings.ToLower(strings.TrimSpace(string(rawStatus)))
		versionStr := strings.TrimSpace(host.AgentVersion())
		hasToken := strings.TrimSpace(host.TokenID()) != ""
		lastSeen := host.LastSeen()

		// Aggregate counters.
		if status == "online" {
			diag.AgentsOnline++
		}
		if versionStr == "" {
			diag.AgentsWithoutVersion++
		} else {
			diag.AgentsReportingVersion++
		}
		if hasToken {
			diag.AgentsWithTokenBinding++
		} else {
			withoutToken++
		}

		// Per-agent issues, in a fixed order: status, version, token,
		// heartbeat, command, uninstall.
		issues := make([]string, 0, 4)
		if status != "" && status != "online" {
			issues = append(issues, fmt.Sprintf("Docker / Podman agent reports status %q.", status))
		}

		switch {
		case versionStr == "":
			issues = append(issues, "Agent has not reported a version (pre v4.24). Reinstall using Settings → Infrastructure.")
		case serverVer != nil:
			agentVer, err := updates.ParseVersion(versionStr)
			if err != nil {
				issues = append(issues, fmt.Sprintf("Unrecognized agent version string %q. Reinstall to ensure command support.", versionStr))
			} else if agentVer.Compare(serverVer) < 0 {
				diag.AgentsOutdatedVersion++
				issues = append(issues, fmt.Sprintf("Agent version %s lags behind the recommended %s. Re-run the installer to update.", normalizeVersionLabel(versionStr), recommendedLabel))
			}
		}

		if !hasToken {
			issues = append(issues, "Docker / Podman agent is still using the shared API token. Generate a dedicated token in Settings → Security and rerun the installer.")
		}

		// A known last-seen time older than ten minutes counts as a missed heartbeat.
		if !lastSeen.IsZero() && now.Sub(lastSeen.UTC()) > 10*time.Minute {
			issues = append(issues, fmt.Sprintf("No heartbeat since %s. Verify the agent container is running.", lastSeen.UTC().Format(time.RFC3339)))
		}

		if command := host.Command(); command != nil {
			cmdStatus := strings.ToLower(strings.TrimSpace(command.Status))
			switch cmdStatus {
			case monitoring.DockerCommandStatusQueued, monitoring.DockerCommandStatusDispatched, monitoring.DockerCommandStatusAcknowledged:
				// An in-flight command untouched for over fifteen minutes is treated as stale.
				stale := !command.UpdatedAt.IsZero() && now.Sub(command.UpdatedAt.UTC()) > 15*time.Minute
				if stale {
					diag.AgentsWithStaleCommand++
					issues = append(issues, fmt.Sprintf("Command %s has been pending since %s; consider allowing re-enrolment.", cmdStatus, command.UpdatedAt.UTC().Format(time.RFC3339)))
				} else {
					issues = append(issues, fmt.Sprintf("Command %s is still in progress.", cmdStatus))
				}
			}
		}

		if host.PendingUninstall() {
			diag.AgentsPendingUninstall++
			issues = append(issues, "Docker / Podman agent is pending uninstall; confirm the agent container stopped or clear the flag.")
		}

		if len(issues) == 0 {
			continue
		}

		agentID := host.HostSourceID()
		if agentID == "" {
			agentID = host.ID()
		}
		diag.Attention = append(diag.Attention, DockerAgentAttention{
			AgentID:      agentID,
			Name:         preferredDockerHostName(host),
			Status:       string(rawStatus),
			AgentVersion: versionStr,
			TokenHint:    host.TokenHint(),
			LastSeen:     formatTimeMaybe(lastSeen),
			Issues:       issues,
		})
	}

	diag.AgentsWithoutTokenBinding = withoutToken
	diag.AgentsNeedingAttention = len(diag.Attention)

	// Summary notes derived from the counters.
	if withoutToken > 0 {
		addNote(fmt.Sprintf("%s still %s on the shared API token. Migrate each agent to a dedicated token via Settings → Security and rerun the installer.", dockerPodmanAgentCount(withoutToken), pluralVerb(withoutToken, "relies", "rely")))
	}
	if n := diag.AgentsOutdatedVersion; n > 0 {
		addNote(fmt.Sprintf("%s %s out of date. Re-run the installer from Settings → Infrastructure to upgrade.", dockerPodmanAgentCount(n), pluralVerb(n, "is", "are")))
	}
	if n := diag.AgentsWithoutVersion; n > 0 {
		addNote(fmt.Sprintf("%s %s not reported an agent version yet. Reinstall from Settings → Infrastructure to enable the new command system.", dockerPodmanAgentCount(n), pluralVerb(n, "has", "have")))
	}
	if n := diag.AgentsWithStaleCommand; n > 0 {
		addNote(fmt.Sprintf("%s %s stuck. Use the 'Allow reconnect' action in Settings → Infrastructure to reset.", dockerPodmanAgentCommandCount(n), pluralVerb(n, "appears", "appear")))
	}
	if n := diag.AgentsPendingUninstall; n > 0 {
		addNote(fmt.Sprintf("%s %s pending uninstall. Confirm the uninstall or clear the flag from Settings → Infrastructure.", dockerPodmanAgentCount(n), pluralVerb(n, "is", "are")))
	}
	if diag.AgentsNeedingAttention == 0 {
		addNote("All Docker / Podman agents are reporting with dedicated tokens and the expected version.")
	}
	return diag
}
// dockerPodmanAgentCount renders count followed by "Docker / Podman agent",
// using the singular noun only when count is exactly 1.
func dockerPodmanAgentCount(count int) string {
	noun := "Docker / Podman agents"
	if count == 1 {
		noun = "Docker / Podman agent"
	}
	return fmt.Sprintf("%d %s", count, noun)
}
// dockerPodmanAgentCommandCount renders count followed by
// "Docker / Podman agent command", using the singular noun only when count
// is exactly 1.
func dockerPodmanAgentCommandCount(count int) string {
	noun := "Docker / Podman agent commands"
	if count == 1 {
		noun = "Docker / Podman agent command"
	}
	return fmt.Sprintf("%d %s", count, noun)
}
// pluralVerb picks the verb form that agrees with count: singular when count
// is exactly 1, plural for every other value (including zero and negatives).
func pluralVerb(count int, singular, plural string) string {
	switch count {
	case 1:
		return singular
	default:
		return plural
	}
}
// buildAlertsDiagnostic inspects the alert manager's schedule settings and
// flags a missing cooldown or a disabled grouping window, attaching a note
// for each finding.
//
// It returns nil when m is nil or the monitor has no alert manager.
func buildAlertsDiagnostic(m *monitoring.Monitor) *AlertsDiagnostic {
	if m == nil {
		return nil
	}
	alertManager := m.GetAlertManager()
	if alertManager == nil {
		return nil
	}

	alertCfg := alertManager.GetConfig()
	diag := &AlertsDiagnostic{}

	// addNote records a note once; blank and duplicate notes are dropped.
	addNote := func(note string) {
		if note != "" && !contains(diag.Notes, note) {
			diag.Notes = append(diag.Notes, note)
		}
	}

	// A non-positive cooldown means no cooldown is in effect.
	if alertCfg.Schedule.Cooldown <= 0 {
		diag.MissingCooldown = true
		addNote("Alert cooldown is not configured. Set a cooldown under Alerts → Schedule to prevent alert storms.")
	}

	// A non-positive grouping window means grouping is disabled.
	if alertCfg.Schedule.Grouping.Window <= 0 {
		diag.MissingGroupingWindow = true
		addNote("Alert grouping window is disabled. Configure a grouping window to bundle related alerts.")
	}

	return diag
}
// fingerprintPublicKey parses pub as an authorized_keys-style public key line
// and returns its SHA-256 fingerprint.
//
// Surrounding whitespace is ignored. An error is returned when pub is empty
// after trimming or cannot be parsed.
func fingerprintPublicKey(pub string) (string, error) {
	trimmed := strings.TrimSpace(pub)
	if len(trimmed) == 0 {
		return "", fmt.Errorf("empty public key")
	}

	// Only the key itself is needed; comment, options and rest are discarded.
	parsed, _, _, _, parseErr := ssh.ParseAuthorizedKey([]byte(trimmed))
	if parseErr != nil {
		return "", parseErr
	}
	return ssh.FingerprintSHA256(parsed), nil
}
// resolveUserName maps a numeric user ID to its account name. When the lookup
// fails or yields an empty name it falls back to the form "uid:<id>".
func resolveUserName(uid uint32) string {
	id := strconv.FormatUint(uint64(uid), 10)
	account, err := user.LookupId(id)
	if err != nil || account.Username == "" {
		return "uid:" + id
	}
	return account.Username
}
// resolveGroupName maps a numeric group ID to its group name. When the lookup
// fails or yields no usable name it falls back to the form "gid:<id>".
func resolveGroupName(gid uint32) string {
	id := strconv.FormatUint(uint64(gid), 10)
	group, err := user.LookupGroupId(id)
	if err != nil || group == nil || group.Name == "" {
		return "gid:" + id
	}
	return group.Name
}
// countLegacySSHKeys counts the non-directory entries directly inside dir
// whose names start with "id_".
//
// A missing directory is not an error and yields a count of zero; any other
// failure to read the directory is returned to the caller.
func countLegacySSHKeys(dir string) (int, error) {
	entries, err := os.ReadDir(dir)
	if errors.Is(err, os.ErrNotExist) {
		return 0, nil
	}
	if err != nil {
		return 0, err
	}

	total := 0
	for _, e := range entries {
		if !e.IsDir() && strings.HasPrefix(e.Name(), "id_") {
			total++
		}
	}
	return total, nil
}
// preferredDockerHostName returns the best available display name for host.
// It tries, in order, the name, hostname, agent ID and host source ID, and
// returns the first one that is non-blank after trimming. If all are blank it
// returns the host's ID unmodified. A nil host yields an empty string.
func preferredDockerHostName(host *unifiedresources.DockerHostView) string {
	if host == nil {
		return ""
	}

	// Candidates are evaluated lazily, in priority order.
	candidates := []func() string{
		host.Name,
		host.Hostname,
		host.AgentID,
		host.HostSourceID,
	}
	for _, candidate := range candidates {
		if label := strings.TrimSpace(candidate()); label != "" {
			return label
		}
	}
	return host.ID()
}
// formatTimeMaybe renders t in UTC using RFC 3339, or returns an empty string
// when t is the zero time.
func formatTimeMaybe(t time.Time) string {
	if !t.IsZero() {
		return t.UTC().Format(time.RFC3339)
	}
	return ""
}
// normalizeVersionLabel trims raw and makes sure numeric versions carry a
// leading "v". Values that already start with "v", and values whose first
// byte is not an ASCII digit, are returned trimmed but otherwise unchanged.
// Blank input yields an empty string.
func normalizeVersionLabel(raw string) string {
	label := strings.TrimSpace(raw)
	switch {
	case label == "":
		return ""
	case strings.HasPrefix(label, "v"):
		return label
	case label[0] >= '0' && label[0] <= '9':
		return "v" + label
	default:
		return label
	}
}
// checkVMDiskMonitoring performs diagnostic checks for VM disk monitoring.
//
// It lists the nodes visible through client, collects their VMs, and for
// every running, non-template VM checks whether the guest agent is enabled
// and returns usable filesystem data; problems are recorded per VM in
// ProblematicVMs. The first agent-enabled VM encountered is then probed again
// in detail to fill TestResult, FilesystemsFound and targeted
// Recommendations.
//
// Each per-node and per-VM API call gets its own 10 second timeout derived
// from ctx. The receiver and the third parameter are unused. The returned
// result is never nil.
func (r *Router) checkVMDiskMonitoring(ctx context.Context, client *proxmox.Client, _ string) *VMDiskCheckResult {
	result := &VMDiskCheckResult{
		Recommendations: []string{},
		Permissions:     []string{},
	}

	// isSystemMount reports whether a mountpoint is excluded from disk usage.
	// The per-VM scan and the detailed test both use it so that they agree on
	// which filesystems count as usable.
	isSystemMount := func(mountpoint string) bool {
		return strings.HasPrefix(mountpoint, "/dev") ||
			strings.HasPrefix(mountpoint, "/proc") ||
			strings.HasPrefix(mountpoint, "/sys") ||
			strings.HasPrefix(mountpoint, "/run") ||
			mountpoint == "/boot/efi"
	}

	// reportIssue records a problem found for a single VM.
	reportIssue := func(vm proxmox.VM, issue string) {
		result.ProblematicVMs = append(result.ProblematicVMs, VMDiskIssue{
			VMID:   vm.VMID,
			Name:   vm.Name,
			Status: vm.Status,
			Issue:  issue,
		})
	}

	// Advice shared by every permission-denied outcome.
	permissionAdvice := []string{
		"Ensure API token has PVEAuditor role for baseline access",
		"Add VM.GuestAgent.Audit (PVE 9) or VM.Monitor (PVE 8) privileges; Pulse setup adds these via the PulseMonitor role",
		"Include Sys.Audit when available for Ceph metrics",
	}

	// Get all nodes to check
	nodes, err := client.GetNodes(ctx)
	if err != nil {
		log.Error().Err(err).Msg("VM disk check: failed to get nodes")
		result.TestResult = "Failed to get nodes"
		return result
	}
	if len(nodes) == 0 {
		result.TestResult = "No nodes found"
		return result
	}

	// Collect the VMs of every node; a node whose VM list cannot be fetched
	// is skipped.
	var vms []proxmox.VM
	for _, node := range nodes {
		vmCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
		nodeVMs, err := client.GetVMs(vmCtx, node.Node)
		cancel()
		if err != nil {
			log.Debug().Err(err).Str("node", node.Node).Msg("Failed to get VMs from node")
			continue
		}
		vms = append(vms, nodeVMs...)
	}
	result.VMsFound = len(vms)
	if len(vms) == 0 {
		result.TestResult = "No VMs found to test"
		result.Recommendations = append(result.Recommendations, "Create a test VM to verify disk monitoring")
		return result
	}

	// Check VMs for agent and disk data
	var testVM *proxmox.VM
	var testVMNode string
	result.ProblematicVMs = []VMDiskIssue{}
	for i := range vms {
		vm := vms[i]
		if vm.Template != 0 || vm.Status != "running" {
			continue
		}
		vmNode := strings.TrimSpace(vm.Node)
		if vmNode == "" {
			continue
		}

		// Check if agent is configured
		statusCtx, statusCancel := context.WithTimeout(ctx, 10*time.Second)
		vmStatus, err := client.GetVMStatus(statusCtx, vmNode, vm.VMID)
		statusCancel()
		if err != nil {
			log.Error().Err(err).Int("vmid", vm.VMID).Msg("VM disk check: failed to get VM status")
			reportIssue(vm, "Failed to get VM status")
			continue
		}
		if vmStatus == nil {
			continue
		}
		if vmStatus.Agent.Value <= 0 {
			reportIssue(vm, "Guest agent not enabled in VM configuration")
			continue
		}

		result.VMsWithAgent++
		// The first agent-enabled VM becomes the target of the detailed test,
		// whatever its filesystem query returns.
		if testVM == nil {
			testVM = &vms[i]
			testVMNode = vmNode
		}

		// Try to get filesystem info
		fsCtx, fsCancel := context.WithTimeout(ctx, 10*time.Second)
		fsInfo, err := client.GetVMFSInfo(fsCtx, vmNode, vm.VMID)
		fsCancel()
		switch {
		case err != nil:
			reportIssue(vm, "Agent enabled but failed to get filesystem info")
		case len(fsInfo) == 0:
			reportIssue(vm, "Agent returned no filesystem info")
		default:
			// Check if we get usable disk data
			hasUsableFS := false
			for _, fs := range fsInfo {
				if fs.Type != "tmpfs" && fs.Type != "devtmpfs" &&
					!isSystemMount(fs.Mountpoint) &&
					fs.TotalBytes > 0 {
					hasUsableFS = true
					break
				}
			}
			if hasUsableFS {
				result.VMsWithDiskData++
			} else {
				reportIssue(vm, fmt.Sprintf("Agent returned %d filesystems but none are usable for disk metrics", len(fsInfo)))
			}
		}
	}

	// Perform detailed test on one VM
	if testVM != nil {
		result.TestVMID = testVM.VMID
		result.TestVMName = testVM.Name

		// Check VM status for agent
		statusCtx, statusCancel := context.WithTimeout(ctx, 10*time.Second)
		vmStatus, err := client.GetVMStatus(statusCtx, testVMNode, testVM.VMID)
		statusCancel()
		if err != nil {
			errStr := err.Error()
			log.Error().Err(err).Int("vmid", testVM.VMID).Msg("VM disk check: failed to get VM status for test VM")
			result.TestResult = "Failed to get VM status"
			switch {
			case errors.Is(err, context.DeadlineExceeded) || strings.Contains(errStr, "context deadline exceeded"):
				result.Recommendations = append(result.Recommendations,
					"VM status request timed out; check network connectivity to the node",
					"If this persists, increase the diagnostics timeout or reduce VM load during checks",
				)
			case strings.Contains(errStr, "403") || strings.Contains(errStr, "401"):
				result.Recommendations = append(result.Recommendations, permissionAdvice...)
			default:
				result.Recommendations = append(result.Recommendations,
					"Verify the node is reachable and API token is valid",
				)
			}
		} else if vmStatus == nil || vmStatus.Agent.Value == 0 {
			result.TestResult = "Guest agent not enabled in VM configuration"
			result.Recommendations = append(result.Recommendations,
				"Enable QEMU Guest Agent in VM Options",
				"Install qemu-guest-agent package in the VM")
		} else {
			// Try to get filesystem info
			fsCtx, fsCancel := context.WithTimeout(ctx, 10*time.Second)
			fsInfo, err := client.GetVMFSInfo(fsCtx, testVMNode, testVM.VMID)
			fsCancel()
			if err != nil {
				errStr := err.Error()
				switch {
				case strings.Contains(errStr, "500") || strings.Contains(errStr, "not running"):
					result.TestResult = "Guest agent not running inside VM"
					result.Recommendations = append(result.Recommendations,
						"SSH into VM and run: systemctl status qemu-guest-agent",
						"If not installed: apt install qemu-guest-agent",
						"If installed but not running: systemctl start qemu-guest-agent")
				case strings.Contains(errStr, "403") || strings.Contains(errStr, "401"):
					result.TestResult = "Permission denied accessing guest agent"
					result.Recommendations = append(result.Recommendations, permissionAdvice...)
				case errors.Is(err, context.DeadlineExceeded) || strings.Contains(errStr, "context deadline exceeded"):
					result.TestResult = "Guest agent request timed out"
					result.Recommendations = append(result.Recommendations,
						"Ensure the VM responds to guest agent queries promptly",
						"Consider increasing the diagnostics timeout if the environment is large",
					)
				default:
					log.Error().Err(err).Int("vmid", testVM.VMID).Msg("VM disk check: failed to get guest agent data")
					result.TestResult = "Failed to get guest agent data"
				}
			} else if len(fsInfo) == 0 {
				result.TestResult = "Guest agent returned no filesystem info"
				result.Recommendations = append(result.Recommendations,
					"Guest agent may need restart inside VM",
					"Check VM has mounted filesystems")
			} else {
				// Calculate disk usage from filesystem info
				var totalBytes, usedBytes uint64
				result.FilesystemsFound = []FilesystemDetail{}
				for _, fs := range fsInfo {
					fsDetail := FilesystemDetail{
						Mountpoint: fs.Mountpoint,
						Type:       fs.Type,
						Total:      fs.TotalBytes,
						Used:       fs.UsedBytes,
					}
					// Check if this filesystem should be filtered
					switch {
					case fs.Type == "tmpfs" || fs.Type == "devtmpfs":
						fsDetail.Filtered = true
						fsDetail.Reason = "Special filesystem type"
					case isSystemMount(fs.Mountpoint):
						fsDetail.Filtered = true
						fsDetail.Reason = "System mount point"
					case fs.TotalBytes == 0:
						fsDetail.Filtered = true
						fsDetail.Reason = "Zero total bytes"
					default:
						// This filesystem counts toward disk usage
						totalBytes += fs.TotalBytes
						usedBytes += fs.UsedBytes
					}
					result.FilesystemsFound = append(result.FilesystemsFound, fsDetail)
				}
				if totalBytes > 0 {
					percent := float64(usedBytes) / float64(totalBytes) * 100
					result.TestResult = fmt.Sprintf("SUCCESS: Guest agent working! Disk usage: %.1f%% (%d/%d bytes)",
						percent, usedBytes, totalBytes)
				} else {
					result.TestResult = fmt.Sprintf("Guest agent returned %d filesystems but no usable disk data (all filtered out)", len(fsInfo))
				}
			}
		}
	} else {
		result.TestResult = "No running VMs found to test"
		result.Recommendations = append(result.Recommendations, "Start a VM to test disk monitoring")
	}

	// Add general recommendations based on results
	if result.VMsWithAgent > 0 && result.VMsWithDiskData == 0 {
		result.Recommendations = append(result.Recommendations,
			"Guest agent is configured but not providing disk data",
			"Check guest agent is running inside VMs",
			"Verify API token permissions")
	}
	return result
}
// checkPhysicalDisks performs diagnostic checks for physical disk detection.
//
// It lists the nodes visible through client and queries each online node for
// its disks, using a 10 second timeout per node derived from ctx. Every node
// gets an entry in NodeResults; offline nodes and failed queries carry an
// Error instead of disk data. Recommendations are added once per kind of
// problem, and TestResult holds a one-line summary. The receiver and the
// third parameter are unused. The returned result is never nil.
func (r *Router) checkPhysicalDisks(ctx context.Context, client *proxmox.Client, _ string) *PhysicalDiskCheck {
	result := &PhysicalDiskCheck{
		Recommendations: []string{},
		NodeResults:     []NodeDiskResult{},
	}

	// recommendOnce appends a group of related recommendations unless the
	// group's first entry has already been recorded.
	recommendOnce := func(group ...string) {
		if len(group) == 0 || contains(result.Recommendations, group[0]) {
			return
		}
		result.Recommendations = append(result.Recommendations, group...)
	}

	// Get all nodes
	nodes, err := client.GetNodes(ctx)
	if err != nil {
		log.Error().Err(err).Msg("Physical disk check: failed to get nodes")
		result.TestResult = "Failed to get nodes"
		return result
	}
	result.NodesChecked = len(nodes)

	// Check each node for physical disks
	for _, node := range nodes {
		nodeResult := NodeDiskResult{NodeName: node.Node}

		// Offline nodes are recorded but not queried.
		if node.Status != "online" {
			nodeResult.Error = "Node is offline"
			result.NodeResults = append(result.NodeResults, nodeResult)
			continue
		}

		// Try to get disk list
		diskCtx, diskCancel := context.WithTimeout(ctx, 10*time.Second)
		disks, err := client.GetDisks(diskCtx, node.Node)
		diskCancel()

		switch {
		case err != nil:
			errStr := err.Error()
			log.Error().Err(err).Str("node", node.Node).Msg("Physical disk check: failed to get disks")
			nodeResult.Error = "Failed to query disk information"

			// Provide specific recommendations based on error
			switch {
			case strings.Contains(errStr, "401") || strings.Contains(errStr, "403"):
				nodeResult.APIResponse = "Permission denied"
				recommendOnce(
					"Check API token has sufficient permissions for disk monitoring",
					"Token needs at least PVEAuditor role on the node")
			case errors.Is(err, context.DeadlineExceeded) || strings.Contains(errStr, "context deadline exceeded"):
				nodeResult.APIResponse = "Timeout"
				recommendOnce(
					"Disk query timed out; verify node connectivity and load",
					"Increase diagnostics timeout if nodes are slow to respond")
			case strings.Contains(errStr, "404") || strings.Contains(errStr, "501"):
				nodeResult.APIResponse = "Endpoint not available"
				recommendOnce(
					"Node may be running older Proxmox version without disk API support",
					"Check if node is running on non-standard hardware (Raspberry Pi, etc)")
			default:
				nodeResult.APIResponse = "API error"
			}

		case len(disks) > 0:
			nodeResult.DiskCount = len(disks)
			result.NodesWithDisks++
			result.TotalDisks += len(disks)
			// List disk devices
			for _, disk := range disks {
				nodeResult.DiskDevices = append(nodeResult.DiskDevices, disk.DevPath)
			}

		default:
			nodeResult.DiskCount = len(disks)
			nodeResult.APIResponse = "Empty response (no traditional disks found)"
			// This could be normal for SD card/USB based systems
			recommendOnce(
				"Some nodes returned no disks - may be using SD cards or USB storage",
				"Proxmox disk API only returns SATA/NVMe/SAS disks, not SD cards")
		}

		result.NodeResults = append(result.NodeResults, nodeResult)
	}

	// Generate summary
	switch {
	case result.NodesChecked == 0:
		result.TestResult = "No nodes found to check"
	case result.NodesWithDisks == 0:
		result.TestResult = fmt.Sprintf("Checked %d nodes, none returned physical disks", result.NodesChecked)
	default:
		result.TestResult = fmt.Sprintf("Found %d disks across %d of %d nodes",
			result.TotalDisks, result.NodesWithDisks, result.NodesChecked)
	}
	return result
}
// contains reports whether str occurs in slice, using exact,
// case-sensitive comparison. A nil or empty slice never matches.
func contains(slice []string, str string) bool {
	for i := 0; i < len(slice); i++ {
		if slice[i] == str {
			return true
		}
	}
	return false
}
// containsFold reports whether candidate occurs in slice when both sides are
// compared after trimming surrounding whitespace and lower-casing. A
// candidate that is blank after trimming never matches.
func containsFold(slice []string, candidate string) bool {
	normalize := func(s string) string {
		return strings.ToLower(strings.TrimSpace(s))
	}

	want := normalize(candidate)
	if want == "" {
		return false
	}
	for i := range slice {
		if normalize(slice[i]) == want {
			return true
		}
	}
	return false
}
// interfaceToStringSlice converts a loosely typed value into a []string.
//
// A []string is returned as an independent copy. A []interface{} yields its
// string elements in order, silently dropping elements of any other type.
// Every other input, including nil, yields nil.
func interfaceToStringSlice(value interface{}) []string {
	if strs, ok := value.([]string); ok {
		dup := make([]string, len(strs))
		copy(dup, strs)
		return dup
	}

	items, ok := value.([]interface{})
	if !ok {
		return nil
	}
	out := make([]string, 0, len(items))
	for _, item := range items {
		if s, isString := item.(string); isString {
			out = append(out, s)
		}
	}
	return out
}
// buildAIChatDiagnostic reports the state of the Pulse Assistant: whether it
// is enabled, which chat model is configured, whether its service is running,
// and the service's base URL and port.
//
// It returns nil when cfg is nil. aiHandler may be nil (for example during
// early startup), in which case the diagnostic only carries a note saying the
// handler is not initialized.
func buildAIChatDiagnostic(cfg *config.Config, aiHandler *AIHandler) *AIChatDiagnostic {
	if cfg == nil {
		return nil
	}

	diag := &AIChatDiagnostic{
		Enabled: false,
		Notes:   []string{},
	}

	// NOTE: aiHandler might be nil during early startup
	if aiHandler == nil {
		diag.Notes = append(diag.Notes, "Pulse Assistant handler not initialized")
		return diag
	}

	// Calculate enabled state based on AI config
	ctx := context.Background()
	if aiCfg := aiHandler.GetAIConfig(ctx); aiCfg != nil {
		diag.Enabled = aiCfg.Enabled
		diag.Model = aiCfg.GetChatModel()
	}

	svc := aiHandler.GetService(ctx)
	if svc == nil {
		if diag.Enabled {
			diag.Notes = append(diag.Notes, "Pulse Assistant service is nil")
		}
		return diag
	}

	// Query the running state once so Running, Healthy and MCPConnected
	// always agree with each other.
	running := svc.IsRunning()
	diag.Running = running
	diag.Healthy = running // Consolidate for now

	// Get connection details
	if baseURL := svc.GetBaseURL(); baseURL != "" {
		diag.URL = baseURL
		diag.Port = aiChatPortFromBaseURL(baseURL)
	}

	// Check MCP connection (if we had access to check it)
	diag.MCPConnected = running // Assume connected if running for now

	if !running && diag.Enabled {
		diag.Notes = append(diag.Notes, "Pulse Assistant service is enabled but not running")
	}
	return diag
}

// aiChatPortFromBaseURL extracts the TCP port from a base URL such as
// "http://127.0.0.1:8080/api". It returns 0 when the URL carries no port or
// the port is not a number in the range 1-65535.
//
// The URL is split by hand so that a path, query, fragment, userinfo or a
// bracketed IPv6 host does not prevent the port from being found.
func aiChatPortFromBaseURL(baseURL string) int {
	authority := strings.TrimSpace(baseURL)

	// Drop the scheme, if any.
	if i := strings.Index(authority, "://"); i >= 0 {
		authority = authority[i+len("://"):]
	}
	// Drop path, query and fragment.
	if i := strings.IndexAny(authority, "/?#"); i >= 0 {
		authority = authority[:i]
	}
	// Drop userinfo ("user:pass@").
	if i := strings.LastIndex(authority, "@"); i >= 0 {
		authority = authority[i+1:]
	}

	colon := strings.LastIndex(authority, ":")
	bracket := strings.LastIndex(authority, "]")
	switch {
	case colon < 0:
		// No port separator at all.
		return 0
	case colon < bracket:
		// The last colon is inside an IPv6 literal: "[::1]" has no port.
		return 0
	case bracket < 0 && strings.Count(authority, ":") > 1:
		// Unbracketed IPv6 literal; a port cannot be told apart from the address.
		return 0
	}

	port, err := strconv.Atoi(authority[colon+1:])
	if err != nil || port < 1 || port > 65535 {
		return 0
	}
	return port
}