Pulse/internal/api/diagnostics.go
Pulse Monitor 776fec7018 fix: properly handle PBS connection timeouts with granular timeout settings
The real issue was not the overall timeout duration, but that DNS resolution and TLS handshake could hang indefinitely. Added specific timeouts for:
- DNS resolution/connection: 10 seconds
- TLS handshake: 10 seconds
- Response headers: 10 seconds

This prevents the connection from hanging on DNS lookup (like with pve-backup.lan) or during TLS negotiation, which was causing the 'context deadline exceeded' errors. (addresses #424)
2025-09-06 10:07:10 +00:00

511 lines
No EOL
16 KiB
Go

package api
import (
"context"
"encoding/json"
"fmt"
"net/http"
"runtime"
"strings"
"time"
"github.com/rcourtman/pulse-go-rewrite/internal/updates"
"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
"github.com/rcourtman/pulse-go-rewrite/pkg/pbs"
"github.com/rs/zerolog/log"
)
// DiagnosticsInfo contains comprehensive diagnostic information.
// It is the top-level document that handleDiagnostics builds and writes
// to the client as JSON.
type DiagnosticsInfo struct {
// Version and Runtime come from updates.GetCurrentVersion; when that call
// fails handleDiagnostics falls back to "unknown" and "go".
Version string `json:"version"`
Runtime string `json:"runtime"`
// Uptime is the number of seconds elapsed since the monitor's start time.
Uptime float64 `json:"uptime"`
// Nodes holds one entry per configured PVE instance.
Nodes []NodeDiagnostic `json:"nodes"`
// PBS holds one entry per configured PBS instance.
PBS []PBSDiagnostic `json:"pbs"`
// System is a snapshot of Go runtime and host facts taken at request time.
System SystemDiagnostic `json:"system"`
// Errors is initialized to an empty slice by handleDiagnostics; nothing in
// the code visible in this file appends to it.
Errors []string `json:"errors"`
}
// NodeDiagnostic contains diagnostic info for a Proxmox node.
// handleDiagnostics produces one value per configured PVE instance.
type NodeDiagnostic struct {
// ID and Name are both set to the configured instance name.
ID string `json:"id"`
Name string `json:"name"`
Host string `json:"host"`
// Type is set to "pve" by handleDiagnostics.
Type string `json:"type"`
// AuthMethod is "api_token", "username_password" or "none"; it describes
// which credentials are configured without exposing their values.
AuthMethod string `json:"authMethod"`
// Connected is true when listing the nodes through the API succeeded.
Connected bool `json:"connected"`
// Error carries the client-creation or connection failure, or
// "No authentication configured" when no credentials are set.
Error string `json:"error,omitempty"`
// Details is only set when the connection succeeded and at least one node
// was returned.
Details *NodeDetails `json:"details,omitempty"`
// LastPoll is not populated by the code visible in this file.
LastPoll string `json:"lastPoll,omitempty"`
// ClusterInfo is only set when the connection succeeded.
ClusterInfo *ClusterInfo `json:"clusterInfo,omitempty"`
// VMDiskCheck is the result of checkVMDiskMonitoring for this instance.
VMDiskCheck *VMDiskCheckResult `json:"vmDiskCheck,omitempty"`
}
// NodeDetails contains node-specific details gathered after a successful
// connection to a PVE instance.
type NodeDetails struct {
// NodeCount is the number of nodes returned by the API. Note that its JSON
// key is snake_case ("node_count"), unlike the other keys in this file.
NodeCount int `json:"node_count,omitempty"`
// Version is the PVE version reported in the status of the first node; it
// stays empty when that status call fails or reports no version.
Version string `json:"version,omitempty"`
}
// VMDiskCheckResult contains VM disk monitoring diagnostic results.
// It is produced by checkVMDiskMonitoring.
type VMDiskCheckResult struct {
// VMsFound counts every VM listed across all nodes, running or not.
VMsFound int `json:"vmsFound"`
// VMsWithAgent counts running, non-template VMs whose status reports the
// guest agent as enabled.
VMsWithAgent int `json:"vmsWithAgent"`
// VMsWithDiskData counts VMs whose agent returned at least one filesystem
// usable for disk metrics.
VMsWithDiskData int `json:"vmsWithDiskData"`
// TestVMID and TestVMName identify the single VM chosen for the detailed test.
TestVMID int `json:"testVMID,omitempty"`
TestVMName string `json:"testVMName,omitempty"`
// TestResult is a human-readable summary of the detailed test, or of the
// reason no test could be run.
TestResult string `json:"testResult,omitempty"`
// Permissions is initialized to an empty slice; nothing in the code visible
// in this file appends to it.
Permissions []string `json:"permissions,omitempty"`
// Recommendations lists suggested remediation steps for the user.
Recommendations []string `json:"recommendations,omitempty"`
// ProblematicVMs lists running VMs for which disk data could not be obtained.
ProblematicVMs []VMDiskIssue `json:"problematicVMs,omitempty"`
// FilesystemsFound lists every filesystem the test VM's agent reported,
// including the ones that were filtered out.
FilesystemsFound []FilesystemDetail `json:"filesystemsFound,omitempty"`
}
// VMDiskIssue describes why disk data could not be obtained for one VM.
type VMDiskIssue struct {
VMID int `json:"vmid"`
Name string `json:"name"`
// Status is the VM status as listed by the API (only "running" VMs are
// examined by checkVMDiskMonitoring).
Status string `json:"status"`
// Issue is a human-readable description of the problem.
Issue string `json:"issue"`
}
// FilesystemDetail describes one filesystem reported by a VM's guest agent
// and whether it was counted toward the VM's disk usage.
type FilesystemDetail struct {
Mountpoint string `json:"mountpoint"`
Type string `json:"type"`
// Total and Used are copied from the agent's total and used byte counts.
Total uint64 `json:"total"`
Used uint64 `json:"used"`
// Filtered is true when the filesystem was excluded from the usage totals.
Filtered bool `json:"filtered"`
// Reason explains the exclusion; it is empty when Filtered is false.
Reason string `json:"reason,omitempty"`
}
// ClusterInfo contains cluster information for a PVE instance.
type ClusterInfo struct {
// Nodes is the number of entries in the cluster status response, or 1 when
// cluster status is unavailable (treated as a standalone node).
Nodes int `json:"nodes"`
}
// PBSDiagnostic contains diagnostic info for a PBS instance.
// handleDiagnostics produces one value per configured PBS instance.
type PBSDiagnostic struct {
// ID and Name are both set to the configured instance name.
ID string `json:"id"`
Name string `json:"name"`
Host string `json:"host"`
// Connected is true when the version query succeeded.
Connected bool `json:"connected"`
// Error carries the client-creation or version-check failure.
Error string `json:"error,omitempty"`
// Details is only set when the version query succeeded.
Details *PBSDetails `json:"details,omitempty"`
}
// PBSDetails contains PBS-specific details gathered after a successful
// version query.
type PBSDetails struct {
// Version is the version string returned by the PBS version query.
Version string `json:"version,omitempty"`
}
// SystemDiagnostic contains system-level diagnostic info taken from the Go
// runtime at the time of the request.
type SystemDiagnostic struct {
// OS and Arch are runtime.GOOS and runtime.GOARCH.
OS string `json:"os"`
Arch string `json:"arch"`
GoVersion string `json:"goVersion"`
NumCPU int `json:"numCPU"`
NumGoroutine int `json:"numGoroutine"`
// MemoryMB is runtime.MemStats.Alloc converted to mebibytes with integer
// division.
MemoryMB uint64 `json:"memoryMB"`
}
// handleDiagnostics returns comprehensive diagnostic information.
//
// It gathers version, uptime and Go runtime facts, then actively probes every
// configured PVE and PBS instance and reports the outcome per instance. All
// probes share a single 10-second deadline that is also cancelled when the
// client goes away, so a slow or unreachable host cannot hold the handler
// open indefinitely.
//
// The response is a JSON-encoded DiagnosticsInfo. Credentials are never
// included; only the kind of authentication in use is reported.
func (r *Router) handleDiagnostics(w http.ResponseWriter, req *http.Request) {
	// Derive from the request context so that a disconnected client cancels
	// the outbound probes instead of letting them run to the full timeout.
	ctx, cancel := context.WithTimeout(req.Context(), 10*time.Second)
	defer cancel()

	// Use empty (non-nil) slices so the JSON always contains arrays rather
	// than null when nothing is configured, matching Errors.
	diag := DiagnosticsInfo{
		Nodes:  []NodeDiagnostic{},
		PBS:    []PBSDiagnostic{},
		Errors: []string{},
	}

	// Version info
	if versionInfo, err := updates.GetCurrentVersion(); err == nil {
		diag.Version = versionInfo.Version
		diag.Runtime = versionInfo.Runtime
	} else {
		diag.Version = "unknown"
		diag.Runtime = "go"
	}

	// Uptime
	diag.Uptime = time.Since(r.monitor.GetStartTime()).Seconds()

	// System info
	var memStats runtime.MemStats
	runtime.ReadMemStats(&memStats)
	diag.System = SystemDiagnostic{
		OS:           runtime.GOOS,
		Arch:         runtime.GOARCH,
		GoVersion:    runtime.Version(),
		NumCPU:       runtime.NumCPU(),
		NumGoroutine: runtime.NumGoroutine(),
		MemoryMB:     memStats.Alloc / 1024 / 1024,
	}

	// Test each configured node
	for _, node := range r.config.PVEInstances {
		nodeDiag := NodeDiagnostic{
			ID:   node.Name,
			Name: node.Name,
			Host: node.Host,
			Type: "pve",
		}

		// Determine auth method (sanitized - don't expose actual values)
		switch {
		case node.TokenName != "" && node.TokenValue != "":
			nodeDiag.AuthMethod = "api_token"
		case node.User != "" && node.Password != "":
			nodeDiag.AuthMethod = "username_password"
		default:
			nodeDiag.AuthMethod = "none"
			nodeDiag.Error = "No authentication configured"
		}

		// Test connection
		client, err := proxmox.NewClient(proxmox.ClientConfig{
			Host:       node.Host,
			User:       node.User,
			Password:   node.Password,
			TokenName:  node.TokenName,
			TokenValue: node.TokenValue,
			VerifySSL:  node.VerifySSL,
		})
		if err != nil {
			nodeDiag.Error = err.Error()
			diag.Nodes = append(diag.Nodes, nodeDiag)
			continue
		}

		// Try to get nodes first (this should work for both clustered and standalone)
		nodes, err := client.GetNodes(ctx)
		if err != nil {
			nodeDiag.Error = "Failed to connect to Proxmox API: " + err.Error()
			diag.Nodes = append(diag.Nodes, nodeDiag)
			continue
		}
		nodeDiag.Connected = true

		// Set node details
		if len(nodes) > 0 {
			nodeDiag.Details = &NodeDetails{NodeCount: len(nodes)}
			// Get version from first node; a failure here is not fatal, the
			// version is simply left empty.
			if status, err := client.GetNodeStatus(ctx, nodes[0].Node); err == nil && status != nil && status.PVEVersion != "" {
				nodeDiag.Details.Version = status.PVEVersion
			}
		}

		// Try to get cluster status (this may fail for standalone nodes, which is OK)
		if clusterStatus, err := client.GetClusterStatus(ctx); err == nil {
			nodeDiag.ClusterInfo = &ClusterInfo{Nodes: len(clusterStatus)}
		} else {
			// Standalone node or cluster status not available.
			// This is not an error - standalone nodes don't have cluster status.
			log.Debug().Str("node", node.Name).Msg("Cluster status not available (likely standalone node)")
			nodeDiag.ClusterInfo = &ClusterInfo{Nodes: 1}
		}

		// Run VM disk monitoring check
		nodeDiag.VMDiskCheck = r.checkVMDiskMonitoring(ctx, client, node.Name)

		diag.Nodes = append(diag.Nodes, nodeDiag)
	}

	// Test PBS instances
	for _, pbsNode := range r.config.PBSInstances {
		pbsDiag := PBSDiagnostic{
			ID:   pbsNode.Name,
			Name: pbsNode.Name,
			Host: pbsNode.Host,
		}

		// Test connection
		client, err := pbs.NewClient(pbs.ClientConfig{
			Host:        pbsNode.Host,
			User:        pbsNode.User,
			Password:    pbsNode.Password,
			TokenName:   pbsNode.TokenName,
			TokenValue:  pbsNode.TokenValue,
			Fingerprint: pbsNode.Fingerprint,
			VerifySSL:   pbsNode.VerifySSL,
		})
		if err != nil {
			pbsDiag.Error = err.Error()
			diag.PBS = append(diag.PBS, pbsDiag)
			continue
		}

		// Try to get version
		if version, err := client.GetVersion(ctx); err != nil {
			pbsDiag.Error = "Connection established but version check failed: " + err.Error()
		} else {
			pbsDiag.Connected = true
			pbsDiag.Details = &PBSDetails{Version: version.Version}
		}
		diag.PBS = append(diag.PBS, pbsDiag)
	}

	// Add any recent errors from logs (this would need a log collector)
	// For now, just check basic connectivity

	// Encode into memory first: once any byte of the body has been written
	// the status code is committed, so an encoding failure could no longer be
	// reported as a 500.
	payload, err := json.Marshal(diag)
	if err != nil {
		log.Error().Err(err).Msg("Failed to encode diagnostics")
		http.Error(w, "Failed to generate diagnostics", http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	// The trailing newline keeps the body identical to what json.Encoder emits.
	if _, err := w.Write(append(payload, '\n')); err != nil {
		log.Warn().Err(err).Msg("Failed to write diagnostics response")
	}
}
// vmFilesystemFilterReason reports why a guest filesystem is excluded from VM
// disk-usage totals, or "" when the filesystem counts toward usage.
//
// The rules are checked in a fixed order (type, mount point, size) so that a
// filesystem matching several rules always gets the same reason.
func vmFilesystemFilterReason(fsType, mountpoint string, totalBytes uint64) string {
	switch {
	case fsType == "tmpfs" || fsType == "devtmpfs":
		return "Special filesystem type"
	case strings.HasPrefix(mountpoint, "/dev"),
		strings.HasPrefix(mountpoint, "/proc"),
		strings.HasPrefix(mountpoint, "/sys"),
		strings.HasPrefix(mountpoint, "/run"),
		mountpoint == "/boot/efi":
		return "System mount point"
	case totalBytes == 0:
		return "Zero total bytes"
	default:
		return ""
	}
}

// checkVMDiskMonitoring performs diagnostic checks for VM disk monitoring.
//
// It lists the VMs on every node reachable through client, then for each
// running, non-template VM checks whether the guest agent is enabled and
// whether it returns filesystem data usable for disk metrics. The first VM
// with the agent enabled is additionally put through a detailed test whose
// outcome is summarized in TestResult together with remediation hints in
// Recommendations.
//
// The function never returns nil; failures are reported inside the result.
// nodeName is accepted for the caller's convenience but is currently unused.
func (r *Router) checkVMDiskMonitoring(ctx context.Context, client *proxmox.Client, nodeName string) *VMDiskCheckResult {
	result := &VMDiskCheckResult{
		Recommendations: []string{},
		Permissions:     []string{},
	}

	// Get all nodes to check
	nodes, err := client.GetNodes(ctx)
	if err != nil {
		result.TestResult = "Failed to get nodes: " + err.Error()
		return result
	}
	if len(nodes) == 0 {
		result.TestResult = "No nodes found"
		return result
	}

	// Collect the VMs of every node and remember which node listed each one,
	// so later per-VM calls do not have to re-query every node to find it.
	type locatedVM struct {
		vm   proxmox.VM
		node string
	}
	var located []locatedVM
	for _, node := range nodes {
		nodeVMs, err := client.GetVMs(ctx, node.Node)
		if err != nil {
			log.Debug().Err(err).Str("node", node.Node).Msg("Failed to get VMs from node")
			continue
		}
		for _, vm := range nodeVMs {
			located = append(located, locatedVM{vm: vm, node: node.Node})
		}
	}

	result.VMsFound = len(located)
	if len(located) == 0 {
		result.TestResult = "No VMs found to test"
		result.Recommendations = append(result.Recommendations, "Create a test VM to verify disk monitoring")
		return result
	}

	result.ProblematicVMs = []VMDiskIssue{}
	addIssue := func(vm *proxmox.VM, issue string) {
		result.ProblematicVMs = append(result.ProblematicVMs, VMDiskIssue{
			VMID:   vm.VMID,
			Name:   vm.Name,
			Status: vm.Status,
			Issue:  issue,
		})
	}

	// Check VMs for agent and disk data
	var testVM *proxmox.VM
	var testVMNode string
	for i := range located {
		// Point at the slice element rather than a range variable so that
		// testVM keeps referring to the chosen VM on every Go version.
		vm := &located[i].vm
		vmNode := located[i].node
		if vm.Template != 0 || vm.Status != "running" {
			continue
		}

		// Check if agent is configured
		vmStatus, err := client.GetVMStatus(ctx, vmNode, vm.VMID)
		switch {
		case err != nil:
			addIssue(vm, "Failed to get VM status: "+err.Error())
			continue
		case vmStatus == nil:
			// No error and no status: nothing can be said about this VM.
			continue
		case vmStatus.Agent <= 0:
			addIssue(vm, "Guest agent not enabled in VM configuration")
			continue
		}

		result.VMsWithAgent++

		// Try to get filesystem info
		fsInfo, err := client.GetVMFSInfo(ctx, vmNode, vm.VMID)
		switch {
		case err != nil:
			addIssue(vm, "Agent enabled but failed to get filesystem info: "+err.Error())
		case len(fsInfo) == 0:
			addIssue(vm, "Agent returned no filesystem info")
		default:
			// Check if we get usable disk data, using the same filter as the
			// detailed test below so both parts of the report agree.
			hasUsableFS := false
			for _, fs := range fsInfo {
				if vmFilesystemFilterReason(fs.Type, fs.Mountpoint, fs.TotalBytes) == "" {
					hasUsableFS = true
					break
				}
			}
			if hasUsableFS {
				result.VMsWithDiskData++
			} else {
				addIssue(vm, fmt.Sprintf("Agent returned %d filesystems but none are usable for disk metrics", len(fsInfo)))
			}
		}

		// The first VM with the agent enabled becomes the detailed test subject.
		if testVM == nil {
			testVM = vm
			testVMNode = vmNode
		}
	}

	// Perform detailed test on one VM
	if testVM == nil {
		result.TestResult = "No running VMs found to test"
		result.Recommendations = append(result.Recommendations, "Start a VM to test disk monitoring")
	} else {
		result.TestVMID = testVM.VMID
		result.TestVMName = testVM.Name

		// Check VM status for agent
		vmStatus, err := client.GetVMStatus(ctx, testVMNode, testVM.VMID)
		switch {
		case err != nil:
			result.TestResult = "Failed to get VM status: " + err.Error()
			result.Recommendations = append(result.Recommendations, "Check API token has PVEAuditor role")
		case vmStatus == nil || vmStatus.Agent == 0:
			result.TestResult = "Guest agent not enabled in VM configuration"
			result.Recommendations = append(result.Recommendations,
				"Enable QEMU Guest Agent in VM Options",
				"Install qemu-guest-agent package in the VM")
		default:
			// Try to get filesystem info
			fsInfo, err := client.GetVMFSInfo(ctx, testVMNode, testVM.VMID)
			switch {
			case err != nil:
				// The client only exposes the failure as text, so the cause
				// is classified by looking for status codes in the message.
				errStr := err.Error()
				switch {
				case strings.Contains(errStr, "500") || strings.Contains(errStr, "not running"):
					result.TestResult = "Guest agent not running inside VM"
					result.Recommendations = append(result.Recommendations,
						"SSH into VM and run: systemctl status qemu-guest-agent",
						"If not installed: apt install qemu-guest-agent",
						"If installed but not running: systemctl start qemu-guest-agent")
				case strings.Contains(errStr, "403") || strings.Contains(errStr, "401"):
					result.TestResult = "Permission denied accessing guest agent"
					result.Recommendations = append(result.Recommendations,
						"Ensure API token has PVEAuditor role",
						"For PVE 9: PVEAuditor includes VM.GuestAgent.Audit",
						"For PVE 8: May need additional VM.Monitor permission")
				default:
					result.TestResult = "Failed to get guest agent data: " + errStr
				}
			case len(fsInfo) == 0:
				result.TestResult = "Guest agent returned no filesystem info"
				result.Recommendations = append(result.Recommendations,
					"Guest agent may need restart inside VM",
					"Check VM has mounted filesystems")
			default:
				// Calculate disk usage from filesystem info
				var totalBytes, usedBytes uint64
				result.FilesystemsFound = []FilesystemDetail{}
				for _, fs := range fsInfo {
					reason := vmFilesystemFilterReason(fs.Type, fs.Mountpoint, fs.TotalBytes)
					result.FilesystemsFound = append(result.FilesystemsFound, FilesystemDetail{
						Mountpoint: fs.Mountpoint,
						Type:       fs.Type,
						Total:      fs.TotalBytes,
						Used:       fs.UsedBytes,
						Filtered:   reason != "",
						Reason:     reason,
					})
					if reason == "" {
						// This filesystem counts toward disk usage
						totalBytes += fs.TotalBytes
						usedBytes += fs.UsedBytes
					}
				}

				if totalBytes > 0 {
					percent := float64(usedBytes) / float64(totalBytes) * 100
					result.TestResult = fmt.Sprintf("SUCCESS: Guest agent working! Disk usage: %.1f%% (%d/%d bytes)",
						percent, usedBytes, totalBytes)
				} else {
					result.TestResult = fmt.Sprintf("Guest agent returned %d filesystems but no usable disk data (all filtered out)", len(fsInfo))
				}
			}
		}
	}

	// Add general recommendations based on results
	if result.VMsWithAgent > 0 && result.VMsWithDiskData == 0 {
		result.Recommendations = append(result.Recommendations,
			"Guest agent is configured but not providing disk data",
			"Check guest agent is running inside VMs",
			"Verify API token permissions")
	}

	return result
}