diff --git a/frontend-modern/src/components/Settings/Settings.tsx b/frontend-modern/src/components/Settings/Settings.tsx index de4eb897a..27235697b 100644 --- a/frontend-modern/src/components/Settings/Settings.tsx +++ b/frontend-modern/src/components/Settings/Settings.tsx @@ -45,8 +45,7 @@ import { ReportingPanel } from './ReportingPanel'; import { PveNodesTable, PbsNodesTable, - PmgNodesTable, - type TemperatureTransportInfo, + PmgNodesTable } from './ConfiguredNodeTables'; import { SettingsSectionNav } from './SettingsSectionNav'; import { SettingsAPI } from '@/api/settings'; @@ -140,58 +139,6 @@ interface SystemDiagnostic { memoryMB: number; } -interface TemperatureProxyHTTPStatus { - node: string; - url?: string; - reachable: boolean; - error?: string; -} - -interface TemperatureProxyControlPlaneState { - instance: string; - lastSync?: string; - refreshIntervalSeconds?: number; - secondsBehind?: number; - status?: string; -} - -interface TemperatureProxySocketHost { - node?: string; - host?: string; - cooldownUntil?: string; - secondsRemaining?: number; - lastError?: string; -} - -type TemperatureSocketCooldownInfo = { - secondsRemaining?: number; - until?: string; - lastError?: string; -}; - -// HostProxySummary removed - pulse-sensor-proxy is deprecated in v5 - -interface TemperatureProxyDiagnostic { - legacySSHDetected: boolean; - recommendProxyUpgrade: boolean; - socketFound: boolean; - socketPath?: string; - socketPermissions?: string; - socketOwner?: string; - socketGroup?: string; - proxyReachable?: boolean; - proxyVersion?: string; - proxyPublicKeySha256?: string; - proxySshDirectory?: string; - legacySshKeyCount?: number; - proxyCapabilities?: string[]; - notes?: string[]; - httpProxies?: TemperatureProxyHTTPStatus[]; - controlPlaneEnabled?: boolean; - controlPlaneStates?: TemperatureProxyControlPlaneState[]; - socketHostCooldowns?: TemperatureProxySocketHost[]; -} - interface APITokenSummary { id: string; name: string; @@ -278,7 +225,6 @@ interface DiagnosticsData { nodes: DiagnosticsNode[]; pbs: DiagnosticsPBS[]; system: SystemDiagnostic; - temperatureProxy?: TemperatureProxyDiagnostic | null; apiTokens?: APITokenDiagnostic | null; dockerAgents?: DockerAgentDiagnostic | null; alerts?: AlertsDiagnostic | null; @@ -659,7 +605,6 @@ const Settings: Component = (props) => { const [envOverrides, setEnvOverrides] = createSignal>({}); const [temperatureMonitoringEnabled, setTemperatureMonitoringEnabled] = createSignal(true); const [savingTemperatureSetting, setSavingTemperatureSetting] = createSignal(false); - // hostProxyStatus removed - pulse-sensor-proxy is deprecated in v5 const [hideLocalLogin, setHideLocalLogin] = createSignal(false); const [savingHideLocalLogin, setSavingHideLocalLogin] = createSignal(false); @@ -876,7 +821,7 @@ const Settings: Component = (props) => { }; // Diagnostics - const [diagnosticsData, setDiagnosticsData] = createSignal(null); + const [_diagnosticsData, setDiagnosticsData] = createSignal(null); const [_runningDiagnostics, setRunningDiagnostics] = createSignal(false); // Security @@ -926,112 +871,12 @@ const Settings: Component = (props) => { }; - const normalizeHostKey = (value?: string | null) => { - if (!value) { - return ''; - } - let result = value.trim().toLowerCase(); - if (!result) { - return ''; - } - result = result.replace(/^https?:\/\//, ''); - const slashIndex = result.indexOf('/'); - if (slashIndex !== -1) { - result = result.slice(0, slashIndex); - } - const colonIndex = result.indexOf(':'); - if (colonIndex !== -1) { - result = result.slice(0, colonIndex); - } - return result; - }; - - const emitTemperatureProxyWarnings = (diag: DiagnosticsData | null) => { - if (!diag?.temperatureProxy) { - return; - } - if (diag.temperatureProxy.httpProxies) { - const failing = (diag.temperatureProxy.httpProxies as TemperatureProxyHTTPStatus[]).filter( - (proxy) => proxy && proxy.node && !proxy.reachable, - ); - if (failing.length > 0) { - const nodes = failing.map((proxy) => proxy.node || 'Unknown').join(', '); - notificationStore.warning(`Pulse cannot reach HTTPS temperature proxy on: ${nodes}`); - } - } - if (diag.temperatureProxy.controlPlaneStates) { - const stale = (diag.temperatureProxy.controlPlaneStates as TemperatureProxyControlPlaneState[]).filter( - (state) => state && (state.status === 'stale' || state.status === 'offline'), - ); - if (stale.length > 0) { - const names = stale.map((state) => state.instance || 'Proxy').join(', '); - notificationStore.warning(`Temperature proxy control plane is behind on: ${names}`); - } - } - if (diag.temperatureProxy.socketHostCooldowns) { - const cooling = (diag.temperatureProxy.socketHostCooldowns as TemperatureProxySocketHost[]).filter( - (entry) => entry && (entry.node || entry.host), - ); - if (cooling.length > 0) { - const hosts = cooling.map((entry) => entry.node || entry.host || 'proxy').join(', '); - notificationStore.warning(`Temperature proxy is cooling down the following hosts: ${hosts}`); - } - } - }; - - const temperatureTransportInfo = createMemo(() => { - const diag = diagnosticsData(); - if (!diag?.temperatureProxy) { - return null; - } - const httpMap: TemperatureTransportInfo['httpMap'] = {}; - const proxies = diag.temperatureProxy.httpProxies || []; - proxies.forEach((proxy) => { - if (!proxy || !proxy.node) { - return; - } - const key = proxy.node.trim().toLowerCase(); - if (!key) { - return; - } - httpMap[key] = { - reachable: Boolean(proxy.reachable), - error: proxy.error || undefined, - url: proxy.url || undefined, - }; - }); - const socketStatus: TemperatureTransportInfo['socketStatus'] = - diag.temperatureProxy.socketFound && diag.temperatureProxy.proxyReachable - ? 'healthy' - : diag.temperatureProxy.socketFound - ? 'error' - : 'missing'; - const cooldowns: Record = {}; - const socketHosts = diag.temperatureProxy.socketHostCooldowns || []; - (socketHosts as TemperatureProxySocketHost[]).forEach((entry) => { - const key = normalizeHostKey(entry.node) || normalizeHostKey(entry.host); - if (!key) { - return; - } - cooldowns[key] = { - secondsRemaining: entry.secondsRemaining, - until: entry.cooldownUntil, - lastError: entry.lastError || undefined, - }; - }); - return { httpMap, socketStatus, socketCooldowns: cooldowns }; - }); - - - const runDiagnostics = async () => { setRunningDiagnostics(true); try { const response = await apiFetch('/api/diagnostics'); const diag = await response.json(); setDiagnosticsData(diag); - emitTemperatureProxyWarnings(diag); - // hostProxyStatus removed - pulse-sensor-proxy is deprecated in v5 } catch (err) { logger.error('Failed to fetch diagnostics', err); notificationStore.error('Failed to run diagnostics'); @@ -1040,8 +885,6 @@ const Settings: Component = (props) => { } }; - // refreshHostProxyStatus removed - pulse-sensor-proxy is deprecated in v5 - createEffect(() => { if (typeof window === 'undefined') { return; @@ -2730,7 +2573,6 @@ const Settings: Component = (props) => { stateNodes={state.nodes ?? []} stateHosts={state.hosts ?? []} globalTemperatureMonitoringEnabled={temperatureMonitoringEnabled()} - temperatureTransports={temperatureTransportInfo()} onTestConnection={testNodeConnection} onEdit={(node) => { setEditingNode(node); diff --git a/frontend-modern/src/components/shared/NodeSummaryTable.tsx b/frontend-modern/src/components/shared/NodeSummaryTable.tsx index 653e0ae92..60a82d368 100644 --- a/frontend-modern/src/components/shared/NodeSummaryTable.tsx +++ b/frontend-modern/src/components/shared/NodeSummaryTable.tsx @@ -545,6 +545,17 @@ export const NodeSummaryTable: Component = (props) => { +Agent + 0}> + = 10 + ? 'bg-orange-100 text-orange-700 dark:bg-orange-900/30 dark:text-orange-400' + : 'bg-yellow-100 text-yellow-700 dark:bg-yellow-900/30 dark:text-yellow-400' + }`} + title={`${node!.pendingUpdates} pending apt update${node!.pendingUpdates !== 1 ? 's' : ''}`} + > + {node!.pendingUpdates} updates + + PBS diff --git a/frontend-modern/src/types/api.ts b/frontend-modern/src/types/api.ts index 77dc021e6..29c7851ed 100644 --- a/frontend-modern/src/types/api.ts +++ b/frontend-modern/src/types/api.ts @@ -148,6 +148,8 @@ export interface Node { cpuInfo: CPUInfo; temperature?: Temperature; // CPU/NVMe temperatures temperatureMonitoringEnabled?: boolean | null; // Per-node temperature monitoring override + pendingUpdates?: number; // Number of pending apt updates + pendingUpdatesCheckedAt?: string; // When updates were last checked lastSeen: string; connectionHealth: string; isClusterMember?: boolean; // True if part of a cluster diff --git a/internal/models/models.go b/internal/models/models.go index 84cb54f64..436ace46b 100644 --- a/internal/models/models.go +++ b/internal/models/models.go @@ -89,6 +89,10 @@ type Node struct { IsClusterMember bool `json:"isClusterMember"` // True if part of a cluster ClusterName string `json:"clusterName"` // Name of cluster (empty if standalone) + // Package updates - polled less frequently (every 30 mins) + PendingUpdates int `json:"pendingUpdates"` // Number of pending apt updates + PendingUpdatesCheckedAt time.Time `json:"pendingUpdatesCheckedAt,omitempty"` // When updates were last checked + // Linking: When a host agent is running on this PVE node, link them together LinkedHostAgentID string `json:"linkedHostAgentId,omitempty"` // ID of the host agent running on this node } @@ -1061,7 +1065,7 @@ type Temperature struct { MaxRecorded time.Time `json:"maxRecorded,omitempty"` // When maximum temperature was recorded Cores []CoreTemp `json:"cores,omitempty"` // Individual core temperatures GPU []GPUTemp `json:"gpu,omitempty"` // GPU temperatures - NVMe []NVMeTemp `json:"nvme,omitempty"` // NVMe drive temperatures (legacy, from sensor proxy) + NVMe []NVMeTemp `json:"nvme,omitempty"` // NVMe drive temperatures SMART []DiskTemp `json:"smart,omitempty"` // Physical disk temperatures from SMART data Available bool `json:"available"` // Whether any temperature data is available HasCPU bool `json:"hasCPU"` // Whether CPU temperature data is available diff --git a/internal/monitoring/monitor.go b/internal/monitoring/monitor.go index e00c5e6e8..1ac6ef118 100644 --- a/internal/monitoring/monitor.go +++ b/internal/monitoring/monitor.go @@ -85,6 +85,7 @@ type PVEClientInterface interface { GetZFSPoolStatus(ctx context.Context, node string) ([]proxmox.ZFSPoolStatus, error) GetZFSPoolsWithDetails(ctx context.Context, node string) ([]proxmox.ZFSPoolInfo, error) GetDisks(ctx context.Context, node string) ([]proxmox.Disk, error) + GetNodePendingUpdates(ctx context.Context, node string) ([]proxmox.AptPackage, error) GetCephStatus(ctx context.Context) (*proxmox.CephStatus, error) GetCephDF(ctx context.Context) (*proxmox.CephDF, error) } @@ -770,8 +771,9 @@ type Monitor struct { instanceInfoCache map[string]*instanceInfo pollStatusMap map[string]*pollStatus dlqInsightMap map[string]*dlqInsight - nodeLastOnline map[string]time.Time // Track last time each node was seen online (for grace period) - resourceStore ResourceStoreInterface // Optional unified resource store for polling optimization + nodeLastOnline map[string]time.Time // Track last time each node was seen online (for grace period) + nodePendingUpdatesCache map[string]pendingUpdatesCache // Cache pending updates per node (checked every 30 min) + resourceStore ResourceStoreInterface // Optional unified resource store for polling optimization mockMetricsCancel context.CancelFunc mockMetricsWg sync.WaitGroup dockerChecker DockerChecker // Optional Docker checker for LXC containers @@ -787,6 +789,15 @@ type rrdMemCacheEntry struct { fetchedAt time.Time } +// pendingUpdatesCache caches apt pending updates count per node +type pendingUpdatesCache struct { + count int + checkedAt time.Time +} + +// TTL for pending updates cache (30 minutes - balance between freshness and API load) +const pendingUpdatesCacheTTL = 30 * time.Minute + // agentProfileCacheEntry caches agent profiles and assignments to avoid disk I/O on every agent report. // TTL is 60 seconds to balance freshness with performance. type agentProfileCacheEntry struct { @@ -3388,13 +3399,6 @@ func New(cfg *config.Config) (*Monitor, error) { // Security warning if running in container with SSH temperature monitoring checkContainerizedTempMonitoring() - if cfg.TemperatureMonitoringEnabled { - isContainer := os.Getenv("PULSE_DOCKER") == "true" || system.InContainer() - if isContainer && tempCollector != nil && !tempCollector.SocketProxyAvailable() { - log.Warn().Msg("Temperature monitoring is enabled but the container does not have access to pulse-sensor-proxy. Install the proxy on the host or disable temperatures until it is available.") - } - } - stalenessTracker := NewStalenessTracker(getPollMetrics()) stalenessTracker.SetBounds(cfg.AdaptivePollingBaseInterval, cfg.AdaptivePollingMaxInterval) taskQueue := NewTaskQueue() @@ -3546,6 +3550,7 @@ func New(cfg *config.Config) (*Monitor, error) { pollStatusMap: make(map[string]*pollStatus), dlqInsightMap: make(map[string]*dlqInsight), nodeLastOnline: make(map[string]time.Time), + nodePendingUpdatesCache: make(map[string]pendingUpdatesCache), } m.breakerBaseRetry = 5 * time.Second diff --git a/internal/monitoring/monitor_memory_test.go b/internal/monitoring/monitor_memory_test.go index c610640a9..48d5d38f1 100644 --- a/internal/monitoring/monitor_memory_test.go +++ b/internal/monitoring/monitor_memory_test.go @@ -132,6 +132,10 @@ func (s *stubPVEClient) GetCephDF(ctx context.Context) (*proxmox.CephDF, error) return nil, nil } +func (s *stubPVEClient) GetNodePendingUpdates(ctx context.Context, node string) ([]proxmox.AptPackage, error) { + return nil, nil +} + func floatPtr(v float64) *float64 { return &v } func TestPollPVEInstanceUsesRRDMemUsedFallback(t *testing.T) { diff --git a/internal/monitoring/monitor_polling.go b/internal/monitoring/monitor_polling.go index 9d97f9591..d9d9a6534 100644 --- a/internal/monitoring/monitor_polling.go +++ b/internal/monitoring/monitor_polling.go @@ -2205,7 +2205,7 @@ func (m *Monitor) pollPVENode( } // If no host agent temp or we need additional data (SMART), try SSH/proxy collection - var proxyTemp *models.Temperature + var sshTemp *models.Temperature var err error if m.tempCollector != nil { // Temperature collection is best-effort - use a short timeout to avoid blocking node polling @@ -2248,15 +2248,11 @@ func (m *Monitor) pollPVENode( sshHost = node.Node } - // Skip SSH/proxy collection if we already have host agent data and no proxy is configured - // (proxy might provide additional SMART data that host agent doesn't have) - skipProxyCollection := hostAgentTemp != nil && - strings.TrimSpace(instanceCfg.TemperatureProxyURL) == "" && - !m.HasSocketTemperatureProxy() + // Skip SSH collection if we already have host agent data. + skipSSHCollection := hostAgentTemp != nil - if !skipProxyCollection { - // Use HTTP proxy if configured for this instance, otherwise fall back to socket/SSH - proxyTemp, err = m.tempCollector.CollectTemperatureWithProxy(tempCtx, sshHost, node.Node, instanceCfg.TemperatureProxyURL, instanceCfg.TemperatureProxyToken) + if !skipSSHCollection { + sshTemp, err = m.tempCollector.CollectTemperature(tempCtx, sshHost, node.Node) if err != nil && hostAgentTemp == nil { log.Debug(). Str("node", node.Node). @@ -2267,25 +2263,25 @@ func (m *Monitor) pollPVENode( } } - // Debug: log proxy temp details before merge - if proxyTemp != nil { + // Debug: log SSH temp details before merge + if sshTemp != nil { log.Debug(). Str("node", node.Node). - Bool("proxyTempAvailable", proxyTemp.Available). - Bool("proxyHasSMART", proxyTemp.HasSMART). - Int("proxySMARTCount", len(proxyTemp.SMART)). - Bool("proxyHasNVMe", proxyTemp.HasNVMe). - Int("proxyNVMeCount", len(proxyTemp.NVMe)). - Msg("Proxy temperature data before merge") + Bool("sshTempAvailable", sshTemp.Available). + Bool("sshHasSMART", sshTemp.HasSMART). + Int("sshSMARTCount", len(sshTemp.SMART)). + Bool("sshHasNVMe", sshTemp.HasNVMe). + Int("sshNVMeCount", len(sshTemp.NVMe)). + Msg("SSH temperature data before merge") } else { log.Debug(). Str("node", node.Node). - Msg("Proxy temperature data is nil") + Msg("SSH temperature data is nil") } } - // Merge host agent and proxy temperatures - temp := mergeTemperatureData(hostAgentTemp, proxyTemp) + // Merge host agent and SSH temperatures + temp := mergeTemperatureData(hostAgentTemp, sshTemp) if temp != nil && temp.Available { // Get the current CPU temperature (prefer package, fall back to max) @@ -2333,11 +2329,11 @@ func (m *Monitor) pollPVENode( modelNode.Temperature = temp // Determine source for logging - tempSource := "proxy/ssh" - if hostAgentTemp != nil && proxyTemp == nil { + tempSource := "ssh" + if hostAgentTemp != nil && sshTemp == nil { tempSource = "host-agent" - } else if hostAgentTemp != nil && proxyTemp != nil { - tempSource = "host-agent+proxy" + } else if hostAgentTemp != nil && sshTemp != nil { + tempSource = "host-agent+ssh" } log.Debug(). @@ -2381,6 +2377,54 @@ func (m *Monitor) pollPVENode( } } + // Poll pending apt updates (less frequently - every 30 minutes) + // Only for online nodes to avoid wasting API calls on offline nodes + if effectiveStatus == "online" { + now := time.Now() + m.mu.RLock() + cached, hasCached := m.nodePendingUpdatesCache[nodeID] + m.mu.RUnlock() + + if !hasCached || now.Sub(cached.checkedAt) >= pendingUpdatesCacheTTL { + // Time to check for updates + pendingPkgs, err := client.GetNodePendingUpdates(ctx, node.Node) + if err != nil { + // API call failed - preserve cached value if available, don't spam logs + log.Debug(). + Err(err). + Str("node", node.Node). + Str("instance", instanceName). + Msg("Could not check pending apt updates (may require Sys.Audit permission)") + if hasCached { + modelNode.PendingUpdates = cached.count + modelNode.PendingUpdatesCheckedAt = cached.checkedAt + } + } else { + updateCount := len(pendingPkgs) + modelNode.PendingUpdates = updateCount + modelNode.PendingUpdatesCheckedAt = now + + // Cache the result + m.mu.Lock() + m.nodePendingUpdatesCache[nodeID] = pendingUpdatesCache{ + count: updateCount, + checkedAt: now, + } + m.mu.Unlock() + + log.Debug(). + Str("node", node.Node). + Str("instance", instanceName). + Int("pendingUpdates", updateCount). + Msg("Checked pending apt updates") + } + } else { + // Use cached value + modelNode.PendingUpdates = cached.count + modelNode.PendingUpdatesCheckedAt = cached.checkedAt + } + } + if m.pollMetrics != nil { nodeNameLabel := strings.TrimSpace(node.Node) if nodeNameLabel == "" { diff --git a/internal/monitoring/monitor_snapshots_test.go b/internal/monitoring/monitor_snapshots_test.go index a6f6134dd..186e76d91 100644 --- a/internal/monitoring/monitor_snapshots_test.go +++ b/internal/monitoring/monitor_snapshots_test.go @@ -96,6 +96,9 @@ func (f fakeSnapshotClient) GetCephStatus(ctx context.Context) (*proxmox.CephSta return nil, nil } func (f fakeSnapshotClient) GetCephDF(ctx context.Context) (*proxmox.CephDF, error) { return nil, nil } +func (f fakeSnapshotClient) GetNodePendingUpdates(ctx context.Context, node string) ([]proxmox.AptPackage, error) { + return nil, nil +} func TestCollectSnapshotSizes(t *testing.T) { m := &Monitor{} diff --git a/internal/monitoring/monitor_storage_test.go b/internal/monitoring/monitor_storage_test.go index 20cb68caf..4dbd3a638 100644 --- a/internal/monitoring/monitor_storage_test.go +++ b/internal/monitoring/monitor_storage_test.go @@ -132,6 +132,10 @@ func (f *fakeStorageClient) GetCephDF(ctx context.Context) (*proxmox.CephDF, err return nil, nil } +func (f *fakeStorageClient) GetNodePendingUpdates(ctx context.Context, node string) ([]proxmox.AptPackage, error) { + return nil, nil +} + func TestPollStorageWithNodesOptimizedRecordsMetricsAndAlerts(t *testing.T) { t.Setenv("PULSE_DATA_DIR", t.TempDir()) diff --git a/pkg/proxmox/client.go b/pkg/proxmox/client.go index 9b5576678..7cc330a42 100644 --- a/pkg/proxmox/client.go +++ b/pkg/proxmox/client.go @@ -2003,3 +2003,34 @@ func (c *Client) GetDisks(ctx context.Context, node string) ([]Disk, error) { return result.Data, nil } + +// AptPackage represents a pending package update from apt +type AptPackage struct { + Package string `json:"Package"` // Package name + Title string `json:"Title"` // Human-readable title + Description string `json:"Description"` // Package description + OldVersion string `json:"OldVersion"` // Currently installed version + NewVersion string `json:"Version"` // Available version + Priority string `json:"Priority"` // Update priority (e.g., "important", "optional") + Section string `json:"Section"` // Package section + Origin string `json:"Origin"` // Repository origin +} + +// GetNodePendingUpdates returns the list of pending apt updates for a node +// Requires Sys.Audit permission on /nodes/{node} +func (c *Client) GetNodePendingUpdates(ctx context.Context, node string) ([]AptPackage, error) { + resp, err := c.get(ctx, fmt.Sprintf("/nodes/%s/apt/update", node)) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + var result struct { + Data []AptPackage `json:"data"` + } + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return nil, err + } + + return result.Data, nil +} diff --git a/pkg/proxmox/cluster_client.go b/pkg/proxmox/cluster_client.go index 001ec39ae..6bbf360da 100644 --- a/pkg/proxmox/cluster_client.go +++ b/pkg/proxmox/cluster_client.go @@ -1360,6 +1360,32 @@ func (cc *ClusterClient) GetDisks(ctx context.Context, node string) ([]Disk, err return result, err } +// GetNodePendingUpdates returns pending apt updates for a node with failover support +func (cc *ClusterClient) GetNodePendingUpdates(ctx context.Context, node string) ([]AptPackage, error) { + var result []AptPackage + err := cc.executeWithFailover(ctx, func(client *Client) error { + pkgs, err := client.GetNodePendingUpdates(ctx, node) + if err != nil { + return err + } + result = pkgs + return nil + }) + + // Don't return error for transient connectivity issues or permission issues + if err != nil && (strings.Contains(err.Error(), "no healthy nodes available") || + strings.Contains(err.Error(), "403") || strings.Contains(err.Error(), "permission")) { + log.Debug(). + Str("cluster", cc.name). + Str("node", node). + Err(err). + Msg("Could not get pending updates - returning empty list") + return []AptPackage{}, nil + } + + return result, err +} + // GetClusterStatus returns the cluster status including all nodes with failover support. func (cc *ClusterClient) GetClusterStatus(ctx context.Context) ([]ClusterStatus, error) { var result []ClusterStatus