Mirror of https://github.com/rcourtman/Pulse.git (synced 2026-04-28 03:20:11 +00:00)
Prevent removed host agents from resurrecting via in-flight reports (#1331)
Host agents removed from the UI would reappear on the next report cycle because there was no rejection mechanism — unlike Docker agents, which already had resurrection prevention. Mirror the Docker agent pattern:

- Track removed host IDs in a `removedHosts` map with 24hr TTL
- Persist removal records in `State.RemovedHosts` for frontend display
- Reject reports from removed hosts in `ApplyHostReport()`
- Add `AllowHostReenroll()` + API route to clear the block
- Show removed host agents in the Settings UI with "Allow re-enroll"
- Sync removed-agent maps from state on startup for all agent types
- Fix mock integration snapshot missing `RemovedDockerHosts` field
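In outline, the pattern this commit mirrors looks like the following minimal, self-contained Go sketch. This is not Pulse's actual code — all names here are hypothetical — but it shows the mechanism: deliberate removals land in a TTL'd tombstone map, reports from blocked IDs are rejected, "Allow re-enroll" deletes the tombstone, and a periodic sweep expires old entries.

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

// removalBlocklist is a hypothetical, simplified version of the pattern
// described above: removed-agent IDs mapped to removal timestamps,
// consulted before accepting a report and swept on a TTL.
type removalBlocklist struct {
	mu      sync.RWMutex
	removed map[string]time.Time
	ttl     time.Duration
}

func newRemovalBlocklist(ttl time.Duration) *removalBlocklist {
	return &removalBlocklist{removed: make(map[string]time.Time), ttl: ttl}
}

// Block records a deliberate removal.
func (b *removalBlocklist) Block(id string) {
	b.mu.Lock()
	defer b.mu.Unlock()
	b.removed[id] = time.Now()
}

// Allow clears the block so the agent may re-enroll.
func (b *removalBlocklist) Allow(id string) {
	b.mu.Lock()
	defer b.mu.Unlock()
	delete(b.removed, id)
}

// Rejected reports whether a report from id should be dropped.
func (b *removalBlocklist) Rejected(id string) (time.Time, bool) {
	b.mu.RLock()
	defer b.mu.RUnlock()
	at, ok := b.removed[id]
	return at, ok
}

// Sweep drops entries older than the TTL (24h in the commit).
func (b *removalBlocklist) Sweep(now time.Time) {
	b.mu.Lock()
	defer b.mu.Unlock()
	for id, at := range b.removed {
		if now.Sub(at) > b.ttl {
			delete(b.removed, id)
		}
	}
}

func main() {
	bl := newRemovalBlocklist(24 * time.Hour)
	bl.Block("host-1")
	if at, blocked := bl.Rejected("host-1"); blocked {
		fmt.Printf("rejecting report from host-1 (removed at %s)\n", at.Format(time.RFC3339))
	}
	bl.Allow("host-1") // "Allow re-enroll"
	if _, blocked := bl.Rejected("host-1"); !blocked {
		fmt.Println("host-1 may report again")
	}
}
```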
parent 9b531c547d · commit a4b0771974
15 changed files with 327 additions and 7 deletions
@@ -294,6 +294,7 @@ function App() {
     containers: [],
     dockerHosts: [],
     removedDockerHosts: [],
+    removedHosts: [],
     hosts: [],
     storage: [],
     cephClusters: [],
@@ -186,6 +186,36 @@ export class MonitoringAPI {
     }
   }
 
+  static async allowHostReenroll(hostId: string): Promise<void> {
+    const url = `${this.baseUrl}/agents/host/${encodeURIComponent(hostId)}/allow-reenroll`;
+
+    const response = await apiFetch(url, {
+      method: 'POST',
+    });
+
+    if (!response.ok) {
+      let message = `Failed with status ${response.status}`;
+      try {
+        const text = await response.text();
+        if (text?.trim()) {
+          message = text.trim();
+          try {
+            const parsed = JSON.parse(text);
+            if (typeof parsed?.error === 'string' && parsed.error.trim()) {
+              message = parsed.error.trim();
+            }
+          } catch (_err) {
+            // ignore parse error, use raw text
+          }
+        }
+      } catch (_err) {
+        // ignore read error
+      }
+
+      throw new Error(message);
+    }
+  }
+
   static async allowDockerHostReenroll(hostId: string): Promise<void> {
     const url = `${this.baseUrl}/agents/docker/hosts/${encodeURIComponent(hostId)}/allow-reenroll`;
 
@@ -606,6 +606,11 @@ export const UnifiedAgents: Component = () => {
     return removed.sort((a, b) => b.removedAt - a.removedAt);
   });
 
+  const removedHosts = createMemo(() => {
+    const removed = state.removedHosts || [];
+    return removed.sort((a, b) => b.removedAt - a.removedAt);
+  });
+
   const kubernetesClusters = createMemo(() => {
     const clusters = state.kubernetesClusters || [];
     return clusters.slice().sort((a, b) => (a.displayName || a.name || a.id).localeCompare(b.displayName || b.name || b.id));
@@ -704,6 +709,22 @@ export const UnifiedAgents: Component = () => {
       });
     });
 
+    removedHosts().forEach(host => {
+      const name = host.displayName || host.hostname || host.id;
+      rows.push({
+        rowKey: `removed-host-${host.id}`,
+        id: host.id,
+        name,
+        hostname: host.hostname,
+        displayName: host.displayName,
+        types: ['host'],
+        status: 'removed',
+        removedAt: host.removedAt,
+        scope: getScopeInfo(undefined),
+        searchText: [name, host.hostname, host.id].filter(Boolean).join(' ').toLowerCase(),
+      });
+    });
+
     removedKubernetesClusters().forEach(cluster => {
       const name = cluster.displayName || cluster.name || cluster.id;
       rows.push({
@@ -867,6 +888,16 @@ export const UnifiedAgents: Component = () => {
     }
   };
 
+  const handleAllowHostReenroll = async (hostId: string, hostname?: string) => {
+    try {
+      await MonitoringAPI.allowHostReenroll(hostId);
+      notificationStore.success(`Re-enrollment allowed for ${hostname || hostId}. Restart the agent to reconnect.`);
+    } catch (err) {
+      logger.error('Failed to allow host re-enrollment', err);
+      notificationStore.error('Failed to allow re-enrollment');
+    }
+  };
+
   const handleRemoveKubernetesCluster = async (clusterId: string) => {
     if (!confirm('Are you sure you want to remove this Kubernetes cluster? This will stop monitoring but will not uninstall the agent from the cluster.')) return;
 
@@ -1718,12 +1749,21 @@ export const UnifiedAgents: Component = () => {
               </Show>
             }>
               <Show when={row.types.includes('docker')} fallback={
-                <button
-                  onClick={() => handleAllowKubernetesReenroll(row.id, row.name)}
-                  class="text-blue-600 hover:text-blue-900 dark:text-blue-400 dark:hover:text-blue-300"
-                >
-                  Allow re-enroll
-                </button>
+                <Show when={row.types.includes('host')} fallback={
+                  <button
+                    onClick={() => handleAllowKubernetesReenroll(row.id, row.name)}
+                    class="text-blue-600 hover:text-blue-900 dark:text-blue-400 dark:hover:text-blue-300"
+                  >
+                    Allow re-enroll
+                  </button>
+                }>
+                  <button
+                    onClick={() => handleAllowHostReenroll(row.id, row.hostname)}
+                    class="text-blue-600 hover:text-blue-900 dark:text-blue-400 dark:hover:text-blue-300"
+                  >
+                    Allow re-enroll
+                  </button>
+                </Show>
               }>
                 <button
                   onClick={() => handleAllowReenroll(row.id, row.hostname)}
@@ -3,7 +3,7 @@ import { render, fireEvent, screen, waitFor, cleanup, within } from '@solidjs/te
 import { createStore } from 'solid-js/store';
 import { Router, Route } from '@solidjs/router';
 import { UnifiedAgents } from '../UnifiedAgents';
-import type { Host, DockerHost, KubernetesCluster, RemovedDockerHost, RemovedKubernetesCluster } from '@/types/api';
+import type { Host, DockerHost, KubernetesCluster, RemovedDockerHost, RemovedHost, RemovedKubernetesCluster } from '@/types/api';
 
 let mockWsStore: {
   state: {
@@ -11,6 +11,7 @@ let mockWsStore: {
     dockerHosts: DockerHost[];
     kubernetesClusters?: KubernetesCluster[];
     removedDockerHosts?: RemovedDockerHost[];
+    removedHosts?: RemovedHost[];
     removedKubernetesClusters?: RemovedKubernetesCluster[];
   };
   connected: () => boolean;
@@ -140,12 +141,14 @@ const setupComponent = (
   kubernetesClusters: KubernetesCluster[] = [],
   removedDockerHosts: RemovedDockerHost[] = [],
   removedKubernetesClusters: RemovedKubernetesCluster[] = [],
+  removedHosts: RemovedHost[] = [],
 ) => {
   const [state] = createStore({
     hosts,
     dockerHosts,
     kubernetesClusters,
     removedDockerHosts,
+    removedHosts,
     removedKubernetesClusters,
   });
 
@@ -12,6 +12,7 @@ import type {
   Host,
   KubernetesCluster,
   RemovedDockerHost,
+  RemovedHost,
   RemovedKubernetesCluster,
 } from '@/types/api';
 import type { ActivationState as ActivationStateType } from '@/types/alerts';
@@ -37,6 +38,7 @@ export function createWebSocketStore(url: string) {
     removedDockerHosts: [],
     kubernetesClusters: [],
     removedKubernetesClusters: [],
+    removedHosts: [],
     hosts: [],
     replicationJobs: [],
     storage: [],
@@ -667,6 +669,12 @@ export function createWebSocketStore(url: string) {
           : [];
         setState('removedKubernetesClusters', reconcile(removed, { key: 'id' }));
       }
+      if (message.data.removedHosts !== undefined) {
+        const removed = Array.isArray(message.data.removedHosts)
+          ? (message.data.removedHosts as RemovedHost[])
+          : [];
+        setState('removedHosts', reconcile(removed, { key: 'id' }));
+      }
       if (message.data.storage !== undefined) setState('storage', reconcile(message.data.storage, { key: 'id' }));
       if (message.data.cephClusters !== undefined)
         setState('cephClusters', reconcile(message.data.cephClusters, { key: 'id' }));
@@ -10,6 +10,7 @@ export interface State {
   removedDockerHosts?: RemovedDockerHost[];
   kubernetesClusters?: KubernetesCluster[];
   removedKubernetesClusters?: RemovedKubernetesCluster[];
+  removedHosts?: RemovedHost[];
   hosts: Host[];
   replicationJobs: ReplicationJob[];
   storage: Storage[];
@@ -64,6 +65,13 @@ export interface KubernetesCluster {
   deployments?: KubernetesDeployment[];
 }
 
+export interface RemovedHost {
+  id: string;
+  hostname?: string;
+  displayName?: string;
+  removedAt: number;
+}
+
 export interface RemovedKubernetesCluster {
   id: string;
   name?: string;
@@ -230,6 +230,34 @@ func (h *HostAgentHandlers) HandleLookup(w http.ResponseWriter, r *http.Request)
     }
 }
 
+// HandleAllowReenroll clears the removal block for a host agent to permit future reports.
+func (h *HostAgentHandlers) HandleAllowReenroll(w http.ResponseWriter, r *http.Request) {
+    // Extract host ID from URL path
+    // Expected format: /api/agents/host/{hostId}/allow-reenroll
+    trimmedPath := strings.TrimPrefix(r.URL.Path, "/api/agents/host/")
+    trimmedPath = strings.TrimSuffix(trimmedPath, "/allow-reenroll")
+    hostID := strings.TrimSpace(trimmedPath)
+    if hostID == "" {
+        writeErrorResponse(w, http.StatusBadRequest, "missing_host_id", "Host ID is required", nil)
+        return
+    }
+
+    if err := h.getMonitor(r.Context()).AllowHostReenroll(hostID); err != nil {
+        writeErrorResponse(w, http.StatusBadRequest, "host_reenroll_failed", err.Error(), nil)
+        return
+    }
+
+    go h.wsHub.BroadcastState(h.getMonitor(r.Context()).GetState().ToFrontend())
+
+    if err := utils.WriteJSONResponse(w, map[string]any{
+        "success": true,
+        "hostId":  hostID,
+        "message": "Host agent removal block cleared",
+    }); err != nil {
+        log.Error().Err(err).Msg("Failed to serialize host allow reenroll response")
+    }
+}
+
 // HandleDeleteHost removes a host from the shared state.
 func (h *HostAgentHandlers) HandleDeleteHost(w http.ResponseWriter, r *http.Request) {
     if r.Method != http.MethodDelete {
@@ -331,6 +331,16 @@ func (r *Router) setupRoutes() {
         r.hostAgentHandlers.HandleConfig(w, req)
         return
     }
+    // Route POST /api/agents/host/{id}/allow-reenroll to HandleAllowReenroll
+    if strings.HasSuffix(req.URL.Path, "/allow-reenroll") && req.Method == http.MethodPost {
+        RequireAdmin(r.config, func(w http.ResponseWriter, req *http.Request) {
+            if !ensureScope(w, req, config.ScopeSettingsWrite) {
+                return
+            }
+            r.hostAgentHandlers.HandleAllowReenroll(w, req)
+        })(w, req)
+        return
+    }
     // Route DELETE /api/agents/host/{id} to HandleDeleteHost
     // SECURITY: Require settings:write (not just host_manage) to prevent compromised host tokens from deleting other hosts
     if req.Method == http.MethodDelete {
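For reference, a hedged sketch of invoking the new route from a Go client. The base URL, port, and auth header here are assumptions for illustration, not values confirmed by the diff; the route requires an admin caller with the settings:write scope, as the guard above shows.

```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"net/url"
)

func main() {
	// Hypothetical values: adjust host, port, and auth to your deployment.
	base := "http://localhost:7655"
	hostID := "host-1"

	endpoint := fmt.Sprintf("%s/api/agents/host/%s/allow-reenroll", base, url.PathEscape(hostID))
	req, err := http.NewRequest(http.MethodPost, endpoint, nil)
	if err != nil {
		panic(err)
	}
	// The route requires admin + settings:write; a token header is one
	// plausible way to satisfy that (an assumption, not shown in the diff).
	req.Header.Set("X-API-Token", "your-admin-token")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	body, _ := io.ReadAll(resp.Body)
	fmt.Println(resp.Status, string(body)) // expect {"success":true,...} on 200
}
```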
@@ -378,6 +378,8 @@ func cloneState(state models.StateSnapshot) models.StateSnapshot {
         DockerHosts:               append([]models.DockerHost(nil), state.DockerHosts...),
         KubernetesClusters:        kubernetesClusters,
         RemovedKubernetesClusters: append([]models.RemovedKubernetesCluster(nil), state.RemovedKubernetesClusters...),
+        RemovedDockerHosts:        append([]models.RemovedDockerHost(nil), state.RemovedDockerHosts...),
+        RemovedHosts:              append([]models.RemovedHost(nil), state.RemovedHosts...),
         Hosts:                     append([]models.Host(nil), state.Hosts...),
         PMGInstances:              append([]models.PMGInstance(nil), state.PMGInstances...),
         Storage:                   append([]models.Storage(nil), state.Storage...),
@@ -460,6 +460,16 @@ func (r RemovedKubernetesCluster) ToFrontend() RemovedKubernetesClusterFrontend
     }
 }
 
+// ToFrontend converts a RemovedHost to its frontend representation.
+func (r RemovedHost) ToFrontend() RemovedHostFrontend {
+    return RemovedHostFrontend{
+        ID:          r.ID,
+        Hostname:    r.Hostname,
+        DisplayName: r.DisplayName,
+        RemovedAt:   r.RemovedAt.Unix() * 1000,
+    }
+}
+
 // ToFrontend converts a Host to HostFrontend.
 func (h Host) ToFrontend() HostFrontend {
     host := HostFrontend{
@@ -19,6 +19,7 @@ type State struct {
     RemovedDockerHosts        []RemovedDockerHost        `json:"removedDockerHosts"`
     KubernetesClusters        []KubernetesCluster        `json:"kubernetesClusters"`
     RemovedKubernetesClusters []RemovedKubernetesCluster `json:"removedKubernetesClusters"`
+    RemovedHosts              []RemovedHost              `json:"removedHosts"`
     Hosts                     []Host                     `json:"hosts"`
     Storage                   []Storage                  `json:"storage"`
     CephClusters              []CephCluster              `json:"cephClusters"`
@@ -439,6 +440,14 @@ type RemovedDockerHost struct {
     RemovedAt time.Time `json:"removedAt"`
 }
 
+// RemovedHost tracks a host agent that was deliberately removed and blocked from reporting.
+type RemovedHost struct {
+    ID          string    `json:"id"`
+    Hostname    string    `json:"hostname,omitempty"`
+    DisplayName string    `json:"displayName,omitempty"`
+    RemovedAt   time.Time `json:"removedAt"`
+}
+
 // DockerContainer represents the state of a Docker container on a monitored host.
 type DockerContainer struct {
     ID string `json:"id"`
@@ -2088,6 +2097,52 @@ func (s *State) GetRemovedDockerHosts() []RemovedDockerHost {
     return entries
 }
 
+// AddRemovedHost records a removed host agent entry.
+func (s *State) AddRemovedHost(entry RemovedHost) {
+    s.mu.Lock()
+    defer s.mu.Unlock()
+
+    replaced := false
+    for i, existing := range s.RemovedHosts {
+        if existing.ID == entry.ID {
+            s.RemovedHosts[i] = entry
+            replaced = true
+            break
+        }
+    }
+    if !replaced {
+        s.RemovedHosts = append(s.RemovedHosts, entry)
+    }
+    sort.Slice(s.RemovedHosts, func(i, j int) bool {
+        return s.RemovedHosts[i].RemovedAt.After(s.RemovedHosts[j].RemovedAt)
+    })
+    s.LastUpdate = time.Now()
+}
+
+// RemoveRemovedHost deletes a removed host agent entry by ID.
+func (s *State) RemoveRemovedHost(hostID string) {
+    s.mu.Lock()
+    defer s.mu.Unlock()
+
+    for i, entry := range s.RemovedHosts {
+        if entry.ID == hostID {
+            s.RemovedHosts = append(s.RemovedHosts[:i], s.RemovedHosts[i+1:]...)
+            s.LastUpdate = time.Now()
+            break
+        }
+    }
+}
+
+// GetRemovedHosts returns a copy of removed host agent entries.
+func (s *State) GetRemovedHosts() []RemovedHost {
+    s.mu.RLock()
+    defer s.mu.RUnlock()
+
+    entries := make([]RemovedHost, len(s.RemovedHosts))
+    copy(entries, s.RemovedHosts)
+    return entries
+}
+
 // UpsertKubernetesCluster inserts or updates a Kubernetes cluster in state.
 func (s *State) UpsertKubernetesCluster(cluster KubernetesCluster) {
     s.mu.Lock()
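A small standalone sketch (using a hypothetical stand-in type, not the models package) of the semantics AddRemovedHost guarantees above: re-removing the same ID updates the existing entry instead of duplicating it, and entries stay sorted newest-first.

```go
package main

import (
	"fmt"
	"sort"
	"time"
)

// removedHost is a trimmed-down stand-in for models.RemovedHost, used only
// to illustrate the replace-then-sort behavior of AddRemovedHost.
type removedHost struct {
	ID        string
	RemovedAt time.Time
}

func addRemovedHost(entries []removedHost, entry removedHost) []removedHost {
	replaced := false
	for i, existing := range entries {
		if existing.ID == entry.ID {
			entries[i] = entry // same ID: update in place, no duplicate
			replaced = true
			break
		}
	}
	if !replaced {
		entries = append(entries, entry)
	}
	// Newest removal first, matching the sort in the diff.
	sort.Slice(entries, func(i, j int) bool {
		return entries[i].RemovedAt.After(entries[j].RemovedAt)
	})
	return entries
}

func main() {
	now := time.Now()
	var entries []removedHost
	entries = addRemovedHost(entries, removedHost{"a", now.Add(-time.Hour)})
	entries = addRemovedHost(entries, removedHost{"b", now})
	entries = addRemovedHost(entries, removedHost{"a", now.Add(time.Minute)}) // re-removal updates "a"
	for _, e := range entries {
		fmt.Println(e.ID, e.RemovedAt.Format(time.RFC3339))
	}
	// Still two entries, "a" first (newest), no duplicates.
}
```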
@@ -235,6 +235,14 @@ type KubernetesDeploymentFrontend struct {
     Labels map[string]string `json:"labels,omitempty"`
 }
 
+// RemovedHostFrontend represents a blocked host agent entry for the frontend.
+type RemovedHostFrontend struct {
+    ID          string `json:"id"`
+    Hostname    string `json:"hostname,omitempty"`
+    DisplayName string `json:"displayName,omitempty"`
+    RemovedAt   int64  `json:"removedAt"`
+}
+
 // RemovedKubernetesClusterFrontend represents a blocked kubernetes cluster entry for the frontend.
 type RemovedKubernetesClusterFrontend struct {
     ID string `json:"id"`
@@ -559,6 +567,7 @@ type StateFrontend struct {
     RemovedDockerHosts        []RemovedDockerHostFrontend        `json:"removedDockerHosts"`
     KubernetesClusters        []KubernetesClusterFrontend        `json:"kubernetesClusters"`
     RemovedKubernetesClusters []RemovedKubernetesClusterFrontend `json:"removedKubernetesClusters"`
+    RemovedHosts              []RemovedHostFrontend              `json:"removedHosts"`
     Hosts                     []HostFrontend                     `json:"hosts"`
     Storage                   []StorageFrontend                  `json:"storage"`
     CephClusters              []CephClusterFrontend              `json:"cephClusters"`
@@ -11,6 +11,7 @@ type StateSnapshot struct {
     RemovedDockerHosts        []RemovedDockerHost        `json:"removedDockerHosts"`
     KubernetesClusters        []KubernetesCluster        `json:"kubernetesClusters"`
     RemovedKubernetesClusters []RemovedKubernetesCluster `json:"removedKubernetesClusters"`
+    RemovedHosts              []RemovedHost              `json:"removedHosts"`
     Hosts                     []Host                     `json:"hosts"`
     Storage                   []Storage                  `json:"storage"`
     CephClusters              []CephCluster              `json:"cephClusters"`
@@ -54,6 +55,7 @@ func (s *State) GetSnapshot() StateSnapshot {
     RemovedDockerHosts:        append([]RemovedDockerHost{}, s.RemovedDockerHosts...),
     KubernetesClusters:        append([]KubernetesCluster{}, s.KubernetesClusters...),
     RemovedKubernetesClusters: append([]RemovedKubernetesCluster{}, s.RemovedKubernetesClusters...),
+    RemovedHosts:              append([]RemovedHost{}, s.RemovedHosts...),
     Hosts:                     append([]Host{}, s.Hosts...),
     Storage:                   append([]Storage{}, s.Storage...),
     CephClusters:              append([]CephCluster{}, s.CephClusters...),
@@ -343,6 +345,11 @@ func (s StateSnapshot) ToFrontend() StateFrontend {
         removedKubernetesClusters[i] = entry.ToFrontend()
     }
 
+    removedHosts := make([]RemovedHostFrontend, len(s.RemovedHosts))
+    for i, entry := range s.RemovedHosts {
+        removedHosts[i] = entry.ToFrontend()
+    }
+
     hosts := make([]HostFrontend, len(s.Hosts))
     for i, host := range s.Hosts {
         hosts[i] = host.ToFrontend()
@@ -392,6 +399,7 @@ func (s StateSnapshot) ToFrontend() StateFrontend {
     RemovedDockerHosts:        removedDockerHosts,
     KubernetesClusters:        kubernetesClusters,
     RemovedKubernetesClusters: removedKubernetesClusters,
+    RemovedHosts:              removedHosts,
     Hosts:                     hosts,
     Storage:                   storage,
     CephClusters:              cephClusters,
@@ -875,6 +875,7 @@ type Monitor struct {
     vmAgentMemCache           map[string]agentMemCacheEntry // Guest agent /proc/meminfo cache
     removedDockerHosts        map[string]time.Time          // Track deliberately removed Docker hosts (ID -> removal time)
     dockerTokenBindings       map[string]string             // Track token ID -> agent ID bindings to enforce uniqueness
+    removedHosts              map[string]time.Time          // Track deliberately removed host agents (ID -> removal time)
     removedKubernetesClusters map[string]time.Time          // Track deliberately removed Kubernetes clusters (ID -> removal time)
     kubernetesTokenBindings   map[string]string             // Track token ID -> agent ID bindings to enforce uniqueness
     hostTokenBindings         map[string]string             // Track tokenID:hostname -> host identity bindings
@@ -1902,6 +1903,20 @@ func (m *Monitor) RemoveHostAgent(hostID string) (models.Host, error) {
         Msg("Unbound host agent token bindings after host removal")
     }
 
+    // Track removal to prevent resurrection from cached/in-flight reports
+    removedAt := time.Now()
+
+    m.mu.Lock()
+    m.removedHosts[hostID] = removedAt
+    m.mu.Unlock()
+
+    m.state.AddRemovedHost(models.RemovedHost{
+        ID:          hostID,
+        Hostname:    host.Hostname,
+        DisplayName: host.DisplayName,
+        RemovedAt:   removedAt,
+    })
+
     m.state.RemoveConnectionHealth(hostConnectionPrefix + hostID)
 
     // Clear LinkedHostAgentID from any nodes that were linked to this host agent
@@ -1928,6 +1943,33 @@
     return host, nil
 }
 
+// AllowHostReenroll removes a host ID from the removal blocklist so it can report again.
+func (m *Monitor) AllowHostReenroll(hostID string) error {
+    hostID = strings.TrimSpace(hostID)
+    if hostID == "" {
+        return fmt.Errorf("host id is required")
+    }
+
+    m.mu.Lock()
+    defer m.mu.Unlock()
+
+    if _, exists := m.removedHosts[hostID]; !exists {
+        log.Info().
+            Str("hostID", hostID).
+            Msg("Allow re-enroll requested but host was not blocked; ignoring")
+        return nil
+    }
+
+    delete(m.removedHosts, hostID)
+    m.state.RemoveRemovedHost(hostID)
+
+    log.Info().
+        Str("hostID", hostID).
+        Msg("Host agent removal block cleared; host may report again")
+
+    return nil
+}
+
 // LinkHostAgent manually links a host agent to a specific PVE node.
 // This is used when auto-linking can't disambiguate (e.g., multiple nodes with hostname "pve").
 // After linking, the host agent's temperature/sensor data will appear on the correct node.
@@ -2971,6 +3013,22 @@ func (m *Monitor) ApplyHostReport(report agentshost.Report, tokenRecord *config.
         }
     }
 
+    // Check if this host was deliberately removed - reject report to prevent resurrection.
+    // Unlike Docker agents, host agents don't have legacy IDs, so we only check the
+    // resolved identifier. The identifier is deterministic given the same machine/agent/hostname
+    // inputs, so a running agent will always resolve to the same blocked ID.
+    m.mu.RLock()
+    removedAt, wasRemoved := m.removedHosts[identifier]
+    m.mu.RUnlock()
+
+    if wasRemoved {
+        log.Info().
+            Str("hostID", identifier).
+            Time("removedAt", removedAt).
+            Msg("Rejecting report from deliberately removed host agent")
+        return models.Host{}, fmt.Errorf("host %q was removed at %v and cannot report again. Use Allow re-enroll in Settings -> Agents to clear this block", identifier, removedAt.Format(time.RFC3339))
+    }
+
     var previous models.Host
     var hasPrevious bool
     for _, candidate := range existingHosts {
@@ -3394,6 +3452,7 @@ func (m *Monitor) linkNodeToHostAgent(nodeID, hostAgentID string) {
 
 const (
     removedDockerHostsTTL = 24 * time.Hour // Clean up removed hosts tracking after 24 hours
+    removedHostsTTL       = 24 * time.Hour // Clean up removed host agents tracking after 24 hours
 )
 
 // recoverFromPanic recovers from panics in monitoring goroutines and logs them.
@@ -3437,6 +3496,33 @@ func (m *Monitor) cleanupRemovedDockerHosts(now time.Time) {
     }
 }
 
+// cleanupRemovedHosts removes entries from the removed host agents map that are older than 24 hours.
+func (m *Monitor) cleanupRemovedHosts(now time.Time) {
+    var toRemove []string
+
+    m.mu.Lock()
+    for hostID, removedAt := range m.removedHosts {
+        if now.Sub(removedAt) > removedHostsTTL {
+            toRemove = append(toRemove, hostID)
+        }
+    }
+    m.mu.Unlock()
+
+    for _, hostID := range toRemove {
+        m.state.RemoveRemovedHost(hostID)
+
+        m.mu.Lock()
+        removedAt := m.removedHosts[hostID]
+        delete(m.removedHosts, hostID)
+        m.mu.Unlock()
+
+        log.Debug().
+            Str("hostID", hostID).
+            Time("removedAt", removedAt).
+            Msg("Cleaned up old removed host agent entry")
+    }
+}
+
 // cleanupGuestMetadataCache removes stale guest metadata entries.
 // Entries older than 2x the cache TTL (10 minutes) are removed to prevent unbounded growth
 // when VMs are deleted or moved.
@@ -4266,6 +4352,7 @@ func New(cfg *config.Config) (*Monitor, error) {
     vmAgentMemCache:           make(map[string]agentMemCacheEntry),
     removedDockerHosts:        make(map[string]time.Time),
     dockerTokenBindings:       make(map[string]string),
+    removedHosts:              make(map[string]time.Time),
     removedKubernetesClusters: make(map[string]time.Time),
     kubernetesTokenBindings:   make(map[string]string),
     hostTokenBindings:         make(map[string]string),
@@ -4311,6 +4398,19 @@ func New(cfg *config.Config) (*Monitor, error) {
     m.restorePersistedHostAgents()
     m.RebuildTokenBindings()
 
+    // Sync removed-agent maps from state (currently a no-op on cold start since
+    // state begins empty, but ensures consistency if state is ever restored from
+    // a snapshot or populated before Start is called).
+    for _, entry := range m.state.GetRemovedDockerHosts() {
+        m.removedDockerHosts[entry.ID] = entry.RemovedAt
+    }
+    for _, entry := range m.state.GetRemovedHosts() {
+        m.removedHosts[entry.ID] = entry.RemovedAt
+    }
+    for _, entry := range m.state.GetRemovedKubernetesClusters() {
+        m.removedKubernetesClusters[entry.ID] = entry.RemovedAt
+    }
+
     if m.pollMetrics != nil {
         m.pollMetrics.ResetQueueDepth(0)
     }
@@ -4976,6 +5076,7 @@ func (m *Monitor) Start(ctx context.Context, wsHub *websocket.Hub) {
     m.evaluateKubernetesAgents(now)
     m.evaluateHostAgents(now)
     m.cleanupRemovedDockerHosts(now)
+    m.cleanupRemovedHosts(now)
     m.cleanupRemovedKubernetesClusters(now)
     m.cleanupGuestMetadataCache(now)
     m.cleanupDiagnosticSnapshots(now)
@@ -136,6 +136,7 @@ func TestApplyHostReportPersistsAndRemoveHostAgentClearsRuntime(t *testing.T) {
     state:             models.NewState(),
     alertManager:      alerts.NewManager(),
     hostTokenBindings: make(map[string]string),
+    removedHosts:      make(map[string]time.Time),
     config:            &config.Config{},
     rateTracker:       NewRateTracker(),
     hostRuntimeStore:  runtimeStore,
@@ -326,6 +327,7 @@ func TestApplyHostReportDisambiguatesCollidingIdentifiersAcrossTokens(t *testing
     state:             models.NewState(),
     alertManager:      alerts.NewManager(),
     hostTokenBindings: make(map[string]string),
+    removedHosts:      make(map[string]time.Time),
     config:            &config.Config{},
     rateTracker:       NewRateTracker(),
 }
@@ -409,6 +411,7 @@ func TestRemoveHostAgentUnbindsToken(t *testing.T) {
     state:             models.NewState(),
     alertManager:      alerts.NewManager(),
     hostTokenBindings: make(map[string]string),
+    removedHosts:      make(map[string]time.Time),
     config:            &config.Config{},
 }
 t.Cleanup(func() { monitor.alertManager.Stop() })
@@ -757,6 +760,7 @@ func TestRemoveHostAgent_EmptyHostID(t *testing.T) {
 monitor := &Monitor{
     state:        models.NewState(),
     alertManager: alerts.NewManager(),
+    removedHosts: make(map[string]time.Time),
     config:       &config.Config{},
 }
 t.Cleanup(func() { monitor.alertManager.Stop() })
@@ -782,6 +786,7 @@ func TestRemoveHostAgent_NotFound(t *testing.T) {
     state:             models.NewState(),
     alertManager:      alerts.NewManager(),
     hostTokenBindings: make(map[string]string),
+    removedHosts:      make(map[string]time.Time),
     config:            &config.Config{},
 }
 t.Cleanup(func() { monitor.alertManager.Stop() })
@@ -806,6 +811,7 @@ func TestRemoveHostAgent_NoTokenBinding(t *testing.T) {
     state:             models.NewState(),
     alertManager:      alerts.NewManager(),
     hostTokenBindings: make(map[string]string),
+    removedHosts:      make(map[string]time.Time),
     config:            &config.Config{},
 }
 t.Cleanup(func() { monitor.alertManager.Stop() })
@@ -834,6 +840,7 @@ func TestRemoveHostAgent_NilAlertManager(t *testing.T) {
     state:             models.NewState(),
     alertManager:      nil, // No alert manager
     hostTokenBindings: make(map[string]string),
+    removedHosts:      make(map[string]time.Time),
     config:            &config.Config{},
 }
 