// Pulse/internal/monitoring/monitor_pve_guest_poll.go

package monitoring

import (
	"context"

	"github.com/rcourtman/pulse-go-rewrite/internal/models"
	"github.com/rcourtman/pulse-go-rewrite/pkg/proxmox"
	"github.com/rs/zerolog/log"
)

// pollVMsAndContainersEfficient uses the cluster/resources endpoint to get all VMs and containers in one call.
// This works on both clustered and standalone nodes for efficient polling.
// When the instance is part of a cluster, the cluster name is used for guest IDs to prevent duplicates
// when multiple cluster nodes are configured as separate PVE instances.
func (m *Monitor) pollVMsAndContainersEfficient(ctx context.Context, instanceName string, clusterName string, isCluster bool, client PVEClientInterface, nodeEffectiveStatus map[string]string) bool {
	log.Debug().
		Str("instance", instanceName).
		Str("clusterName", clusterName).
		Bool("isCluster", isCluster).
		Msg("Polling VMs and containers using efficient cluster/resources endpoint")

	// Get all resources in a single API call
	resources, err := client.GetClusterResources(ctx, "vm")
	if err != nil {
		log.Debug().Err(err).Str("instance", instanceName).Msg("cluster/resources not available, falling back to traditional polling")
		return false
	}

	// Capture previous guest state once per poll cycle so fallback and grace-period
	// behavior is based on a consistent pre-poll snapshot.
	prevGuests := m.previousGuestContextForInstance(instanceName)

	allVMs, allContainers := m.collectGuestsFromClusterResources(
		ctx,
		instanceName,
		resources,
		client,
		prevGuests.containerOCIByVMID,
		prevGuests.vmsByID,
		prevGuests.hostAgentsByVMID,
	)
	allVMs, allContainers = m.preserveGuestsForGracePeriod(instanceName, resources, prevGuests.vms, prevGuests.containers, nodeEffectiveStatus, allVMs, allContainers)

	// Always update state when using the efficient polling path.
	// Even if the slices are empty, we need to update to clear out VMs from genuinely offline nodes.
	m.state.UpdateVMsForInstance(instanceName, allVMs)

	// Check Docker presence for containers that need it (new, restarted, started)
	allContainers = m.CheckContainersForDocker(ctx, allContainers)
	m.state.UpdateContainersForInstance(instanceName, allContainers)

	m.recordGuestMetrics(allVMs, allContainers)
	m.pollReplicationStatus(ctx, instanceName, client, allVMs)

	log.Debug().
		Str("instance", instanceName).
		Int("vms", len(allVMs)).
		Int("containers", len(allContainers)).
		Msg("VMs and containers polled efficiently with cluster/resources")
	return true
}
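
// collectGuestsFromClusterResources converts cluster/resources entries into VM and
// container models. The previous poll's snapshot (OCI container flags, prior VM
// models, and host agent mappings) is passed through to the per-resource handlers.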
func (m *Monitor) collectGuestsFromClusterResources(
	ctx context.Context,
	instanceName string,
	resources []proxmox.ClusterResource,
	client PVEClientInterface,
	prevContainerIsOCI map[int]bool,
	prevVMByID map[string]models.VM,
	vmIDToHostAgent map[string]models.Host,
) ([]models.VM, []models.Container) {
	allVMs := make([]models.VM, 0, len(resources))
	allContainers := make([]models.Container, 0, len(resources))

	for _, res := range resources {
		// Generate canonical guest ID: instance:node:vmid
		guestID := makeGuestID(instanceName, res.Node, res.VMID)

		// Debug log the resource type
		log.Debug().
			Str("instance", instanceName).
			Str("name", res.Name).
			Int("vmid", res.VMID).
			Str("type", res.Type).
			Msg("Processing cluster resource")

		switch res.Type {
		case "qemu":
			var prevVM *models.VM
			if prev, ok := prevVMByID[guestID]; ok {
				prevVM = &prev
			}
			vm, ok := m.handleClusterVMResource(ctx, instanceName, res, guestID, client, prevVM, vmIDToHostAgent)
			if !ok {
				continue
			}
			allVMs = append(allVMs, vm)
		case "lxc":
			container, ok := m.handleClusterContainerResource(ctx, instanceName, res, guestID, client, prevContainerIsOCI)
			if !ok {
				continue
			}
			allContainers = append(allContainers, container)
		}
	}
	return allVMs, allContainers
}
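
// handleClusterVMResource builds a models.VM from a single "qemu" cluster resource,
// triggers legacy guest metadata migration, records a memory snapshot, and runs the
// guest alert check. The second return value is false when the resource is skipped.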
func (m *Monitor) handleClusterVMResource(
	ctx context.Context,
	instanceName string,
	res proxmox.ClusterResource,
	guestID string,
	client PVEClientInterface,
	prevVM *models.VM,
	vmIDToHostAgent map[string]models.Host,
) (models.VM, bool) {
	vm, guestRaw, memorySource, snapshotNotes, sampleTime, ok := m.buildVMFromClusterResource(ctx, instanceName, res, client, guestID, vmIDToHostAgent, prevVM)
	if !ok {
		return models.VM{}, false
	}

	// Trigger guest metadata migration if old format exists
	if m.guestMetadataStore != nil {
		m.guestMetadataStore.GetWithLegacyMigration(guestID, instanceName, res.Node, res.VMID)
	}

	m.recordGuestSnapshot(instanceName, vm.Type, res.Node, res.VMID, GuestMemorySnapshot{
		Name:           vm.Name,
		Status:         vm.Status,
		RetrievedAt:    sampleTime,
		MemorySource:   memorySource,
		FallbackReason: guestMemoryFallbackReason(memorySource),
		Memory:         vm.Memory,
		Raw:            guestRaw,
		Notes:          snapshotNotes,
	})

	m.checkGuestAlertsForVM(instanceName, vm)
	return vm, true
}
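
// handleClusterContainerResource builds a models.Container from a single "lxc" cluster
// resource, triggers legacy guest metadata migration, records a memory snapshot, and
// runs the guest alert check. The second return value is false when the resource is skipped.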
func (m *Monitor) handleClusterContainerResource(
	ctx context.Context,
	instanceName string,
	res proxmox.ClusterResource,
	guestID string,
	client PVEClientInterface,
	prevContainerIsOCI map[int]bool,
) (models.Container, bool) {
	container, guestRaw, memorySource, sampleTime, ok := m.buildContainerFromClusterResource(ctx, instanceName, res, client, prevContainerIsOCI)
	if !ok {
		return models.Container{}, false
	}

	// Trigger guest metadata migration if old format exists
	if m.guestMetadataStore != nil {
		m.guestMetadataStore.GetWithLegacyMigration(guestID, instanceName, res.Node, res.VMID)
	}

	m.recordGuestSnapshot(instanceName, container.Type, res.Node, res.VMID, GuestMemorySnapshot{
		Name:           container.Name,
		Status:         container.Status,
		RetrievedAt:    sampleTime,
		MemorySource:   memorySource,
		FallbackReason: guestMemoryFallbackReason(memorySource),
		Memory:         container.Memory,
		Raw:            guestRaw,
	})

	m.alertManager.CheckGuest(container, instanceName)
	return container, true
}
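
// checkGuestAlertsForVM evaluates guest alerts for a VM. Usage metrics are zeroed on a
// copy first when the VM is not running, so stale values reported by Proxmox for
// stopped guests cannot trigger false alerts.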
func (m *Monitor) checkGuestAlertsForVM(instanceName string, vm models.VM) {
	// For non-running VMs, zero out resource usage metrics to prevent false alerts.
	// Proxmox may report stale or residual metrics for stopped VMs.
	alertVM := vm
	if alertVM.Status != "running" {
		log.Debug().
			Str("vm", alertVM.Name).
			Str("status", alertVM.Status).
			Float64("originalCpu", alertVM.CPU).
			Float64("originalMemUsage", alertVM.Memory.Usage).
			Msg("Non-running VM detected - zeroing metrics")

		// Zero out all usage metrics for stopped/paused/suspended VMs
		alertVM.CPU = 0
		alertVM.Memory.Usage = 0
		alertVM.Disk.Usage = 0
		alertVM.NetworkIn = 0
		alertVM.NetworkOut = 0
		alertVM.DiskRead = 0
		alertVM.DiskWrite = 0
	}
	m.alertManager.CheckGuest(alertVM, instanceName)
}