Pulse/internal/monitoring/multi_tenant_monitor.go
2026-03-18 16:06:30 +00:00

306 lines
8.5 KiB
Go

package monitoring
import (
"context"
"fmt"
"strings"
"sync"
"time"
"github.com/rcourtman/pulse-go-rewrite/internal/config"
recoverymanager "github.com/rcourtman/pulse-go-rewrite/internal/recovery/manager"
"github.com/rcourtman/pulse-go-rewrite/internal/websocket"
"github.com/rs/zerolog/log"
)
// MultiTenantMonitor manages a dedicated Monitor instance for each organization.
//
// Monitors are created lazily by GetMonitor and keyed by organization ID. Each
// tenant monitor runs in its own goroutine under a context derived from
// globalCtx, so a single tenant can be stopped via RemoveTenant and all
// tenants can be stopped via Stop.
type MultiTenantMonitor struct {
// mu guards the three per-tenant maps as well as recoveryMgr and initializer.
mu sync.RWMutex
// monitors holds the tenant monitor for each organization ID.
monitors map[string]*Monitor
// tenantCancel holds the cancel function of each tenant's run context.
tenantCancel map[string]context.CancelFunc
// tenantDone holds, per tenant, a channel that is closed once the goroutine
// running Monitor.Start has returned.
tenantDone map[string]chan struct{}
// persistence resolves per-organization persistence and answers OrgExists.
persistence *config.MultiTenantPersistence
// baseConfig is deep-copied to produce each tenant's configuration.
baseConfig *config.Config
// wsHub is passed to every tenant monitor's Start call.
wsHub *websocket.Hub
// recoveryMgr, when non-nil, is handed to every tenant monitor.
recoveryMgr *recoverymanager.Manager
// initializer, when non-nil, is invoked on every tenant monitor.
initializer func(*Monitor)
// globalCtx is the parent of every tenant context; globalCancel cancels it.
globalCtx context.Context
globalCancel context.CancelFunc
}
// NewMultiTenantMonitor builds an empty multi-tenant manager.
//
// baseCfg is kept as the template that tenant configurations are copied from,
// persistence resolves per-organization storage, and wsHub is handed to each
// tenant monitor when it is started. No tenant monitor is created here; the
// manager only allocates its bookkeeping maps and a cancellable root context.
func NewMultiTenantMonitor(baseCfg *config.Config, persistence *config.MultiTenantPersistence, wsHub *websocket.Hub) *MultiTenantMonitor {
	mtm := &MultiTenantMonitor{
		persistence:  persistence,
		baseConfig:   baseCfg, // Used as a template or for global settings
		wsHub:        wsHub,
		monitors:     map[string]*Monitor{},
		tenantCancel: map[string]context.CancelFunc{},
		tenantDone:   map[string]chan struct{}{},
	}
	// Every tenant context is derived from this root so Stop can cancel them all.
	mtm.globalCtx, mtm.globalCancel = context.WithCancel(context.Background())
	return mtm
}
// tenantMonitorShutdownTimeout bounds how long waitForTenantMonitorShutdown
// waits for a tenant's monitor goroutine to signal completion before it logs
// a warning and stops waiting.
const tenantMonitorShutdownTimeout = 2 * time.Second
// SetRecoveryManager records the recovery manager and pushes it to every
// tenant monitor that already exists. Monitors created later by GetMonitor
// receive the same manager. The whole update runs under the write lock.
func (mtm *MultiTenantMonitor) SetRecoveryManager(manager *recoverymanager.Manager) {
	mtm.mu.Lock()
	defer mtm.mu.Unlock()

	mtm.recoveryMgr = manager
	for _, tenantMonitor := range mtm.monitors {
		// Nil map entries are tolerated and simply skipped.
		if tenantMonitor != nil {
			tenantMonitor.SetRecoveryManager(manager)
		}
	}
}
// SetMonitorInitializer stores a callback that GetMonitor applies to each
// newly created tenant monitor, and immediately applies it to the monitors
// that already exist.
//
// A nil receiver is a no-op. A nil initializer clears the stored callback and
// touches no existing monitor.
func (mtm *MultiTenantMonitor) SetMonitorInitializer(initializer func(*Monitor)) {
	if mtm == nil {
		return
	}

	// Store the callback and snapshot the live monitors under the lock; the
	// callback itself is invoked only after the lock has been released.
	existing := func() []*Monitor {
		mtm.mu.Lock()
		defer mtm.mu.Unlock()

		mtm.initializer = initializer
		snapshot := make([]*Monitor, 0, len(mtm.monitors))
		for _, tenantMonitor := range mtm.monitors {
			if tenantMonitor != nil {
				snapshot = append(snapshot, tenantMonitor)
			}
		}
		return snapshot
	}()

	if initializer != nil {
		for i := range existing {
			initializer(existing[i])
		}
	}
}
// GetMonitor returns the monitor instance for a specific organization,
// creating and starting it on first use.
//
// The organization ID is trimmed of surrounding whitespace before use. An
// error is returned when the ID is empty, when tenant persistence or the base
// config is not configured, when a non-"default" organization is not
// provisioned, when the tenant's persistence cannot be resolved, or when the
// monitor itself cannot be constructed. On error the returned monitor is nil.
//
// A newly created monitor is started in its own goroutine under a context
// derived from the manager's root context, so it stops when either
// RemoveTenant or Stop cancels it. Note that once Stop has cancelled the root
// context, monitors created afterwards start with an already-cancelled context.
func (mtm *MultiTenantMonitor) GetMonitor(orgID string) (*Monitor, error) {
	orgID = strings.TrimSpace(orgID)
	if orgID == "" {
		return nil, fmt.Errorf("organization ID is required")
	}

	// Fast path: an existing monitor only needs the read lock.
	mtm.mu.RLock()
	monitor, exists := mtm.monitors[orgID]
	mtm.mu.RUnlock()
	if exists {
		return monitor, nil
	}

	mtm.mu.Lock()
	defer mtm.mu.Unlock()

	// Lazily create the bookkeeping so a MultiTenantMonitor that was not built
	// through NewMultiTenantMonitor is still usable.
	if mtm.monitors == nil {
		mtm.monitors = make(map[string]*Monitor)
	}
	if mtm.tenantCancel == nil {
		mtm.tenantCancel = make(map[string]context.CancelFunc)
	}
	if mtm.tenantDone == nil {
		mtm.tenantDone = make(map[string]chan struct{})
	}
	// context.WithCancel panics on a nil parent, so the root context gets the
	// same lazy treatment as the maps above.
	if mtm.globalCtx == nil {
		mtm.globalCtx, mtm.globalCancel = context.WithCancel(context.Background())
	}

	// Re-check under the write lock: another goroutine may have created the
	// monitor between releasing the read lock and acquiring the write lock.
	if monitor, exists = mtm.monitors[orgID]; exists {
		return monitor, nil
	}

	if mtm.persistence == nil {
		return nil, fmt.Errorf("tenant persistence is not configured")
	}
	if orgID != "default" && !mtm.persistence.OrgExists(orgID) {
		return nil, fmt.Errorf("organization %q is not provisioned", orgID)
	}
	// Without a template there is nothing to copy; report it instead of
	// dereferencing a nil config below.
	if mtm.baseConfig == nil {
		return nil, fmt.Errorf("tenant base config is not configured")
	}

	// Initialize new monitor for this tenant
	log.Info().Str("org_id", orgID).Msg("initializing tenant monitor")

	// 1. Load Tenant Config
	// Deep copy the base config to ensure tenant isolation.
	// Each tenant gets its own independent config that won't share
	// credential slices or other mutable state with other tenants.
	tenantConfig := mtm.baseConfig.DeepCopy()

	// Clear inherited credentials - tenants must load their own
	// This prevents credential leakage between tenants
	tenantConfig.PVEInstances = nil
	tenantConfig.PBSInstances = nil
	tenantConfig.PMGInstances = nil

	// Ensure the DataPath is correct for this tenant to isolate storage (sqlite, etc)
	tenantPersistence, err := mtm.persistence.GetPersistence(orgID)
	if err != nil {
		return nil, fmt.Errorf("failed to get persistence for org %s: %w", orgID, err)
	}
	tenantConfig.DataPath = tenantPersistence.GetConfigDir()

	// Load tenant-specific nodes from <orgDir>/nodes.enc
	nodesConfig, err := tenantPersistence.LoadNodesConfig()
	if err != nil {
		// Not a fatal error - tenant may not have configured any nodes yet
		log.Warn().Err(err).Str("org_id", orgID).Msg("failed to load tenant nodes config, starting with empty config")
	} else if nodesConfig != nil {
		tenantConfig.PVEInstances = nodesConfig.PVEInstances
		tenantConfig.PBSInstances = nodesConfig.PBSInstances
		tenantConfig.PMGInstances = nodesConfig.PMGInstances
		log.Info().
			Str("org_id", orgID).
			Int("pve_count", len(nodesConfig.PVEInstances)).
			Int("pbs_count", len(nodesConfig.PBSInstances)).
			Int("pmg_count", len(nodesConfig.PMGInstances)).
			Msg("Loaded tenant nodes config")
	}

	// 2. Create Monitor
	monitor, err = New(tenantConfig)
	if err != nil {
		return nil, fmt.Errorf("failed to create monitor for org %s: %w", orgID, err)
	}

	// Set org ID for tenant isolation
	// This enables tenant-scoped WebSocket broadcasts
	monitor.SetOrgID(orgID)
	if mtm.recoveryMgr != nil {
		monitor.SetRecoveryManager(mtm.recoveryMgr)
	}
	// NOTE(review): the initializer runs while mtm.mu is write-locked, so a
	// callback that calls back into this manager would block — confirm that
	// no initializer does so.
	if mtm.initializer != nil {
		mtm.initializer(monitor)
	}

	// 3. Start Monitor with a tenant-scoped runtime so RemoveTenant can stop it cleanly.
	tenantCtx, tenantCancel := context.WithCancel(mtm.globalCtx)
	tenantDone := make(chan struct{})
	go func() {
		// Closing the channel tells Stop/RemoveTenant that Start has returned.
		defer close(tenantDone)
		monitor.Start(tenantCtx, mtm.wsHub)
	}()

	mtm.monitors[orgID] = monitor
	mtm.tenantCancel[orgID] = tenantCancel
	mtm.tenantDone[orgID] = tenantDone
	return monitor, nil
}
// PeekMonitor looks up an already-initialized tenant monitor without ever
// creating one. The organization ID is trimmed first; an empty ID yields
// (nil, false). The boolean reports whether the manager holds an entry for
// the organization.
func (mtm *MultiTenantMonitor) PeekMonitor(orgID string) (*Monitor, bool) {
	key := strings.TrimSpace(orgID)
	if key == "" {
		return nil, false
	}

	mtm.mu.RLock()
	found, ok := mtm.monitors[key]
	mtm.mu.RUnlock()

	return found, ok
}
// Stop stops all tenant monitors and clears the manager's tenant bookkeeping.
//
// The root context is cancelled and the per-tenant maps are reset under the
// lock; the actual shutdown work happens after the lock is released. Each
// tenant's context is cancelled, then the manager waits for every tenant's
// monitor goroutine to exit. The waits run concurrently, so the total wait is
// bounded by roughly one tenantMonitorShutdownTimeout rather than one per
// tenant. Finally Stop is called on each monitor in turn.
func (mtm *MultiTenantMonitor) Stop() {
	log.Info().Msg("stopping MultiTenantMonitor and all tenant instances")

	// tenantRuntime keeps a tenant's pieces together so that a shutdown
	// timeout can be attributed to the organization it belongs to.
	type tenantRuntime struct {
		orgID   string
		monitor *Monitor
		cancel  context.CancelFunc
		done    chan struct{}
	}

	mtm.mu.Lock()
	// globalCancel is nil when the manager was not built by the constructor.
	if mtm.globalCancel != nil {
		mtm.globalCancel()
	}
	tenants := make([]tenantRuntime, 0, len(mtm.monitors))
	for orgID, monitor := range mtm.monitors {
		tenants = append(tenants, tenantRuntime{
			orgID:   orgID,
			monitor: monitor,
			cancel:  mtm.tenantCancel[orgID],
			done:    mtm.tenantDone[orgID],
		})
	}
	mtm.tenantCancel = make(map[string]context.CancelFunc)
	mtm.tenantDone = make(map[string]chan struct{})
	mtm.monitors = make(map[string]*Monitor)
	mtm.mu.Unlock()

	// Signal every tenant before waiting on any of them.
	for _, tenant := range tenants {
		if tenant.cancel != nil {
			tenant.cancel()
		}
	}

	// Wait for all run loops at once so one stuck tenant cannot add its full
	// timeout on top of every other tenant's.
	var wg sync.WaitGroup
	for _, tenant := range tenants {
		if tenant.done == nil {
			continue
		}
		wg.Add(1)
		go func(orgID string, done <-chan struct{}) {
			defer wg.Done()
			waitForTenantMonitorShutdown(orgID, done)
		}(tenant.orgID, tenant.done)
	}
	wg.Wait()

	for _, tenant := range tenants {
		// Nil entries are skipped, matching the other methods on this type.
		if tenant.monitor == nil {
			continue
		}
		tenant.monitor.Stop()
	}
}
// RemoveTenant stops and removes a specific tenant's monitor.
// Useful for offboarding or manual reloading.
//
// The organization ID is trimmed first; an empty ID or an organization with
// no monitor entry is a no-op. The tenant's bookkeeping is removed under the
// lock. After the lock is released, the tenant context is cancelled, the
// monitor goroutine is awaited for up to tenantMonitorShutdownTimeout, and
// Stop is called on the monitor.
func (mtm *MultiTenantMonitor) RemoveTenant(orgID string) {
	orgID = strings.TrimSpace(orgID)
	if orgID == "" {
		return
	}

	mtm.mu.Lock()
	monitor, exists := mtm.monitors[orgID]
	cancel := mtm.tenantCancel[orgID]
	done := mtm.tenantDone[orgID]
	delete(mtm.monitors, orgID)
	delete(mtm.tenantCancel, orgID)
	delete(mtm.tenantDone, orgID)
	mtm.mu.Unlock()

	if !exists {
		return
	}

	log.Info().Str("org_id", orgID).Msg("stopping and removing tenant monitor")
	if cancel != nil {
		cancel()
	}
	waitForTenantMonitorShutdown(orgID, done)

	// A nil entry has nothing to stop; the other methods on this type skip
	// nil monitors in the same way.
	if monitor != nil {
		monitor.Stop()
	}
}
// waitForTenantMonitorShutdown blocks until done is closed or
// tenantMonitorShutdownTimeout elapses, whichever comes first. A nil channel
// returns immediately. On timeout a warning is logged; it carries the org_id
// field only when orgID is non-empty.
func waitForTenantMonitorShutdown(orgID string, done <-chan struct{}) {
	if done == nil {
		return
	}

	deadline := time.NewTimer(tenantMonitorShutdownTimeout)
	defer deadline.Stop()

	select {
	case <-done:
		// The monitor loop exited in time; nothing to report.
		return
	case <-deadline.C:
	}

	event := log.Warn().Dur("timeout", tenantMonitorShutdownTimeout)
	if orgID != "" {
		event = event.Str("org_id", orgID)
	}
	event.Msg("timed out waiting for tenant monitor loop to exit")
}
// OrgExists reports whether the persistence layer knows the organization.
// The ID is trimmed first; an empty ID or a manager without persistence
// yields false. Otherwise the answer is delegated to the persistence layer's
// own OrgExists.
func (mtm *MultiTenantMonitor) OrgExists(orgID string) bool {
	id := strings.TrimSpace(orgID)
	if id == "" || mtm.persistence == nil {
		return false
	}
	return mtm.persistence.OrgExists(id)
}