Pulse/internal/servicediscovery/deep_scanner.go
2026-03-18 16:06:30 +00:00

561 lines
16 KiB
Go

package servicediscovery
import (
"context"
"fmt"
"sync"
"time"
"github.com/google/uuid"
"github.com/rs/zerolog/log"
)
// CommandExecutor executes commands on infrastructure.
// DeepScanner uses it to list the connected agents and to run each discovery
// command through the agent it selected.
type CommandExecutor interface {
// ExecuteCommand runs cmd through the agent identified by agentID and returns
// the command result, or an error if the command could not be executed.
ExecuteCommand(ctx context.Context, agentID string, cmd ExecuteCommandPayload) (*CommandResultPayload, error)
// GetConnectedAgents returns the agents DeepScanner may pick from when
// resolving a scan target.
GetConnectedAgents() []ConnectedAgent
// IsAgentConnected presumably reports whether the agent with the given ID is
// currently connected; it is not called in this file — verify against implementations.
IsAgentConnected(agentID string) bool
}
// ExecuteCommandPayload mirrors agentexec.ExecuteCommandPayload
// It describes a single command that an agent is asked to run.
type ExecuteCommandPayload struct {
// RequestID identifies the request; Scan fills it with a freshly generated UUID.
RequestID string `json:"request_id"`
// Command is the command line to execute.
Command string `json:"command"`
TargetType string `json:"target_type"` // "agent", "container", "vm"
TargetID string `json:"target_id,omitempty"` // VMID for container/VM
// Timeout is the per-command timeout taken from the discovery command
// definition. NOTE(review): the unit is not visible here — confirm against agentexec.
Timeout int `json:"timeout,omitempty"`
}
// CommandResultPayload mirrors agentexec.CommandResultPayload
// It carries the outcome of one executed command.
type CommandResultPayload struct {
// RequestID is the request identifier reported with the result.
RequestID string `json:"request_id"`
// Success is false when the command reported a failure.
Success bool `json:"success"`
// Stdout and Stderr hold the captured output streams; Scan merges both into
// the recorded command output.
Stdout string `json:"stdout,omitempty"`
Stderr string `json:"stderr,omitempty"`
// ExitCode is the exit code reported for the command.
ExitCode int `json:"exit_code"`
// Error is an optional failure description; Scan records it for failed,
// non-optional commands.
Error string `json:"error,omitempty"`
// Duration is serialized as "duration_ms" (presumably milliseconds — confirm against agentexec).
Duration int64 `json:"duration_ms"`
}
// ConnectedAgent mirrors agentexec.ConnectedAgent
// It describes one agent returned by CommandExecutor.GetConnectedAgents.
type ConnectedAgent struct {
// AgentID is compared against the scan target ID when resolving an agent.
AgentID string
// Hostname is compared against the requested hostname and target ID when no
// agent ID matches.
Hostname string
// Version, Platform, Tags and ConnectedAt are carried along but not read by
// the scanner in this file.
Version string
Platform string
Tags []string
ConnectedAt time.Time
}
// ProgressCallback is called when discovery progress changes.
// The scanner passes a copy of the progress record, so the callback may keep
// or modify the value it receives without affecting the scanner's own state.
type ProgressCallback func(*DiscoveryProgress)
// DeepScanner runs discovery commands on resources.
// It executes commands through a CommandExecutor, tracks per-resource progress
// while a scan is running, and can report progress changes through a callback.
type DeepScanner struct {
// executor runs commands on connected agents; Scan returns an error when it is nil.
executor CommandExecutor
// mu guards progress, maxParallel, timeout and progressCallback.
mu sync.RWMutex
progress map[string]*DiscoveryProgress // resourceID -> progress
// maxParallel caps how many commands of a single scan run at the same time.
maxParallel int
// timeout bounds the context of each individual command execution.
timeout time.Duration
// progressCallback, when non-nil, receives progress snapshots.
progressCallback ProgressCallback
}
// Defaults applied by NewDeepScanner and used by runtimeSettings as fallbacks
// when the configured values are not positive.
const (
// defaultDeepScannerMaxParallel is the default number of commands run
// concurrently within one scan.
defaultDeepScannerMaxParallel = 3
// defaultDeepScannerTimeout is the default time limit for a single command.
defaultDeepScannerTimeout = 30 * time.Second
)
// NewDeepScanner returns a DeepScanner that runs commands through executor.
// The scanner starts with an empty progress table and the package defaults
// for per-scan parallelism and per-command timeout.
func NewDeepScanner(executor CommandExecutor) *DeepScanner {
	scanner := new(DeepScanner)
	scanner.executor = executor
	scanner.progress = make(map[string]*DiscoveryProgress)
	scanner.maxParallel = defaultDeepScannerMaxParallel
	scanner.timeout = defaultDeepScannerTimeout
	return scanner
}
// runtimeSettings returns the parallelism limit and per-command timeout to use
// for a scan. The configured values are read under the scanner lock; any value
// that is not positive is reported with a warning and replaced by its default.
func (s *DeepScanner) runtimeSettings() (int, time.Duration) {
	s.mu.RLock()
	parallel, perCommand := s.maxParallel, s.timeout
	s.mu.RUnlock()

	if parallel < 1 {
		log.Warn().
			Int("max_parallel", parallel).
			Int("default", defaultDeepScannerMaxParallel).
			Msg("Invalid deep scanner max parallelism; using default")
		parallel = defaultDeepScannerMaxParallel
	}

	if perCommand <= 0 {
		log.Warn().
			Dur("timeout", perCommand).
			Dur("default", defaultDeepScannerTimeout).
			Msg("Invalid deep scanner timeout; using default")
		perCommand = defaultDeepScannerTimeout
	}

	return parallel, perCommand
}
// SetProgressCallback installs the function invoked whenever discovery
// progress changes. Passing nil disables notifications.
func (s *DeepScanner) SetProgressCallback(callback ProgressCallback) {
	s.mu.Lock()
	s.progressCallback = callback
	s.mu.Unlock()
}
// notifyProgress hands a snapshot of progress to the registered callback.
// It does nothing when no callback is set or progress is nil. The callback
// receives a copy whose ElapsedMs and PercentComplete are recomputed from
// StartedAt and the step counters; the caller's value is never modified.
func (s *DeepScanner) notifyProgress(progress *DiscoveryProgress) {
	// Read the callback under the lock, but invoke it after releasing it.
	s.mu.RLock()
	notify := s.progressCallback
	s.mu.RUnlock()

	if notify == nil || progress == nil {
		return
	}

	snapshot := *progress
	if started := progress.StartedAt; !started.IsZero() {
		snapshot.ElapsedMs = time.Since(started).Milliseconds()
	}
	if total := progress.TotalSteps; total > 0 {
		snapshot.PercentComplete = float64(progress.CompletedSteps) / float64(total) * 100
	}
	notify(&snapshot)
}
// ScanResult contains the results of a deep scan.
type ScanResult struct {
// ResourceType, ResourceID and Hostname are copied from the discovery request.
ResourceType ResourceType
ResourceID string
// TargetID is the canonical target ID derived from the request.
TargetID string
Hostname string
// CommandOutputs maps a command name to its captured output (stdout and,
// when present, stderr). Commands that produced no output have no entry.
CommandOutputs map[string]string
// Errors maps a command name to the error recorded for it.
Errors map[string]string
// StartedAt and CompletedAt bracket the scan.
StartedAt time.Time
CompletedAt time.Time
}
// Scan runs the discovery commands defined for req.ResourceType against the
// resource described by req and returns their collected outputs.
//
// The scan is registered in the progress table for its whole duration (so
// GetProgress and IsScanning can observe it) and progress snapshots are sent
// to the registered callback when the scan starts, when the command list is
// known, after every finished command, and on completion.
//
// An error is returned, and no result, when no executor is configured, no
// connected agent can be resolved for the target, or no commands are defined
// for the resource type. Failures of individual commands do not fail the
// scan: they are recorded in ScanResult.Errors (optional commands are only
// logged). If ctx is cancelled, commands still waiting for a slot are
// skipped and the partial result is returned.
func (s *DeepScanner) Scan(ctx context.Context, req DiscoveryRequest) (*ScanResult, error) {
	requestTargetID := canonicalRequestTargetID(req)
	resourceID := MakeResourceID(req.ResourceType, requestTargetID, req.ResourceID)
	startTime := time.Now()
	scanLog := log.With().
		Str("component", "service_discovery_scanner").
		Str("resource_id", resourceID).
		Str("resource_type", string(req.ResourceType)).
		Str("target_id", requestTargetID).
		Str("hostname", req.Hostname).
		Logger()

	// Register the scan and broadcast its start.
	s.mu.Lock()
	s.progress[resourceID] = &DiscoveryProgress{
		ResourceID:  resourceID,
		Status:      DiscoveryStatusRunning,
		CurrentStep: "initializing",
		StartedAt:   startTime,
	}
	initialProgress := *s.progress[resourceID]
	s.mu.Unlock()
	s.notifyProgress(&initialProgress)

	// The progress entry only exists while the scan is running.
	defer func() {
		s.mu.Lock()
		delete(s.progress, resourceID)
		s.mu.Unlock()
	}()

	// Check if we have an executor at all.
	if s.executor == nil {
		scanLog.Warn().
			Str("action", "scan_precondition_failed").
			Str("reason", "executor_missing").
			Msg("Deep scan unavailable")
		return nil, fmt.Errorf("no command executor available")
	}

	// Find the agent for this host.
	agentID := s.findAgentForTarget(requestTargetID, req.Hostname)
	if agentID == "" {
		scanLog.Warn().
			Str("action", "scan_precondition_failed").
			Str("reason", "agent_not_connected").
			Msg("Deep scan unavailable")
		return nil, fmt.Errorf("no connected agent for target %s (%s)", requestTargetID, req.Hostname)
	}

	// Get commands for this resource type.
	commands := GetCommandsForResource(req.ResourceType)
	if len(commands) == 0 {
		scanLog.Warn().
			Str("action", "scan_precondition_failed").
			Str("reason", "commands_not_defined").
			Msg("Deep scan unavailable")
		return nil, fmt.Errorf("no commands defined for resource type %s", req.ResourceType)
	}

	// Use the same start instant as the progress record so reported
	// durations agree with each other.
	result := &ScanResult{
		ResourceType:   req.ResourceType,
		ResourceID:     req.ResourceID,
		TargetID:       requestTargetID,
		Hostname:       req.Hostname,
		CommandOutputs: make(map[string]string),
		Errors:         make(map[string]string),
		StartedAt:      startTime,
	}

	// Publish the number of steps now that the command list is known.
	var runningProgress DiscoveryProgress
	s.mu.Lock()
	prog, tracked := s.progress[resourceID]
	if tracked {
		prog.TotalSteps = len(commands)
		prog.CurrentStep = "running commands"
		runningProgress = *prog
	}
	s.mu.Unlock()
	if tracked {
		s.notifyProgress(&runningProgress)
	}

	// Get runtime settings for parallelism and timeouts.
	maxParallel, timeout := s.runtimeSettings()

	// Target type, target ID and their validation are identical for every
	// command, so resolve them once instead of once per goroutine.
	// Only validate TargetID when it will be interpolated into shell commands
	// by the agent (container/vm types). Agent/docker types don't use TargetID
	// in command wrapping, so they can have any format (including colons for IPv6).
	targetType := s.getTargetType(req.ResourceType)
	targetID := s.getTargetID(req.ResourceType, req.ResourceID)
	var targetErr error
	if targetType == "container" || targetType == "vm" {
		targetErr = ValidateResourceID(targetID)
	}

	// Run commands with limited parallelism.
	semaphore := make(chan struct{}, maxParallel)
	var wg sync.WaitGroup
	var resultMu sync.Mutex // guards result.CommandOutputs and result.Errors

	for _, cmd := range commands {
		wg.Add(1)
		go func(cmd DiscoveryCommand) {
			defer wg.Done()

			select {
			case semaphore <- struct{}{}:
				defer func() { <-semaphore }()
			case <-ctx.Done():
				// Never started: not counted as a finished step.
				return
			}

			// Every command that obtained a slot counts as a finished step,
			// whether it succeeded or not, so progress keeps moving when
			// commands fail. Deferred so it runs after resultMu is released
			// and the user callback is never invoked under that lock.
			defer s.advanceProgress(resourceID, cmd.Name)

			if targetErr != nil {
				resultMu.Lock()
				result.Errors[cmd.Name] = fmt.Sprintf("invalid target ID: %v", targetErr)
				resultMu.Unlock()
				return
			}

			// Execute the command with the scanner-wide timeout applied.
			cmdCtx, cancel := context.WithTimeout(ctx, timeout)
			defer cancel()

			cmdResult, err := s.executor.ExecuteCommand(cmdCtx, agentID, ExecuteCommandPayload{
				RequestID:  uuid.New().String(),
				Command:    s.buildCommand(req.ResourceType, req.ResourceID, cmd.Command),
				TargetType: targetType,
				TargetID:   targetID,
				Timeout:    cmd.Timeout,
			})
			if err != nil {
				// Failures of optional commands are logged but not reported.
				if !cmd.Optional {
					resultMu.Lock()
					result.Errors[cmd.Name] = err.Error()
					resultMu.Unlock()
				}
				scanLog.Debug().
					Str("action", "command_execute_failed").
					Err(err).
					Str("command", cmd.Name).
					Bool("optional", cmd.Optional).
					Msg("Command failed during discovery")
				return
			}
			if cmdResult == nil {
				return
			}

			// Merge stdout and stderr into a single recorded output.
			output := cmdResult.Stdout
			if cmdResult.Stderr != "" && output != "" {
				output += "\n--- stderr ---\n" + cmdResult.Stderr
			} else if cmdResult.Stderr != "" {
				output = cmdResult.Stderr
			}

			resultMu.Lock()
			if output != "" {
				result.CommandOutputs[cmd.Name] = output
			}
			if !cmdResult.Success && cmdResult.Error != "" && !cmd.Optional {
				result.Errors[cmd.Name] = cmdResult.Error
			}
			resultMu.Unlock()

			if !cmdResult.Success {
				// Required commands warn; optional ones only log at debug level.
				event := scanLog.Debug()
				if !cmd.Optional {
					event = scanLog.Warn()
				}
				event.
					Str("action", "command_result_failed").
					Str("command", cmd.Name).
					Bool("optional", cmd.Optional).
					Int("exit_code", cmdResult.ExitCode).
					Str("request_id", cmdResult.RequestID).
					Str("command_error", cmdResult.Error).
					Msg("Deep scan command reported failure")
			}
		}(cmd)
	}

	wg.Wait()
	result.CompletedAt = time.Now()

	// Broadcast scan completion.
	completionProgress := DiscoveryProgress{
		ResourceID:      resourceID,
		Status:          DiscoveryStatusCompleted,
		CurrentStep:     "completed",
		TotalSteps:      len(commands),
		CompletedSteps:  len(commands),
		StartedAt:       startTime,
		ElapsedMs:       result.CompletedAt.Sub(startTime).Milliseconds(),
		PercentComplete: 100,
	}
	s.notifyProgress(&completionProgress)

	scanLog.Info().
		Str("action", "scan_completed").
		Int("outputs", len(result.CommandOutputs)).
		Int("errors", len(result.Errors)).
		Dur("duration", result.CompletedAt.Sub(result.StartedAt)).
		Msg("Deep scan completed")

	return result, nil
}

// advanceProgress marks one more step of the scan for resourceID as finished,
// records commandName as the most recent command, and broadcasts a snapshot.
// It is a no-op when the resource has no progress entry.
func (s *DeepScanner) advanceProgress(resourceID, commandName string) {
	s.mu.Lock()
	prog, ok := s.progress[resourceID]
	if !ok {
		s.mu.Unlock()
		return
	}
	prog.CompletedSteps++
	prog.CurrentCommand = commandName
	snapshot := *prog
	s.mu.Unlock()

	// Notify outside the lock: the callback is user code.
	s.notifyProgress(&snapshot)
}
// buildCommand returns the command line to send to the agent for the given
// resource type.
//
// LXC and VM targets are wrapped by the agent itself (pct exec / qm guest
// exec, selected via TargetType), and host commands run as-is, so those are
// returned unchanged. Docker is not a TargetType the agent knows, so Docker
// commands are wrapped here. For Docker nested in a system container or VM the
// resourceID has the form "vmid:container_name": the agent handles the outer
// layer and only the docker exec wrapper for the container name is added here.
// A nested ID without a container part leaves the command unwrapped.
func (s *DeepScanner) buildCommand(resourceType ResourceType, resourceID string, cmd string) string {
	switch resourceType {
	case ResourceTypeDocker:
		return BuildDockerCommand(resourceID, cmd)
	case ResourceTypeDockerSystemContainer, ResourceTypeDockerVM:
		if segments := splitResourceID(resourceID); len(segments) >= 2 {
			return BuildDockerCommand(segments[1], cmd)
		}
	}
	// System containers, VMs, agents and unknown types: no wrapping here.
	return cmd
}
// getTargetType maps a resource type to the TargetType sent in the agent
// execution payload: "container" for system containers (including Docker
// nested inside one, which the agent reaches via pct exec), "vm" for VMs
// (including Docker nested inside one, reached via qm guest exec), and
// "agent" for everything else — plain Docker runs on the agent host through
// docker exec.
func (s *DeepScanner) getTargetType(resourceType ResourceType) string {
	switch resourceType {
	case ResourceTypeSystemContainer, ResourceTypeDockerSystemContainer:
		return "container"
	case ResourceTypeVM, ResourceTypeDockerVM:
		return "vm"
	}
	return "agent"
}
// getTargetID returns the TargetID to send in the agent execution payload.
// For Docker nested in a system container or VM the resourceID has the form
// "vmid:container_name" and only the vmid part is returned; every other
// resource type uses resourceID unchanged.
func (s *DeepScanner) getTargetID(resourceType ResourceType, resourceID string) string {
	nested := resourceType == ResourceTypeDockerSystemContainer || resourceType == ResourceTypeDockerVM
	if !nested {
		return resourceID
	}
	if segments := splitResourceID(resourceID); len(segments) > 0 {
		return segments[0]
	}
	return resourceID
}
// findAgentForTarget resolves the ID of the connected agent that should run
// commands for the given canonical target. Candidates are tried in order:
// an agent whose ID equals targetID, then an agent whose hostname equals
// hostname or targetID, and finally the sole connected agent if exactly one
// is connected. It returns "" when none of these apply.
func (s *DeepScanner) findAgentForTarget(targetID, hostname string) string {
	connected := s.executor.GetConnectedAgents()

	log.Debug().
		Str("component", "service_discovery_scanner").
		Str("action", "find_agent_for_target").
		Str("target_id", targetID).
		Str("hostname", hostname).
		Int("connected_agents", len(connected)).
		Msg("Finding agent for target")

	// List every candidate to make mismatches easy to diagnose.
	for i := range connected {
		log.Debug().
			Str("component", "service_discovery_scanner").
			Str("action", "find_agent_for_target").
			Str("agent_id", connected[i].AgentID).
			Str("agent_hostname", connected[i].Hostname).
			Msg("Connected agent")
	}

	// An exact agent ID match wins over any hostname match.
	for i := range connected {
		if connected[i].AgentID == targetID {
			return connected[i].AgentID
		}
	}

	// Otherwise accept an agent whose hostname is the requested hostname or
	// the target ID.
	for i := range connected {
		switch connected[i].Hostname {
		case hostname, targetID:
			return connected[i].AgentID
		}
	}

	// Last resort: a single connected agent is assumed to be the one.
	if len(connected) == 1 {
		return connected[0].AgentID
	}
	return ""
}
// GetProgress returns a snapshot of the progress of the scan running for
// resourceID, or nil when no scan is in progress for that resource.
// The returned value is a copy, so callers never share memory with the scan
// goroutines that keep updating the live record.
func (s *DeepScanner) GetProgress(resourceID string) *DiscoveryProgress {
	s.mu.RLock()
	defer s.mu.RUnlock()

	current, found := s.progress[resourceID]
	if !found {
		return nil
	}
	snapshot := *current
	return &snapshot
}
// IsScanning reports whether a scan is currently registered for resourceID.
func (s *DeepScanner) IsScanning(resourceID string) bool {
	s.mu.RLock()
	_, active := s.progress[resourceID]
	s.mu.RUnlock()
	return active
}
// splitResourceID breaks a compound resource ID such as "101:container_name"
// into its colon-separated segments. Empty segments in the middle or at the
// start are kept, but an empty segment after a trailing colon is dropped, and
// an empty ID yields a nil slice.
func splitResourceID(id string) []string {
	var segments []string
	segStart := 0
	// ':' is a single ASCII byte, so scanning bytes finds the same separators
	// as scanning runes.
	for pos := 0; pos < len(id); pos++ {
		if id[pos] != ':' {
			continue
		}
		segments = append(segments, id[segStart:pos])
		segStart = pos + 1
	}
	// Keep the final segment only when it is non-empty.
	if segStart < len(id) {
		segments = append(segments, id[segStart:])
	}
	return segments
}
// ScanDocker scans the Docker container named containerName through the agent
// resolved for targetID/hostname. It is a convenience wrapper around Scan.
func (s *DeepScanner) ScanDocker(ctx context.Context, targetID, hostname, containerName string) (*ScanResult, error) {
	return s.Scan(ctx, DiscoveryRequest{
		ResourceType: ResourceTypeDocker,
		ResourceID:   containerName,
		TargetID:     targetID,
		Hostname:     hostname,
	})
}
// ScanSystemContainer scans the system container (LXC) identified by vmid
// through the agent resolved for targetID/hostname. It is a convenience
// wrapper around Scan.
func (s *DeepScanner) ScanSystemContainer(ctx context.Context, targetID, hostname, vmid string) (*ScanResult, error) {
	return s.Scan(ctx, DiscoveryRequest{
		ResourceType: ResourceTypeSystemContainer,
		ResourceID:   vmid,
		TargetID:     targetID,
		Hostname:     hostname,
	})
}
// ScanVM scans the VM identified by vmid (reached via the QEMU guest agent)
// through the agent resolved for targetID/hostname. It is a convenience
// wrapper around Scan.
func (s *DeepScanner) ScanVM(ctx context.Context, targetID, hostname, vmid string) (*ScanResult, error) {
	return s.Scan(ctx, DiscoveryRequest{
		ResourceType: ResourceTypeVM,
		ResourceID:   vmid,
		TargetID:     targetID,
		Hostname:     hostname,
	})
}
// ScanHost scans the agent's own host system. The target ID doubles as the
// resource ID. It is a convenience wrapper around Scan.
func (s *DeepScanner) ScanHost(ctx context.Context, targetID, hostname string) (*ScanResult, error) {
	return s.Scan(ctx, DiscoveryRequest{
		ResourceType: ResourceTypeAgent,
		ResourceID:   targetID,
		TargetID:     targetID,
		Hostname:     hostname,
	})
}