// Pulse/internal/servicediscovery/deep_scanner.go

package servicediscovery

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/google/uuid"
	"github.com/rs/zerolog/log"
)

// CommandExecutor executes commands on infrastructure.
//
// It is the DeepScanner's only route to remote systems: the scanner lists the
// connected agents to choose one, then sends each discovery command to that
// agent through ExecuteCommand.
type CommandExecutor interface {
	// ExecuteCommand sends cmd to the agent identified by agentID and returns
	// the agent's result, or an error if the command could not be executed.
	ExecuteCommand(ctx context.Context, agentID string, cmd ExecuteCommandPayload) (*CommandResultPayload, error)
	// GetConnectedAgents returns the agents that are currently connected.
	GetConnectedAgents() []ConnectedAgent
	// IsAgentConnected reports whether the agent with the given ID is connected.
	IsAgentConnected(agentID string) bool
}

// ExecuteCommandPayload mirrors agentexec.ExecuteCommandPayload
//
// It describes one command to run through an agent. DeepScanner.Scan fills
// RequestID with a freshly generated UUID, Command with the (possibly
// docker-wrapped) command line, TargetType/TargetID with the values derived
// from the resource type and resource ID, and Timeout with the discovery
// command's own Timeout value.
// NOTE(review): the unit of Timeout is not visible here (presumably seconds) —
// confirm against agentexec.
type ExecuteCommandPayload struct {
	RequestID  string `json:"request_id"`
	Command    string `json:"command"`
	TargetType string `json:"target_type"`         // "agent", "container", "vm"
	TargetID   string `json:"target_id,omitempty"` // VMID for container/VM
	Timeout    int    `json:"timeout,omitempty"`
}

// CommandResultPayload mirrors agentexec.CommandResultPayload
//
// It carries the outcome of one executed command. The scanner stores Stdout
// (with Stderr appended, or Stderr alone when Stdout is empty) as the command
// output, and treats Success == false as a failure whose message is Error.
// Duration is serialized as "duration_ms", i.e. it is expressed in
// milliseconds.
type CommandResultPayload struct {
	RequestID string `json:"request_id"`
	Success   bool   `json:"success"`
	Stdout    string `json:"stdout,omitempty"`
	Stderr    string `json:"stderr,omitempty"`
	ExitCode  int    `json:"exit_code"`
	Error     string `json:"error,omitempty"`
	Duration  int64  `json:"duration_ms"`
}

// ConnectedAgent mirrors agentexec.ConnectedAgent
//
// It describes one agent known to the CommandExecutor. The scanner only uses
// AgentID and Hostname, to match an agent against a scan target; the remaining
// fields are carried along unchanged.
type ConnectedAgent struct {
	AgentID     string
	Hostname    string
	Version     string
	Platform    string
	Tags        []string
	ConnectedAt time.Time
}

// ProgressCallback is called when discovery progress changes.
// The scanner passes a private copy of the progress record with ElapsedMs and
// PercentComplete filled in, so the callback may keep or modify the value
// without affecting the scanner's own state.
type ProgressCallback func(*DiscoveryProgress)

// DeepScanner runs discovery commands on resources.
//
// Commands are sent through executor to a connected agent. mu guards progress,
// maxParallel, timeout and progressCallback: they are read under the read lock
// and written under the write lock. progress holds one entry per scan that is
// currently running and is removed when that scan returns. maxParallel bounds
// how many commands of a single scan run at once, and timeout bounds each
// individual command.
type DeepScanner struct {
	executor         CommandExecutor
	mu               sync.RWMutex
	progress         map[string]*DiscoveryProgress // resourceID -> progress
	maxParallel      int
	timeout          time.Duration
	progressCallback ProgressCallback
}

// Defaults applied by NewDeepScanner, and used as a fallback by
// runtimeSettings whenever the configured value is zero or negative.
const (
	defaultDeepScannerMaxParallel = 3
	defaultDeepScannerTimeout     = 30 * time.Second
)

// NewDeepScanner builds a DeepScanner that sends its commands through executor.
// The scanner starts with an empty progress table, the default per-resource
// parallelism (up to 3 commands at once) and the default command timeout; no
// progress callback is installed.
func NewDeepScanner(executor CommandExecutor) *DeepScanner {
	scanner := new(DeepScanner)
	scanner.executor = executor
	scanner.progress = map[string]*DiscoveryProgress{}
	scanner.maxParallel = defaultDeepScannerMaxParallel
	scanner.timeout = defaultDeepScannerTimeout
	return scanner
}

// runtimeSettings returns the parallelism limit and per-command timeout to use
// for a scan. Both values are read under the scanner's read lock; a value that
// is zero or negative is reported with a warning and replaced by its default.
func (s *DeepScanner) runtimeSettings() (int, time.Duration) {
	// Snapshot both settings in one critical section so they are consistent.
	parallel, perCommand := func() (int, time.Duration) {
		s.mu.RLock()
		defer s.mu.RUnlock()
		return s.maxParallel, s.timeout
	}()

	if parallel < 1 {
		log.Warn().
			Int("max_parallel", parallel).
			Int("default", defaultDeepScannerMaxParallel).
			Msg("Invalid deep scanner max parallelism; using default")
		parallel = defaultDeepScannerMaxParallel
	}

	if perCommand <= 0 {
		log.Warn().
			Dur("timeout", perCommand).
			Dur("default", defaultDeepScannerTimeout).
			Msg("Invalid deep scanner timeout; using default")
		perCommand = defaultDeepScannerTimeout
	}

	return parallel, perCommand
}

// SetProgressCallback installs the function that is invoked whenever discovery
// progress changes, replacing any previously installed one. Passing nil
// disables notifications.
func (s *DeepScanner) SetProgressCallback(callback ProgressCallback) {
	s.mu.Lock()
	s.progressCallback = callback
	s.mu.Unlock()
}

// notifyProgress hands a snapshot of progress to the registered callback.
// Nothing happens when no callback is set or progress is nil. The callback
// receives a copy whose ElapsedMs (when StartedAt is set) and PercentComplete
// (when TotalSteps is positive) are computed here; the caller's value is left
// untouched.
func (s *DeepScanner) notifyProgress(progress *DiscoveryProgress) {
	s.mu.RLock()
	notify := s.progressCallback
	s.mu.RUnlock()

	if notify == nil || progress == nil {
		return
	}

	snapshot := *progress
	if started := progress.StartedAt; !started.IsZero() {
		snapshot.ElapsedMs = time.Since(started).Milliseconds()
	}
	if total := progress.TotalSteps; total > 0 {
		snapshot.PercentComplete = float64(progress.CompletedSteps) / float64(total) * 100
	}

	// The callback runs outside the scanner lock.
	notify(&snapshot)
}

// ScanResult contains the results of a deep scan.
//
// ResourceType, ResourceID and Hostname are copied from the request; TargetID
// is the canonical target ID derived from it. CommandOutputs maps a command
// name to its captured output (commands that produced no output are absent).
// Errors maps a command name to its failure message; failures of optional
// commands are not recorded. StartedAt and CompletedAt bracket the scan.
type ScanResult struct {
	ResourceType   ResourceType
	ResourceID     string
	TargetID       string
	Hostname       string
	CommandOutputs map[string]string
	Errors         map[string]string
	StartedAt      time.Time
	CompletedAt    time.Time
}

// Scan runs discovery commands on a resource and returns the outputs.
//
// The scan is registered in the progress table for its whole duration (see
// GetProgress / IsScanning) and the progress callback is notified at start,
// after every finished command, and on completion.
//
// An error is returned, with a nil result, only when a precondition fails: no
// executor is configured, no connected agent can be found for the target, or
// no commands are defined for the resource type. Failures of individual
// commands do not fail the scan; they are recorded in ScanResult.Errors
// (optional commands are logged but not recorded).
//
// Commands run concurrently, bounded by the scanner's parallelism setting, and
// each one is limited by the scanner's timeout. If ctx is cancelled, commands
// still waiting for a slot are skipped and the partial result is returned.
func (s *DeepScanner) Scan(ctx context.Context, req DiscoveryRequest) (*ScanResult, error) {
	requestTargetID := canonicalRequestTargetID(req)
	resourceID := MakeResourceID(req.ResourceType, requestTargetID, req.ResourceID)
	startTime := time.Now()
	scanLog := log.With().
		Str("component", "service_discovery_scanner").
		Str("resource_id", resourceID).
		Str("resource_type", string(req.ResourceType)).
		Str("target_id", requestTargetID).
		Str("hostname", req.Hostname).
		Logger()

	// Initialize progress
	s.mu.Lock()
	s.progress[resourceID] = &DiscoveryProgress{
		ResourceID:  resourceID,
		Status:      DiscoveryStatusRunning,
		CurrentStep: "initializing",
		StartedAt:   startTime,
	}
	initialProgress := *s.progress[resourceID]
	s.mu.Unlock()

	// Broadcast scan start
	s.notifyProgress(&initialProgress)

	// The progress entry only exists while the scan is running.
	defer func() {
		s.mu.Lock()
		delete(s.progress, resourceID)
		s.mu.Unlock()
	}()

	result := &ScanResult{
		ResourceType:   req.ResourceType,
		ResourceID:     req.ResourceID,
		TargetID:       requestTargetID,
		Hostname:       req.Hostname,
		CommandOutputs: make(map[string]string),
		Errors:         make(map[string]string),
		// Same instant as the progress record, so the logged duration and the
		// broadcast ElapsedMs agree.
		StartedAt: startTime,
	}

	// Check if we have an agent for this host
	if s.executor == nil {
		scanLog.Warn().
			Str("action", "scan_precondition_failed").
			Str("reason", "executor_missing").
			Msg("Deep scan unavailable")
		return nil, fmt.Errorf("no command executor available")
	}

	// Find the agent for this host
	agentID := s.findAgentForTarget(requestTargetID, req.Hostname)
	if agentID == "" {
		scanLog.Warn().
			Str("action", "scan_precondition_failed").
			Str("reason", "agent_not_connected").
			Msg("Deep scan unavailable")
		return nil, fmt.Errorf("no connected agent for target %s (%s)", requestTargetID, req.Hostname)
	}

	// Get commands for this resource type
	commands := GetCommandsForResource(req.ResourceType)
	if len(commands) == 0 {
		scanLog.Warn().
			Str("action", "scan_precondition_failed").
			Str("reason", "commands_not_defined").
			Msg("Deep scan unavailable")
		return nil, fmt.Errorf("no commands defined for resource type %s", req.ResourceType)
	}

	// Update progress
	s.mu.Lock()
	if prog, ok := s.progress[resourceID]; ok {
		prog.TotalSteps = len(commands)
		prog.CurrentStep = "running commands"
		progressCopy := *prog
		s.mu.Unlock()
		s.notifyProgress(&progressCopy)
	} else {
		s.mu.Unlock()
	}

	// Get runtime settings for parallelism and timeouts
	maxParallel, timeout := s.runtimeSettings()

	// The execution target is the same for every command, so derive it once.
	targetType := s.getTargetType(req.ResourceType)
	targetID := s.getTargetID(req.ResourceType, req.ResourceID)

	// Only validate TargetID when it will be interpolated into shell commands
	// by the agent (container/vm types). Agent/docker types don't use TargetID
	// in command wrapping, so they can have any format (including colons for IPv6).
	var targetIDErr error
	if targetType == "container" || targetType == "vm" {
		targetIDErr = ValidateResourceID(targetID)
	}

	// markStepDone records that a command has finished, whether it succeeded
	// or failed, and broadcasts the new progress. The callback is invoked
	// after the scanner lock has been released.
	markStepDone := func(commandName string) {
		s.mu.Lock()
		prog, ok := s.progress[resourceID]
		if !ok {
			s.mu.Unlock()
			return
		}
		prog.CompletedSteps++
		prog.CurrentCommand = commandName
		progressCopy := *prog
		s.mu.Unlock()
		s.notifyProgress(&progressCopy)
	}

	// Run commands with limited parallelism
	semaphore := make(chan struct{}, maxParallel)
	var wg sync.WaitGroup
	var resultsMu sync.Mutex // guards result.CommandOutputs and result.Errors

	for _, cmd := range commands {
		wg.Add(1)
		go func(cmd DiscoveryCommand) {
			defer wg.Done()

			select {
			case semaphore <- struct{}{}:
				defer func() { <-semaphore }()
			case <-ctx.Done():
				// Cancelled while waiting for a slot: the command never ran.
				return
			}

			// Every command that obtained a slot counts as a finished step,
			// including the failure paths below. Deferred calls run in reverse
			// order, so this fires after resultsMu has been released and the
			// listener is never called with that mutex held.
			defer markStepDone(cmd.Name)

			if targetIDErr != nil {
				resultsMu.Lock()
				result.Errors[cmd.Name] = fmt.Sprintf("invalid target ID: %v", targetIDErr)
				resultsMu.Unlock()
				return
			}

			// Build the actual command to run
			actualCmd := s.buildCommand(req.ResourceType, req.ResourceID, cmd.Command)

			// Execute the command
			cmdCtx, cancel := context.WithTimeout(ctx, timeout)
			defer cancel()

			cmdResult, err := s.executor.ExecuteCommand(cmdCtx, agentID, ExecuteCommandPayload{
				RequestID:  uuid.New().String(),
				Command:    actualCmd,
				TargetType: targetType,
				TargetID:   targetID,
				Timeout:    cmd.Timeout,
			})

			resultsMu.Lock()
			defer resultsMu.Unlock()

			if err != nil {
				if !cmd.Optional {
					result.Errors[cmd.Name] = err.Error()
				}
				scanLog.Debug().
					Str("action", "command_execute_failed").
					Err(err).
					Str("command", cmd.Name).
					Bool("optional", cmd.Optional).
					Msg("Command failed during discovery")
				return
			}

			if cmdResult == nil {
				return
			}

			// Prefer stdout; append stderr when both are present, and fall
			// back to stderr alone when stdout is empty.
			output := cmdResult.Stdout
			if cmdResult.Stderr != "" && output != "" {
				output += "\n--- stderr ---\n" + cmdResult.Stderr
			} else if cmdResult.Stderr != "" {
				output = cmdResult.Stderr
			}

			if output != "" {
				result.CommandOutputs[cmd.Name] = output
			}

			if cmdResult.Success {
				return
			}

			if cmdResult.Error != "" && !cmd.Optional {
				result.Errors[cmd.Name] = cmdResult.Error
			}

			// Failures of optional commands are expected noise; required ones
			// deserve a warning.
			event := scanLog.Debug()
			if !cmd.Optional {
				event = scanLog.Warn()
			}
			event.
				Str("action", "command_result_failed").
				Str("command", cmd.Name).
				Bool("optional", cmd.Optional).
				Int("exit_code", cmdResult.ExitCode).
				Str("request_id", cmdResult.RequestID).
				Str("command_error", cmdResult.Error).
				Msg("Deep scan command reported failure")
		}(cmd)
	}

	wg.Wait()
	result.CompletedAt = time.Now()

	// Broadcast scan completion
	completionProgress := DiscoveryProgress{
		ResourceID:      resourceID,
		Status:          DiscoveryStatusCompleted,
		CurrentStep:     "completed",
		TotalSteps:      len(commands),
		CompletedSteps:  len(commands),
		StartedAt:       startTime,
		ElapsedMs:       result.CompletedAt.Sub(startTime).Milliseconds(),
		PercentComplete: 100,
	}
	s.notifyProgress(&completionProgress)

	scanLog.Info().
		Str("action", "scan_completed").
		Int("outputs", len(result.CommandOutputs)).
		Int("errors", len(result.Errors)).
		Dur("duration", result.CompletedAt.Sub(result.StartedAt)).
		Msg("Deep scan completed")

	return result, nil
}

// buildCommand returns the command line to send to the agent for the given
// resource.
//
// Only Docker targets are wrapped here, because Docker isn't a recognized
// TargetType in the agent:
//   - plain Docker containers are wrapped using resourceID as the container;
//   - Docker nested in a system container or VM uses a resourceID of the form
//     "vmid:container_name" and is wrapped using the container name; the agent
//     adds the outer pct exec / qm guest exec itself. A resourceID without a
//     second segment leaves the command unwrapped.
//
// Every other resource type (system container, VM, agent host, unknown)
// returns cmd unchanged; for LXC/VM the agent handles wrapping based on
// TargetType.
func (s *DeepScanner) buildCommand(resourceType ResourceType, resourceID string, cmd string) string {
	switch resourceType {
	case ResourceTypeDocker:
		return BuildDockerCommand(resourceID, cmd)
	case ResourceTypeDockerSystemContainer, ResourceTypeDockerVM:
		if segments := splitResourceID(resourceID); len(segments) > 1 {
			return BuildDockerCommand(segments[1], cmd)
		}
	}
	return cmd
}

// getTargetType maps a resource type to the TargetType placed in the agent
// execution payload:
//   - "container" for system containers, including Docker nested inside one
//     (the agent wraps with pct exec);
//   - "vm" for VMs, including Docker nested inside one (the agent wraps with
//     qm guest exec);
//   - "agent" for everything else, i.e. the agent host itself, plain Docker
//     (run on the host via docker exec) and unknown types.
func (s *DeepScanner) getTargetType(resourceType ResourceType) string {
	switch resourceType {
	case ResourceTypeSystemContainer, ResourceTypeDockerSystemContainer:
		return "container"
	case ResourceTypeVM, ResourceTypeDockerVM:
		return "vm"
	default:
		return "agent"
	}
}

// getTargetID returns the TargetID placed in the agent execution payload.
// Nested Docker resources (Docker inside a system container or VM) carry a
// compound ID of the form "vmid:container_name", of which only the vmid part
// is returned. All other resource types, and compound IDs that yield no
// segments, return resourceID unchanged.
func (s *DeepScanner) getTargetID(resourceType ResourceType, resourceID string) string {
	nested := resourceType == ResourceTypeDockerSystemContainer || resourceType == ResourceTypeDockerVM
	if !nested {
		return resourceID
	}
	if segments := splitResourceID(resourceID); len(segments) > 0 {
		return segments[0]
	}
	return resourceID
}

// findAgentForTarget picks the connected agent that should execute commands
// for the given canonical target and returns its ID, or "" when none fits.
//
// Candidates are considered in this order of preference:
//  1. an agent whose ID equals targetID;
//  2. the first agent whose hostname equals hostname or targetID;
//  3. the only connected agent, when exactly one is connected.
//
// The lookup and every connected agent are logged at debug level.
func (s *DeepScanner) findAgentForTarget(targetID, hostname string) string {
	connected := s.executor.GetConnectedAgents()

	log.Debug().
		Str("component", "service_discovery_scanner").
		Str("action", "find_agent_for_target").
		Str("target_id", targetID).
		Str("hostname", hostname).
		Int("connected_agents", len(connected)).
		Msg("Finding agent for target")

	// Log connected agents for debugging
	for i := range connected {
		log.Debug().
			Str("component", "service_discovery_scanner").
			Str("action", "find_agent_for_target").
			Str("agent_id", connected[i].AgentID).
			Str("agent_hostname", connected[i].Hostname).
			Msg("Connected agent")
	}

	// Single pass: an exact ID match wins immediately, while the first
	// hostname match is remembered as the fallback.
	var (
		byHostname    string
		hostnameFound bool
	)
	for i := range connected {
		candidate := connected[i]
		if candidate.AgentID == targetID {
			return candidate.AgentID
		}
		if hostnameFound {
			continue
		}
		if candidate.Hostname == hostname || candidate.Hostname == targetID {
			byHostname = candidate.AgentID
			hostnameFound = true
		}
	}
	if hostnameFound {
		return byHostname
	}

	// If only one agent connected, use it
	if len(connected) == 1 {
		return connected[0].AgentID
	}

	return ""
}

// GetProgress returns a snapshot of the progress of the scan running for
// resourceID, or nil when no scan is in progress for that resource.
// The snapshot is a copy taken under the read lock, so the caller never shares
// memory with the scan goroutines that keep updating the live record.
func (s *DeepScanner) GetProgress(resourceID string) *DiscoveryProgress {
	s.mu.RLock()
	defer s.mu.RUnlock()

	current, found := s.progress[resourceID]
	if !found {
		return nil
	}

	snapshot := *current
	return &snapshot
}

// IsScanning reports whether a scan is currently running for resourceID, i.e.
// whether the resource has an entry in the progress table.
func (s *DeepScanner) IsScanning(resourceID string) bool {
	s.mu.RLock()
	_, active := s.progress[resourceID]
	s.mu.RUnlock()
	return active
}

// splitResourceID breaks a compound resource ID such as "101:container_name"
// into its colon-separated segments.
//
// Empty segments are kept, except that a trailing empty segment is dropped:
// "a:" yields ["a"], ":" yields [""], and the empty string yields nil.
func splitResourceID(id string) []string {
	var segments []string
	begin := 0
	// ':' is a single ASCII byte that never occurs inside a multi-byte UTF-8
	// sequence, so scanning bytes finds exactly the same separators as
	// scanning runes.
	for pos := 0; pos < len(id); pos++ {
		if id[pos] != ':' {
			continue
		}
		segments = append(segments, id[begin:pos])
		begin = pos + 1
	}
	if begin < len(id) {
		segments = append(segments, id[begin:])
	}
	return segments
}

// ScanDocker runs discovery on a Docker container via the target agent.
// It is a convenience wrapper around Scan for ResourceTypeDocker, using
// containerName as the resource ID.
func (s *DeepScanner) ScanDocker(ctx context.Context, targetID, hostname, containerName string) (*ScanResult, error) {
	return s.Scan(ctx, DiscoveryRequest{
		ResourceType: ResourceTypeDocker,
		ResourceID:   containerName,
		TargetID:     targetID,
		Hostname:     hostname,
	})
}

// ScanSystemContainer runs discovery on a system container (LXC).
// It is a convenience wrapper around Scan for ResourceTypeSystemContainer,
// using vmid as the resource ID.
func (s *DeepScanner) ScanSystemContainer(ctx context.Context, targetID, hostname, vmid string) (*ScanResult, error) {
	return s.Scan(ctx, DiscoveryRequest{
		ResourceType: ResourceTypeSystemContainer,
		ResourceID:   vmid,
		TargetID:     targetID,
		Hostname:     hostname,
	})
}

// ScanVM runs discovery on a VM via QEMU guest agent.
// It is a convenience wrapper around Scan for ResourceTypeVM, using vmid as
// the resource ID.
func (s *DeepScanner) ScanVM(ctx context.Context, targetID, hostname, vmid string) (*ScanResult, error) {
	return s.Scan(ctx, DiscoveryRequest{
		ResourceType: ResourceTypeVM,
		ResourceID:   vmid,
		TargetID:     targetID,
		Hostname:     hostname,
	})
}

// ScanHost runs discovery on an agent target system.
// It is a convenience wrapper around Scan for ResourceTypeAgent; targetID
// serves as both the resource ID and the target ID.
func (s *DeepScanner) ScanHost(ctx context.Context, targetID, hostname string) (*ScanResult, error) {
	return s.Scan(ctx, DiscoveryRequest{
		ResourceType: ResourceTypeAgent,
		ResourceID:   targetID,
		TargetID:     targetID,
		Hostname:     hostname,
	})
}