Pulse/internal/infradiscovery/service.go

// Package infradiscovery provides infrastructure discovery for detecting
// applications and services running on monitored hosts. It uses LLM analysis to
// identify services from Docker containers, enabling AI systems like Patrol to
// understand where services run and propose correct remediation commands.
package infradiscovery

import (
	"context"
	"encoding/json"
	"fmt"
	"strings"
	"sync"
	"time"

	"github.com/rcourtman/pulse-go-rewrite/internal/ai/knowledge"
	"github.com/rcourtman/pulse-go-rewrite/internal/models"
	"github.com/rs/zerolog/log"
)

// StateProvider provides access to the current infrastructure state.
// The discovery service calls GetState at the start of every scan and reads
// the Docker hosts (and their containers) from the returned snapshot.
type StateProvider interface {
	// GetState returns a snapshot of the current infrastructure state.
	GetState() models.StateSnapshot
}

// AIAnalyzer provides AI analysis capabilities for discovery.
// This interface allows the discovery service to use LLM analysis
// without creating circular dependencies with the AI package.
type AIAnalyzer interface {
	// AnalyzeForDiscovery sends a prompt to the AI and returns the raw
	// response text. The method takes no model argument; which model is used
	// is left to the implementation.
	AnalyzeForDiscovery(ctx context.Context, prompt string) (string, error)
}

// DiscoveredApp represents a detected application or service.
//
// The discovery code in this file only produces Docker-based entries: RunsIn
// is always set to "docker" and ServiceUnit is left empty.
type DiscoveredApp struct {
	ID            string    `json:"id"`             // Unique ID, formatted as "docker:<hostname>:<container name>"
	Type          string    `json:"type"`           // Application type as reported by the AI: pbs, postgres, nginx, custom, etc.
	Name          string    `json:"name"`           // Human-readable name: "Proxmox Backup Server"
	Category      string    `json:"category"`       // Category: backup, database, web, monitoring, unknown
	RunsIn        string    `json:"runs_in"`        // Runtime: docker, systemd, native
	HostID        string    `json:"host_id"`        // Host identifier (the Docker host's agent ID)
	Hostname      string    `json:"hostname"`       // Human-readable hostname
	ContainerID   string    `json:"container_id"`   // Docker container ID (if applicable)
	ContainerName string    `json:"container_name"` // Docker container name (if applicable)
	ServiceUnit   string    `json:"service_unit"`   // Systemd unit name (if applicable)
	Ports         []int     `json:"ports"`          // Exposed ports: the public port when mapped, otherwise the private port
	CLIAccess     string    `json:"cli_access"`     // How to access CLI: "docker exec pbs proxmox-backup-manager"
	Confidence    float64   `json:"confidence"`     // Detection confidence 0-1
	DetectedAt    time.Time `json:"detected_at"`    // When this app was detected (set on every scan, cached or not)
	AIReasoning   string    `json:"ai_reasoning"`   // AI's reasoning for the identification
}

// ContainerInfo holds information about a container for AI analysis.
// It is serialized to JSON and embedded in the analysis prompt; empty
// optional fields are omitted from the output.
type ContainerInfo struct {
	Name        string            `json:"name"`
	Image       string            `json:"image"`
	Ports       []PortInfo        `json:"ports,omitempty"`
	Labels      map[string]string `json:"labels,omitempty"`
	EnvVarNames []string          `json:"env_var_names,omitempty"` // Just names, not values (security); not populated by buildContainerInfo
	Mounts      []string          `json:"mounts,omitempty"`        // Mount destination paths
	Networks    []string          `json:"networks,omitempty"`      // Network names
	Status      string            `json:"status,omitempty"`
	Command     string            `json:"command,omitempty"` // Not populated by buildContainerInfo
}

// PortInfo holds port mapping information for a single container port.
type PortInfo struct {
	HostPort      int    `json:"host_port,omitempty"` // Port published on the host; omitted from JSON when zero
	ContainerPort int    `json:"container_port"`      // Port inside the container
	Protocol      string `json:"protocol,omitempty"`  // Protocol string copied from the container's port entry
}

// DiscoveryResult represents the AI's analysis of a container.
// It mirrors the JSON object the analysis prompt asks the AI to return and
// is cached per image name by the service.
type DiscoveryResult struct {
	ServiceType string  `json:"service_type"` // e.g., "postgres", "pbs", "nginx", "unknown"
	ServiceName string  `json:"service_name"` // Human-readable name
	Category    string  `json:"category"`     // backup, database, web, monitoring, etc.
	CLICommand  string  `json:"cli_command"`  // How to run CLI commands in this container; may contain a {container} placeholder
	Confidence  float64 `json:"confidence"`   // 0-1 confidence score; results below 0.5 are discarded
	Reasoning   string  `json:"reasoning"`    // Why the AI made this determination
}

// Service manages infrastructure discovery.
//
// Two locks are used: mu guards aiAnalyzer, lastRun, running and discoveries,
// while cacheMu guards analysisCache and lastCacheUpdate.
type Service struct {
	// stateProvider supplies the state snapshot scanned on each run.
	stateProvider StateProvider
	// knowledgeStore receives a note per discovered app; may be nil, in which
	// case nothing is persisted.
	knowledgeStore *knowledge.Store
	// aiAnalyzer performs the LLM call; discovery is skipped while it is nil.
	aiAnalyzer AIAnalyzer
	mu         sync.RWMutex
	// lastRun is the completion time of the most recent scan.
	lastRun time.Time
	// interval is the period between background scans.
	interval time.Duration
	// stopCh is closed by Stop to end the background discovery loop.
	stopCh  chan struct{}
	running bool
	// discoveries holds the results of the most recent scan.
	discoveries []DiscoveredApp

	// Cache to avoid re-analyzing the same containers
	// Key: image name, Value: analysis result
	analysisCache map[string]*DiscoveryResult
	cacheMu       sync.RWMutex
	cacheExpiry   time.Duration
	// lastCacheUpdate is a single timestamp shared by every cache entry;
	// cache validity is judged against it rather than per entry.
	lastCacheUpdate time.Time
}

// Config holds discovery service configuration.
// A zero value in either field is replaced with its default by NewService.
type Config struct {
	Interval    time.Duration // How often to run discovery (default: 5 minutes)
	CacheExpiry time.Duration // How long to cache analysis results (default: 1 hour)
}

// DefaultConfig returns the default discovery configuration: a scan every
// five minutes with analysis results cached for one hour.
func DefaultConfig() Config {
	var cfg Config
	cfg.Interval = 5 * time.Minute
	cfg.CacheExpiry = time.Hour
	return cfg
}

// NewService creates a new infrastructure discovery service.
//
// stateProvider supplies the infrastructure snapshot scanned on each run.
// knowledgeStore may be nil, in which case discoveries are not persisted.
// A non-positive cfg.Interval falls back to five minutes (time.NewTicker
// panics on a non-positive duration), and a zero cfg.CacheExpiry falls back
// to one hour. A negative CacheExpiry is kept as-is; it makes every cache
// lookup miss, which effectively disables caching.
//
// The returned service is idle until SetAIAnalyzer and Start are called.
func NewService(stateProvider StateProvider, knowledgeStore *knowledge.Store, cfg Config) *Service {
	if cfg.Interval <= 0 {
		cfg.Interval = 5 * time.Minute
	}
	if cfg.CacheExpiry == 0 {
		cfg.CacheExpiry = 1 * time.Hour
	}

	return &Service{
		stateProvider:  stateProvider,
		knowledgeStore: knowledgeStore,
		interval:       cfg.Interval,
		cacheExpiry:    cfg.CacheExpiry,
		stopCh:         make(chan struct{}),
		discoveries:    make([]DiscoveredApp, 0),
		analysisCache:  make(map[string]*DiscoveryResult),
	}
}

// SetAIAnalyzer installs the AI analyzer used for discovery.
// Scans are skipped while no analyzer is set, so call this before Start()
// if the initial scan should produce results; an analyzer set later is
// picked up by the next scan.
func (s *Service) SetAIAnalyzer(analyzer AIAnalyzer) {
	s.mu.Lock()
	s.aiAnalyzer = analyzer
	s.mu.Unlock()
}

// Start begins the background discovery service.
//
// It launches two goroutines: one performs a discovery scan immediately, the
// other re-runs discovery every s.interval until Stop is called or ctx is
// cancelled. Calling Start while the service is already running is a no-op.
// Every Start installs a fresh stop channel, so the service can be started
// again after a previous Stop.
func (s *Service) Start(ctx context.Context) {
	s.mu.Lock()
	if s.running {
		s.mu.Unlock()
		return
	}
	s.running = true
	// Stop closes the stop channel and a closed channel cannot be reused;
	// without a new one a restarted loop would exit immediately.
	stop := make(chan struct{})
	s.stopCh = stop
	s.mu.Unlock()

	log.Info().
		Dur("interval", s.interval).
		Msg("Starting infrastructure discovery service")

	// Run immediately on startup
	go func() {
		defer func() {
			if r := recover(); r != nil {
				log.Error().
					Interface("panic", r).
					Stack().
					Msg("Recovered from panic in initial infrastructure discovery")
			}
		}()
		s.RunDiscovery(ctx)
	}()

	// Start periodic discovery loop
	go func() {
		defer func() {
			if r := recover(); r != nil {
				log.Error().
					Interface("panic", r).
					Stack().
					Msg("Recovered from panic in infrastructure discovery loop")
			}
		}()
		// Hand the loop the channel created above rather than letting it
		// re-read s.stopCh, so it is bound to this particular run.
		s.runDiscoveryLoop(ctx, stop)
	}()
}

// Stop stops the background discovery service by closing the current stop
// channel. It is a no-op when the service is not running. A scan that is
// already in progress is not interrupted by Stop; only cancelling the context
// passed to Start affects it.
func (s *Service) Stop() {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.running {
		close(s.stopCh)
		s.running = false
	}
}

// discoveryLoop runs periodic discovery until the service's current stop
// channel is closed or ctx is cancelled.
func (s *Service) discoveryLoop(ctx context.Context) {
	// Snapshot the channel under the lock; Start may replace it.
	s.mu.RLock()
	stop := s.stopCh
	s.mu.RUnlock()

	s.runDiscoveryLoop(ctx, stop)
}

// runDiscoveryLoop triggers RunDiscovery on every tick of s.interval and
// returns once stop is closed or ctx is cancelled.
func (s *Service) runDiscoveryLoop(ctx context.Context, stop <-chan struct{}) {
	ticker := time.NewTicker(s.interval)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			s.RunDiscovery(ctx)
		case <-stop:
			log.Info().Msg("Stopping infrastructure discovery service")
			return
		case <-ctx.Done():
			log.Info().Msg("Infrastructure discovery context cancelled")
			return
		}
	}
}

// RunDiscovery performs a discovery scan using AI analysis.
//
// It reads the current state snapshot, analyzes every container of every
// Docker host, persists the identified apps to the knowledge store and
// replaces the stored discovery list with the new results, which are also
// returned.
//
// Special cases:
//   - No analyzer set: nothing is scanned and nil is returned; stored
//     discoveries and the last-run time are left untouched.
//   - No containers present: the stored discoveries are cleared so that apps
//     from removed containers are no longer reported.
//   - ctx cancelled mid-scan: the scan stops early and returns the apps found
//     so far without saving them or replacing the stored discoveries, so a
//     shutdown does not overwrite a complete result set with a partial one.
func (s *Service) RunDiscovery(ctx context.Context) []DiscoveredApp {
	start := time.Now()
	state := s.stateProvider.GetState()

	s.mu.RLock()
	analyzer := s.aiAnalyzer
	s.mu.RUnlock()

	if analyzer == nil {
		log.Debug().Msg("AI analyzer not set, skipping discovery")
		return nil
	}

	// hostContainer pairs a container with the Docker host it runs on.
	type hostContainer struct {
		container models.DockerContainer
		host      models.DockerHost
	}

	// Collect all containers from all Docker hosts
	var targets []hostContainer
	for _, dockerHost := range state.DockerHosts {
		for _, container := range dockerHost.Containers {
			targets = append(targets, hostContainer{container: container, host: dockerHost})
		}
	}

	var apps []DiscoveredApp

	if len(targets) == 0 {
		log.Debug().Msg("No Docker containers found for discovery")
		s.mu.Lock()
		// Drop results from earlier scans; those containers no longer exist.
		s.discoveries = apps
		s.lastRun = time.Now()
		s.mu.Unlock()
		return apps
	}

	// Analyze containers one by one; analyzeContainer consults the cache
	// before calling the AI.
	for _, target := range targets {
		// Once the context is cancelled every remaining AI call would fail,
		// so stop instead of issuing them.
		if ctx != nil && ctx.Err() != nil {
			log.Debug().
				Err(ctx.Err()).
				Int("apps_discovered", len(apps)).
				Msg("Infrastructure discovery cancelled before completion")
			return apps
		}

		if app := s.analyzeContainer(ctx, analyzer, target.container, target.host); app != nil {
			apps = append(apps, *app)
		}
	}

	// Save discoveries to knowledge store
	s.saveDiscoveries(apps)

	// Publish the new results
	s.mu.Lock()
	s.discoveries = apps
	s.lastRun = time.Now()
	s.mu.Unlock()

	log.Info().
		Int("containers_scanned", len(targets)).
		Int("apps_discovered", len(apps)).
		Dur("duration", time.Since(start)).
		Msg("AI infrastructure discovery completed")

	return apps
}

// analyzeContainer uses AI to analyze a single container.
//
// The analysis result is looked up in the per-image cache first; on a miss
// (or when the cache has expired) the container is described to the analyzer
// and the parsed answer is cached under the container's image name.
//
// The cache is generation-based: lastCacheUpdate marks when the current
// generation started, and once cacheExpiry has elapsed the next insert drops
// all entries and starts a new generation. Inserting into a live generation
// does not move the timestamp, so no entry outlives cacheExpiry.
//
// It returns nil when the AI call fails, the response cannot be parsed, the
// service type is empty or "unknown" (case-insensitive), or the confidence is
// below 0.5. Otherwise it returns a DiscoveredApp describing the container.
func (s *Service) analyzeContainer(ctx context.Context, analyzer AIAnalyzer, c models.DockerContainer, host models.DockerHost) *DiscoveredApp {
	// Check cache first
	s.cacheMu.RLock()
	result, found := s.analysisCache[c.Image]
	cacheValid := time.Since(s.lastCacheUpdate) < s.cacheExpiry
	s.cacheMu.RUnlock()

	if found && cacheValid {
		log.Debug().
			Str("container", c.Name).
			Str("image", c.Image).
			Msg("Using cached analysis result")
	} else {
		// Describe the container and ask the AI what it is.
		prompt := s.buildAnalysisPrompt(s.buildContainerInfo(c))

		response, err := analyzer.AnalyzeForDiscovery(ctx, prompt)
		if err != nil {
			log.Warn().
				Err(err).
				Str("container", c.Name).
				Str("image", c.Image).
				Msg("AI analysis failed for container")
			return nil
		}

		result = s.parseAIResponse(response)
		if result == nil {
			log.Warn().
				Str("container", c.Name).
				Str("response", response).
				Msg("Failed to parse AI response")
			return nil
		}

		// Cache the result
		s.cacheMu.Lock()
		now := time.Now()
		if now.Sub(s.lastCacheUpdate) >= s.cacheExpiry {
			// The previous generation has expired (or the cache was
			// cleared): discard stale entries and restart the clock.
			s.analysisCache = make(map[string]*DiscoveryResult)
			s.lastCacheUpdate = now
		}
		s.analysisCache[c.Image] = result
		s.cacheMu.Unlock()

		log.Debug().
			Str("container", c.Name).
			Str("image", c.Image).
			Str("service_type", result.ServiceType).
			Float64("confidence", result.Confidence).
			Msg("AI analyzed container")
	}

	// Skip unknown/low-confidence results. The AI does not always honour the
	// requested lowercase spelling, so compare case-insensitively.
	serviceType := strings.TrimSpace(result.ServiceType)
	if serviceType == "" || strings.EqualFold(serviceType, "unknown") || result.Confidence < 0.5 {
		return nil
	}

	// Build CLI access string
	cliAccess := result.CLICommand
	if cliAccess != "" {
		// Replace placeholder with actual container name
		cliAccess = strings.ReplaceAll(cliAccess, "{container}", c.Name)
		cliAccess = strings.ReplaceAll(cliAccess, "${container}", c.Name)
	}

	// Extract ports, preferring the published port. The same port can be
	// listed more than once (e.g. one entry per bind address), so keep only
	// the first occurrence.
	var ports []int
	seen := make(map[int]struct{}, len(c.Ports))
	for _, p := range c.Ports {
		port := 0
		if p.PublicPort > 0 {
			port = int(p.PublicPort)
		} else if p.PrivatePort > 0 {
			port = int(p.PrivatePort)
		}
		if port == 0 {
			continue
		}
		if _, dup := seen[port]; dup {
			continue
		}
		seen[port] = struct{}{}
		ports = append(ports, port)
	}

	return &DiscoveredApp{
		ID:            fmt.Sprintf("docker:%s:%s", host.Hostname, c.Name),
		Type:          result.ServiceType,
		Name:          result.ServiceName,
		Category:      result.Category,
		RunsIn:        "docker",
		HostID:        host.AgentID,
		Hostname:      host.Hostname,
		ContainerID:   c.ID,
		ContainerName: c.Name,
		Ports:         ports,
		CLIAccess:     cliAccess,
		Confidence:    result.Confidence,
		DetectedAt:    time.Now(),
		AIReasoning:   result.Reasoning,
	}
}

// buildContainerInfo condenses a Docker container into the ContainerInfo
// that is handed to the AI: name, image, status, port mappings, labels,
// mount destinations and network names. Fields with no source data stay at
// their zero value.
func (s *Service) buildContainerInfo(c models.DockerContainer) ContainerInfo {
	var info ContainerInfo
	info.Name = c.Name
	info.Image = c.Image
	info.Status = c.Status

	// Labels are passed through only when there is at least one.
	if len(c.Labels) != 0 {
		info.Labels = c.Labels
	}

	// Port mappings: host (public) port, container (private) port, protocol.
	for _, port := range c.Ports {
		mapping := PortInfo{Protocol: port.Protocol}
		mapping.HostPort = int(port.PublicPort)
		mapping.ContainerPort = int(port.PrivatePort)
		info.Ports = append(info.Ports, mapping)
	}

	// Only the in-container destination of each mount is reported.
	for _, mount := range c.Mounts {
		info.Mounts = append(info.Mounts, mount.Destination)
	}

	// Networks are reported by name.
	for _, network := range c.Networks {
		info.Networks = append(info.Networks, network.Name)
	}

	return info
}

// buildAnalysisPrompt renders the prompt sent to the AI for one container.
// The container details are embedded as indented JSON, followed by the
// questions to answer, the required JSON reply format and guidelines.
func (s *Service) buildAnalysisPrompt(info ContainerInfo) string {
	const promptTemplate = `Analyze this Docker container and identify what service or application it's running.

Container Information:
%s

Based on the image name, ports, labels, environment variables, mounts, and other signals, determine:
1. What service/application is this? (e.g., postgres, redis, nginx, proxmox-backup-server, grafana, etc.)
2. What category does it belong to? (database, cache, web, backup, monitoring, message_queue, storage, etc.)
3. How should CLI commands be executed for this service?

Respond in this exact JSON format:
{
  "service_type": "the_service_type",
  "service_name": "Human Readable Name",
  "category": "category",
  "cli_command": "docker exec {container} <cli-tool>",
  "confidence": 0.95,
  "reasoning": "Brief explanation of why you identified it this way"
}

Important guidelines:
- service_type should be lowercase, no spaces (e.g., "postgres", "redis", "pbs", "nginx")
- For CLI command, use {container} as a placeholder for the container name
- If the service has a CLI tool, include it (e.g., "docker exec {container} psql -U postgres" for PostgreSQL)
- If no CLI is applicable, use empty string for cli_command
- Set confidence between 0 and 1 (1 = certain, 0.5 = guess)
- If you cannot identify the service, use service_type "unknown" with low confidence

Common services to look for:
- Databases: PostgreSQL, MySQL, MariaDB, MongoDB, Redis, Elasticsearch
- Backup: Proxmox Backup Server (PBS), Restic, Borg
- Web: Nginx, Apache, Traefik, Caddy, HAProxy
- Monitoring: Prometheus, Grafana, Loki, Alertmanager
- Message queues: RabbitMQ, Kafka
- Storage: MinIO, Nextcloud
- Home automation: Home Assistant
- Media: Plex, Jellyfin
- CI/CD: Jenkins, Drone, GitLab Runner

Respond with ONLY the JSON, no other text.`

	// The marshal error is deliberately ignored: if encoding fails, the
	// container section of the prompt is simply left empty.
	encoded, _ := json.MarshalIndent(info, "", "  ")

	return fmt.Sprintf(promptTemplate, encoded)
}

// parseAIResponse parses the AI's JSON response.
//
// The response is scanned for the first '{' and exactly one JSON value is
// decoded from that position. Anything before it (prose, a markdown code
// fence opener) and anything after the object's closing brace (a closing
// fence, trailing commentary, further blocks) is ignored, so fenced and
// chatty responses are accepted without special handling.
//
// The decoded result is normalized before it is returned: ServiceType is
// trimmed and lowercased, and Confidence is clamped to the range [0, 1].
//
// It returns nil when the response contains no JSON object or the object
// cannot be decoded into a DiscoveryResult.
func (s *Service) parseAIResponse(response string) *DiscoveryResult {
	start := strings.Index(response, "{")
	if start < 0 {
		log.Debug().
			Str("response", response).
			Msg("Failed to parse AI response as JSON")
		return nil
	}

	// A Decoder stops after the first complete value, which makes trailing
	// text harmless even when it contains braces of its own.
	var result DiscoveryResult
	decoder := json.NewDecoder(strings.NewReader(response[start:]))
	if err := decoder.Decode(&result); err != nil {
		log.Debug().
			Err(err).
			Str("response", response).
			Msg("Failed to parse AI response as JSON")
		return nil
	}

	// The prompt asks for a lowercase type; enforce it so that comparisons
	// such as the "unknown" check behave consistently.
	result.ServiceType = strings.ToLower(strings.TrimSpace(result.ServiceType))

	// Confidence is documented as 0-1; keep out-of-range answers in bounds.
	if result.Confidence < 0 {
		result.Confidence = 0
	} else if result.Confidence > 1 {
		result.Confidence = 1
	}

	return &result
}

// saveDiscoveries writes one infrastructure note per discovered application
// to the knowledge store, filed under the app's host. It does nothing when
// no knowledge store is configured. A failed save is logged and does not
// stop the remaining apps from being saved.
func (s *Service) saveDiscoveries(apps []DiscoveredApp) {
	store := s.knowledgeStore
	if store == nil {
		return
	}

	for _, app := range apps {
		// The note body mentions the CLI command only when one is known.
		var content string
		switch app.CLIAccess {
		case "":
			content = fmt.Sprintf(
				"Detected %s running in %s on %s. No CLI access available.",
				app.Name,
				app.RunsIn,
				app.Hostname,
			)
		default:
			content = fmt.Sprintf(
				"Detected %s running in %s on %s. CLI access: %s",
				app.Name,
				app.RunsIn,
				app.Hostname,
				app.CLIAccess,
			)
		}

		title := fmt.Sprintf("%s (%s)", app.Name, app.RunsIn)

		// Notes are stored against the host's ID.
		saveErr := store.SaveNote(app.HostID, app.Hostname, "host", knowledge.CategoryInfra, title, content)
		if saveErr == nil {
			continue
		}

		log.Warn().
			Err(saveErr).
			Str("app_id", app.ID).
			Str("host", app.Hostname).
			Msg("Failed to save infrastructure discovery to knowledge store")
	}
}

// GetDiscoveries returns the applications found by the most recent scan.
// The returned slice is a copy (never nil), so callers may modify it freely.
func (s *Service) GetDiscoveries() []DiscoveredApp {
	s.mu.RLock()
	defer s.mu.RUnlock()

	return append([]DiscoveredApp{}, s.discoveries...)
}

// GetLastRun reports when the last discovery run finished. The zero time is
// returned if no run has recorded a completion yet.
func (s *Service) GetLastRun() time.Time {
	s.mu.RLock()
	last := s.lastRun
	s.mu.RUnlock()
	return last
}

// ForceRefresh kicks off a discovery scan in a new goroutine and returns
// immediately without waiting for the scan to finish. A panic raised during
// the scan is recovered and logged.
func (s *Service) ForceRefresh(ctx context.Context) {
	scan := func() {
		defer func() {
			recovered := recover()
			if recovered == nil {
				return
			}
			log.Error().
				Interface("panic", recovered).
				Stack().
				Msg("Recovered from panic in ForceRefresh infrastructure discovery")
		}()
		s.RunDiscovery(ctx)
	}

	go scan()
}

// ClearCache discards every cached analysis result and resets the cache
// timestamp, so the next scan re-analyzes all containers.
func (s *Service) ClearCache() {
	s.cacheMu.Lock()
	s.lastCacheUpdate = time.Time{}
	s.analysisCache = make(map[string]*DiscoveryResult)
	s.cacheMu.Unlock()
}

// GetStatus returns a point-in-time summary of the service: whether it is
// running, when discovery last ran, the scan interval, how many apps are
// currently stored, how many analysis results are cached and whether an AI
// analyzer has been set.
func (s *Service) GetStatus() map[string]interface{} {
	s.mu.RLock()
	defer s.mu.RUnlock()

	status := map[string]interface{}{
		"running":         s.running,
		"last_run":        s.lastRun,
		"interval":        s.interval.String(),
		"discovered_apps": len(s.discoveries),
		"ai_analyzer_set": s.aiAnalyzer != nil,
	}

	// The cache has its own lock; hold it only while reading the size.
	s.cacheMu.RLock()
	status["cache_size"] = len(s.analysisCache)
	s.cacheMu.RUnlock()

	return status
}