// Pulse/internal/ai/chat/metrics.go

package chat

import (
	"strings"
	"sync"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// maxLabelLen is the maximum length, in bytes, of a sanitized metric label value.
const maxLabelLen = 64

// sanitizeLabel normalizes s so it is safe to use as a Prometheus label value.
//
//   - An empty input yields "unknown".
//   - Spaces are replaced with underscores.
//   - Each run of invalid UTF-8 bytes is replaced with a single underscore.
//   - The result is capped at maxLabelLen bytes; a multi-byte rune that
//     straddles the limit is dropped entirely instead of being cut in half.
//
// The UTF-8 handling matters because the Prometheus client rejects label
// values that are not valid UTF-8, and WithLabelValues panics when a value is
// rejected. A plain byte-wise truncation could therefore turn a long,
// non-ASCII label into a panic.
func sanitizeLabel(s string) string {
	if s == "" {
		return "unknown"
	}
	s = strings.ReplaceAll(s, " ", "_")
	s = strings.ToValidUTF8(s, "_")
	if len(s) > maxLabelLen {
		// s is valid UTF-8 at this point, so the only invalid bytes a
		// byte-wise cut can leave behind are the leading bytes of the rune
		// that crossed the limit; replacing them with "" trims that rune.
		s = strings.ToValidUTF8(s[:maxLabelLen], "")
	}
	return s
}

// AIMetrics bundles the Prometheus collectors that instrument the AI chat
// safety and reliability guards, so that their behavior can be observed
// over time.
type AIMetrics struct {
	// Workflow (FSM) gates: blocked tool executions and blocked final answers.
	fsmToolBlock, fsmFinalBlock *prometheus.CounterVec

	// Strict resolution: actions blocked because the resource was not discovered.
	strictResolutionBlock *prometheus.CounterVec

	// Routing mismatch: operations blocked for targeting the wrong layer.
	routingMismatchBlock *prometheus.CounterVec

	// Phantom detection: hallucinated claims of tool execution.
	phantomDetected *prometheus.CounterVec

	// Auto-recovery: self-healing attempts and their successful outcomes.
	autoRecoveryAttempt, autoRecoverySuccess *prometheus.CounterVec

	// Loop health: iterations of the agentic loop.
	agenticIterations *prometheus.CounterVec

	// Explore pre-pass health: run count, duration, and token usage.
	exploreRuns         *prometheus.CounterVec
	exploreDurationSec  *prometheus.HistogramVec
	exploreInputTokens  *prometheus.CounterVec
	exploreOutputTokens *prometheus.CounterVec
}

// Package-level singleton state: aiMetricsOnce guards the one-time
// construction of aiMetricsInstance.
var (
	aiMetricsInstance *AIMetrics
	aiMetricsOnce     sync.Once
)

// GetAIMetrics returns the shared AIMetrics instance, building it with
// newAIMetrics on the first call. Every later call returns the same pointer.
// Use it to record metrics from anywhere in the chat package.
func GetAIMetrics() *AIMetrics {
	build := func() {
		aiMetricsInstance = newAIMetrics()
	}
	aiMetricsOnce.Do(build)
	return aiMetricsInstance
}

// newAIMetrics constructs every AI chat collector under the "pulse" namespace
// and "ai" subsystem and registers them all with the default Prometheus
// registry. Registration uses MustRegister, so a registration failure (for
// example a duplicate registration) panics.
func newAIMetrics() *AIMetrics {
	const (
		namespace = "pulse"
		subsystem = "ai"
	)

	// counterVec builds a labelled counter that shares the namespace and
	// subsystem above; only the name, help text, and label names vary.
	counterVec := func(name, help string, labels ...string) *prometheus.CounterVec {
		opts := prometheus.CounterOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      name,
			Help:      help,
		}
		return prometheus.NewCounterVec(opts, labels)
	}

	metrics := &AIMetrics{
		fsmToolBlock: counterVec(
			"fsm_tool_block_total",
			"Total FSM blocks of tool execution by state, tool, and kind",
			"state", "tool", "kind",
		),
		fsmFinalBlock: counterVec(
			"fsm_final_block_total",
			"Total FSM blocks of final answer by state",
			"state",
		),
		strictResolutionBlock: counterVec(
			"strict_resolution_block_total",
			"Total strict resolution blocks by tool and action",
			"tool", "action",
		),
		routingMismatchBlock: counterVec(
			"routing_mismatch_block_total",
			"Total routing mismatch blocks when targeting parent host instead of child resource",
			"tool", "target_kind", "child_kind",
		),
		phantomDetected: counterVec(
			"phantom_detected_total",
			"Total phantom execution detections by provider and model",
			"provider", "model",
		),
		autoRecoveryAttempt: counterVec(
			"auto_recovery_attempt_total",
			"Total auto-recovery attempts by error code and tool",
			"error_code", "tool",
		),
		autoRecoverySuccess: counterVec(
			"auto_recovery_success_total",
			"Total successful auto-recoveries by error code and tool",
			"error_code", "tool",
		),
		agenticIterations: counterVec(
			"agentic_iterations_total",
			"Total agentic loop iterations by provider and model",
			"provider", "model",
		),
		exploreRuns: counterVec(
			"explore_runs_total",
			"Total explore pre-pass runs by outcome and model",
			"outcome", "model",
		),
		exploreDurationSec: prometheus.NewHistogramVec(
			prometheus.HistogramOpts{
				Namespace: namespace,
				Subsystem: subsystem,
				Name:      "explore_duration_seconds",
				Help:      "Explore pre-pass duration in seconds by outcome and model",
				Buckets:   []float64{0.05, 0.1, 0.25, 0.5, 1, 2, 5, 10, 20, 30},
			},
			[]string{"outcome", "model"},
		),
		exploreInputTokens: counterVec(
			"explore_input_tokens_total",
			"Total input tokens consumed by explore pre-pass by model",
			"model",
		),
		exploreOutputTokens: counterVec(
			"explore_output_tokens_total",
			"Total output tokens consumed by explore pre-pass by model",
			"model",
		),
	}

	// Register everything in one call, in declaration order.
	collectors := []prometheus.Collector{
		metrics.fsmToolBlock,
		metrics.fsmFinalBlock,
		metrics.strictResolutionBlock,
		metrics.routingMismatchBlock,
		metrics.phantomDetected,
		metrics.autoRecoveryAttempt,
		metrics.autoRecoverySuccess,
		metrics.agenticIterations,
		metrics.exploreRuns,
		metrics.exploreDurationSec,
		metrics.exploreInputTokens,
		metrics.exploreOutputTokens,
	}
	prometheus.MustRegister(collectors...)

	return metrics
}

// RecordFSMToolBlock increments the FSM tool-block counter for the given
// session state, tool, and tool kind. The tool name is passed through
// sanitizeLabel; state and kind are used via their string forms as-is.
func (m *AIMetrics) RecordFSMToolBlock(state SessionState, tool string, kind ToolKind) {
	counter := m.fsmToolBlock.WithLabelValues(string(state), sanitizeLabel(tool), kind.String())
	counter.Inc()
}

// RecordFSMFinalBlock increments the counter of final answers blocked by the
// FSM, labelled with the session state in which the block happened.
func (m *AIMetrics) RecordFSMFinalBlock(state SessionState) {
	stateLabel := string(state)
	m.fsmFinalBlock.WithLabelValues(stateLabel).Inc()
}

// RecordStrictResolutionBlock increments the counter of actions blocked by
// strict resolution. Both labels are passed through sanitizeLabel.
//
// Keep label cardinality low:
//   - tool should be the function name (e.g., "validateResolvedResource"), not user input
//   - action should be a small enum (e.g., "restart", "exec"), not resource IDs
func (m *AIMetrics) RecordStrictResolutionBlock(tool, action string) {
	toolLabel, actionLabel := sanitizeLabel(tool), sanitizeLabel(action)
	m.strictResolutionBlock.WithLabelValues(toolLabel, actionLabel).Inc()
}

// RecordRoutingMismatchBlock increments the counter of operations blocked by
// routing validation, i.e. operations that targeted a parent host although the
// user recently referenced a child resource. All labels are passed through
// sanitizeLabel.
//
// Use small enums for the kinds (node, system-container, vm, app-container),
// not resource IDs.
func (m *AIMetrics) RecordRoutingMismatchBlock(tool, targetKind, childKind string) {
	labels := []string{
		sanitizeLabel(tool),
		sanitizeLabel(targetKind),
		sanitizeLabel(childKind),
	}
	m.routingMismatchBlock.WithLabelValues(labels...).Inc()
}

// RecordPhantomDetected increments the phantom-execution detection counter
// for the given provider and model. Both labels are passed through
// sanitizeLabel.
func (m *AIMetrics) RecordPhantomDetected(provider, model string) {
	providerLabel, modelLabel := sanitizeLabel(provider), sanitizeLabel(model)
	m.phantomDetected.WithLabelValues(providerLabel, modelLabel).Inc()
}

// RecordAutoRecoveryAttempt increments the auto-recovery attempt counter for
// the given error code and tool. Both labels are passed through sanitizeLabel.
//
// An attempt is defined as: "we returned a recoverable error that the model
// can self-correct".
func (m *AIMetrics) RecordAutoRecoveryAttempt(errorCode, tool string) {
	codeLabel, toolLabel := sanitizeLabel(errorCode), sanitizeLabel(tool)
	m.autoRecoveryAttempt.WithLabelValues(codeLabel, toolLabel).Inc()
}

// RecordAutoRecoverySuccess increments the auto-recovery success counter for
// the given error code and tool. Both labels are passed through sanitizeLabel.
//
// A success is defined as: "a previously blocked operation succeeded on retry
// after discovery".
func (m *AIMetrics) RecordAutoRecoverySuccess(errorCode, tool string) {
	codeLabel, toolLabel := sanitizeLabel(errorCode), sanitizeLabel(tool)
	m.autoRecoverySuccess.WithLabelValues(codeLabel, toolLabel).Inc()
}

// RecordAgenticIteration increments the agentic loop iteration counter for the
// given provider and model. One iteration is one LLM call, i.e. one turn of
// the loop, not one tool call. Both labels are passed through sanitizeLabel.
func (m *AIMetrics) RecordAgenticIteration(provider, model string) {
	providerLabel, modelLabel := sanitizeLabel(provider), sanitizeLabel(model)
	m.agenticIterations.WithLabelValues(providerLabel, modelLabel).Inc()
}

// RecordExploreRun records one explore pre-pass execution.
//
// The run counter is always incremented. The duration histogram and the two
// token counters are only updated when the corresponding value is positive,
// so zero or negative values leave them untouched. outcome and model are
// passed through sanitizeLabel before being used as labels.
func (m *AIMetrics) RecordExploreRun(outcome, model string, duration time.Duration, inputTokens, outputTokens int) {
	// From here on, model and outcome hold their sanitized label forms.
	model = sanitizeLabel(model)
	outcome = sanitizeLabel(outcome)

	m.exploreRuns.WithLabelValues(outcome, model).Inc()

	if duration > 0 {
		histogram := m.exploreDurationSec.WithLabelValues(outcome, model)
		histogram.Observe(duration.Seconds())
	}
	if inputTokens > 0 {
		consumed := float64(inputTokens)
		m.exploreInputTokens.WithLabelValues(model).Add(consumed)
	}
	if outputTokens > 0 {
		produced := float64(outputTokens)
		m.exploreOutputTokens.WithLabelValues(model).Add(produced)
	}
}

// AIMetricsTelemetryCallback adapts AIMetrics to the tools.TelemetryCallback
// interface, so the tools package can record telemetry without importing the
// chat package.
type AIMetricsTelemetryCallback struct {
	// metrics is the recorder every callback method forwards to.
	metrics *AIMetrics
}

// NewAIMetricsTelemetryCallback returns a telemetry callback adapter backed by
// the AIMetrics instance that GetAIMetrics returns.
func NewAIMetricsTelemetryCallback() *AIMetricsTelemetryCallback {
	callback := new(AIMetricsTelemetryCallback)
	callback.metrics = GetAIMetrics()
	return callback
}

// RecordStrictResolutionBlock implements tools.TelemetryCallback.
// It records the strict resolution block and, in addition, an auto-recovery
// attempt with error code "STRICT_RESOLUTION" for the same tool. It does
// nothing when the adapter has no metrics instance.
func (c *AIMetricsTelemetryCallback) RecordStrictResolutionBlock(tool, action string) {
	metrics := c.metrics
	if metrics == nil {
		return
	}
	metrics.RecordStrictResolutionBlock(tool, action)
	// Strict resolution blocks are recoverable (model can discover then retry)
	metrics.RecordAutoRecoveryAttempt("STRICT_RESOLUTION", tool)
}

// RecordAutoRecoveryAttempt implements tools.TelemetryCallback by forwarding
// to the underlying AIMetrics. It does nothing when the adapter has no
// metrics instance.
func (c *AIMetricsTelemetryCallback) RecordAutoRecoveryAttempt(errorCode, tool string) {
	metrics := c.metrics
	if metrics == nil {
		return
	}
	metrics.RecordAutoRecoveryAttempt(errorCode, tool)
}

// RecordAutoRecoverySuccess implements tools.TelemetryCallback by forwarding
// to the underlying AIMetrics. It does nothing when the adapter has no
// metrics instance.
func (c *AIMetricsTelemetryCallback) RecordAutoRecoverySuccess(errorCode, tool string) {
	metrics := c.metrics
	if metrics == nil {
		return
	}
	metrics.RecordAutoRecoverySuccess(errorCode, tool)
}

// RecordRoutingMismatchBlock implements tools.TelemetryCallback.
// It records the routing mismatch block and, in addition, an auto-recovery
// attempt with error code "ROUTING_MISMATCH" for the same tool. It does
// nothing when the adapter has no metrics instance.
func (c *AIMetricsTelemetryCallback) RecordRoutingMismatchBlock(tool, targetKind, childKind string) {
	metrics := c.metrics
	if metrics == nil {
		return
	}
	metrics.RecordRoutingMismatchBlock(tool, targetKind, childKind)
	// Routing mismatch blocks are recoverable (model can retry with correct target)
	metrics.RecordAutoRecoveryAttempt("ROUTING_MISMATCH", tool)
}