mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-05-07 17:19:57 +00:00
311 lines
10 KiB
Go
311 lines
10 KiB
Go
package chat
|
|
|
|
import (
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
)
|
|
|
|
// maxLabelLen is the maximum length, in bytes, of a metric label value.
const maxLabelLen = 64

// sanitizeLabel makes a label value safe to hand to Prometheus:
//   - returns "unknown" for empty values
//   - replaces invalid UTF-8 byte sequences with an underscore (the Prometheus
//     client rejects label values that are not valid UTF-8, and
//     WithLabelValues panics on rejection)
//   - replaces spaces with underscores
//   - truncates to at most maxLabelLen bytes without splitting a multi-byte
//     rune, so the result may be a few bytes shorter than maxLabelLen
func sanitizeLabel(s string) string {
	if s == "" {
		return "unknown"
	}

	s = strings.ToValidUTF8(s, "_")
	s = strings.ReplaceAll(s, " ", "_")

	if len(s) > maxLabelLen {
		cut := maxLabelLen
		// s is valid UTF-8 at this point, so stepping back over continuation
		// bytes (bit pattern 10xxxxxx) lands on the first byte of the rune
		// that straddles the limit; cutting there drops that rune whole.
		for cut > 0 && s[cut]&0xC0 == 0x80 {
			cut--
		}
		s = s[:cut]
	}
	return s
}
|
|
|
|
// AIMetrics manages Prometheus instrumentation for AI chat safety/reliability.
|
|
// These metrics help prove the structural guarantees stay fixed over time.
|
|
type AIMetrics struct {
|
|
// FSM blocks - tracks when workflow gates prevent unsafe actions
|
|
fsmToolBlock *prometheus.CounterVec
|
|
fsmFinalBlock *prometheus.CounterVec
|
|
|
|
// Strict resolution blocks - tracks when undiscovered resources are blocked
|
|
strictResolutionBlock *prometheus.CounterVec
|
|
|
|
// Routing mismatch blocks - tracks when operations target wrong layer
|
|
routingMismatchBlock *prometheus.CounterVec
|
|
|
|
// Phantom detection - tracks hallucinated tool execution claims
|
|
phantomDetected *prometheus.CounterVec
|
|
|
|
// Auto-recovery - tracks self-healing attempts and outcomes
|
|
autoRecoveryAttempt *prometheus.CounterVec
|
|
autoRecoverySuccess *prometheus.CounterVec
|
|
|
|
// Loop health - tracks agentic loop iterations
|
|
agenticIterations *prometheus.CounterVec
|
|
|
|
// Explore pre-pass health
|
|
exploreRuns *prometheus.CounterVec
|
|
exploreDurationSec *prometheus.HistogramVec
|
|
exploreInputTokens *prometheus.CounterVec
|
|
exploreOutputTokens *prometheus.CounterVec
|
|
}
|
|
|
|
var (
|
|
aiMetricsInstance *AIMetrics
|
|
aiMetricsOnce sync.Once
|
|
)
|
|
|
|
// GetAIMetrics returns the singleton AI metrics instance.
|
|
// Call this to record metrics from anywhere in the chat package.
|
|
func GetAIMetrics() *AIMetrics {
|
|
aiMetricsOnce.Do(func() {
|
|
aiMetricsInstance = newAIMetrics()
|
|
})
|
|
return aiMetricsInstance
|
|
}
|
|
|
|
func newAIMetrics() *AIMetrics {
|
|
m := &AIMetrics{
|
|
fsmToolBlock: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "ai",
|
|
Name: "fsm_tool_block_total",
|
|
Help: "Total FSM blocks of tool execution by state, tool, and kind",
|
|
},
|
|
[]string{"state", "tool", "kind"},
|
|
),
|
|
fsmFinalBlock: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "ai",
|
|
Name: "fsm_final_block_total",
|
|
Help: "Total FSM blocks of final answer by state",
|
|
},
|
|
[]string{"state"},
|
|
),
|
|
strictResolutionBlock: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "ai",
|
|
Name: "strict_resolution_block_total",
|
|
Help: "Total strict resolution blocks by tool and action",
|
|
},
|
|
[]string{"tool", "action"},
|
|
),
|
|
routingMismatchBlock: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "ai",
|
|
Name: "routing_mismatch_block_total",
|
|
Help: "Total routing mismatch blocks when targeting parent host instead of child resource",
|
|
},
|
|
[]string{"tool", "target_kind", "child_kind"},
|
|
),
|
|
phantomDetected: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "ai",
|
|
Name: "phantom_detected_total",
|
|
Help: "Total phantom execution detections by provider and model",
|
|
},
|
|
[]string{"provider", "model"},
|
|
),
|
|
autoRecoveryAttempt: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "ai",
|
|
Name: "auto_recovery_attempt_total",
|
|
Help: "Total auto-recovery attempts by error code and tool",
|
|
},
|
|
[]string{"error_code", "tool"},
|
|
),
|
|
autoRecoverySuccess: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "ai",
|
|
Name: "auto_recovery_success_total",
|
|
Help: "Total successful auto-recoveries by error code and tool",
|
|
},
|
|
[]string{"error_code", "tool"},
|
|
),
|
|
agenticIterations: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "ai",
|
|
Name: "agentic_iterations_total",
|
|
Help: "Total agentic loop iterations by provider and model",
|
|
},
|
|
[]string{"provider", "model"},
|
|
),
|
|
exploreRuns: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "ai",
|
|
Name: "explore_runs_total",
|
|
Help: "Total explore pre-pass runs by outcome and model",
|
|
},
|
|
[]string{"outcome", "model"},
|
|
),
|
|
exploreDurationSec: prometheus.NewHistogramVec(
|
|
prometheus.HistogramOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "ai",
|
|
Name: "explore_duration_seconds",
|
|
Help: "Explore pre-pass duration in seconds by outcome and model",
|
|
Buckets: []float64{0.05, 0.1, 0.25, 0.5, 1, 2, 5, 10, 20, 30},
|
|
},
|
|
[]string{"outcome", "model"},
|
|
),
|
|
exploreInputTokens: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "ai",
|
|
Name: "explore_input_tokens_total",
|
|
Help: "Total input tokens consumed by explore pre-pass by model",
|
|
},
|
|
[]string{"model"},
|
|
),
|
|
exploreOutputTokens: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: "pulse",
|
|
Subsystem: "ai",
|
|
Name: "explore_output_tokens_total",
|
|
Help: "Total output tokens consumed by explore pre-pass by model",
|
|
},
|
|
[]string{"model"},
|
|
),
|
|
}
|
|
|
|
// Register all metrics
|
|
prometheus.MustRegister(
|
|
m.fsmToolBlock,
|
|
m.fsmFinalBlock,
|
|
m.strictResolutionBlock,
|
|
m.routingMismatchBlock,
|
|
m.phantomDetected,
|
|
m.autoRecoveryAttempt,
|
|
m.autoRecoverySuccess,
|
|
m.agenticIterations,
|
|
m.exploreRuns,
|
|
m.exploreDurationSec,
|
|
m.exploreInputTokens,
|
|
m.exploreOutputTokens,
|
|
)
|
|
|
|
return m
|
|
}
|
|
|
|
// RecordFSMToolBlock records when FSM blocks a tool execution
|
|
func (m *AIMetrics) RecordFSMToolBlock(state SessionState, tool string, kind ToolKind) {
|
|
m.fsmToolBlock.WithLabelValues(string(state), sanitizeLabel(tool), kind.String()).Inc()
|
|
}
|
|
|
|
// RecordFSMFinalBlock records when FSM blocks a final answer
|
|
func (m *AIMetrics) RecordFSMFinalBlock(state SessionState) {
|
|
m.fsmFinalBlock.WithLabelValues(string(state)).Inc()
|
|
}
|
|
|
|
// RecordStrictResolutionBlock records when strict resolution blocks an action
|
|
// Note: tool should be the function name (e.g., "validateResolvedResource"), not user input
|
|
// Note: action should be a small enum (e.g., "restart", "exec"), not resource IDs
|
|
func (m *AIMetrics) RecordStrictResolutionBlock(tool, action string) {
|
|
m.strictResolutionBlock.WithLabelValues(sanitizeLabel(tool), sanitizeLabel(action)).Inc()
|
|
}
|
|
|
|
// RecordRoutingMismatchBlock records when routing validation blocks an operation
|
|
// that targeted a parent host when the user recently referenced a child resource.
|
|
// Note: use small enums for kinds (node, system-container, vm, app-container), not resource IDs
|
|
func (m *AIMetrics) RecordRoutingMismatchBlock(tool, targetKind, childKind string) {
|
|
m.routingMismatchBlock.WithLabelValues(sanitizeLabel(tool), sanitizeLabel(targetKind), sanitizeLabel(childKind)).Inc()
|
|
}
|
|
|
|
// RecordPhantomDetected records when phantom execution is detected
|
|
func (m *AIMetrics) RecordPhantomDetected(provider, model string) {
|
|
m.phantomDetected.WithLabelValues(sanitizeLabel(provider), sanitizeLabel(model)).Inc()
|
|
}
|
|
|
|
// RecordAutoRecoveryAttempt records an auto-recovery attempt.
|
|
// Definition: "we returned a recoverable error that the model can self-correct"
|
|
func (m *AIMetrics) RecordAutoRecoveryAttempt(errorCode, tool string) {
|
|
m.autoRecoveryAttempt.WithLabelValues(sanitizeLabel(errorCode), sanitizeLabel(tool)).Inc()
|
|
}
|
|
|
|
// RecordAutoRecoverySuccess records a successful auto-recovery.
|
|
// Definition: "a previously blocked operation succeeded on retry after discovery"
|
|
func (m *AIMetrics) RecordAutoRecoverySuccess(errorCode, tool string) {
|
|
m.autoRecoverySuccess.WithLabelValues(sanitizeLabel(errorCode), sanitizeLabel(tool)).Inc()
|
|
}
|
|
|
|
// RecordAgenticIteration records an agentic loop iteration (one LLM call).
|
|
// This counts each turn in the agentic loop, not each tool call.
|
|
func (m *AIMetrics) RecordAgenticIteration(provider, model string) {
|
|
m.agenticIterations.WithLabelValues(sanitizeLabel(provider), sanitizeLabel(model)).Inc()
|
|
}
|
|
|
|
// RecordExploreRun records explore pre-pass execution health.
|
|
func (m *AIMetrics) RecordExploreRun(outcome, model string, duration time.Duration, inputTokens, outputTokens int) {
|
|
modelLabel := sanitizeLabel(model)
|
|
outcomeLabel := sanitizeLabel(outcome)
|
|
m.exploreRuns.WithLabelValues(outcomeLabel, modelLabel).Inc()
|
|
if duration > 0 {
|
|
m.exploreDurationSec.WithLabelValues(outcomeLabel, modelLabel).Observe(duration.Seconds())
|
|
}
|
|
if inputTokens > 0 {
|
|
m.exploreInputTokens.WithLabelValues(modelLabel).Add(float64(inputTokens))
|
|
}
|
|
if outputTokens > 0 {
|
|
m.exploreOutputTokens.WithLabelValues(modelLabel).Add(float64(outputTokens))
|
|
}
|
|
}
|
|
|
|
// AIMetricsTelemetryCallback adapts AIMetrics to the tools.TelemetryCallback interface.
|
|
// This allows the tools package to record telemetry without importing the chat package.
|
|
type AIMetricsTelemetryCallback struct {
|
|
metrics *AIMetrics
|
|
}
|
|
|
|
// NewAIMetricsTelemetryCallback creates a new telemetry callback adapter.
|
|
func NewAIMetricsTelemetryCallback() *AIMetricsTelemetryCallback {
|
|
return &AIMetricsTelemetryCallback{
|
|
metrics: GetAIMetrics(),
|
|
}
|
|
}
|
|
|
|
// RecordStrictResolutionBlock implements tools.TelemetryCallback
|
|
func (c *AIMetricsTelemetryCallback) RecordStrictResolutionBlock(tool, action string) {
|
|
if c.metrics != nil {
|
|
c.metrics.RecordStrictResolutionBlock(tool, action)
|
|
// Strict resolution blocks are recoverable (model can discover then retry)
|
|
c.metrics.RecordAutoRecoveryAttempt("STRICT_RESOLUTION", tool)
|
|
}
|
|
}
|
|
|
|
// RecordAutoRecoveryAttempt implements tools.TelemetryCallback
|
|
func (c *AIMetricsTelemetryCallback) RecordAutoRecoveryAttempt(errorCode, tool string) {
|
|
if c.metrics != nil {
|
|
c.metrics.RecordAutoRecoveryAttempt(errorCode, tool)
|
|
}
|
|
}
|
|
|
|
// RecordAutoRecoverySuccess implements tools.TelemetryCallback
|
|
func (c *AIMetricsTelemetryCallback) RecordAutoRecoverySuccess(errorCode, tool string) {
|
|
if c.metrics != nil {
|
|
c.metrics.RecordAutoRecoverySuccess(errorCode, tool)
|
|
}
|
|
}
|
|
|
|
// RecordRoutingMismatchBlock implements tools.TelemetryCallback
|
|
func (c *AIMetricsTelemetryCallback) RecordRoutingMismatchBlock(tool, targetKind, childKind string) {
|
|
if c.metrics != nil {
|
|
c.metrics.RecordRoutingMismatchBlock(tool, targetKind, childKind)
|
|
// Routing mismatch blocks are recoverable (model can retry with correct target)
|
|
c.metrics.RecordAutoRecoveryAttempt("ROUTING_MISMATCH", tool)
|
|
}
|
|
}
|