package monitoring

import (
	stdErrors "errors"
	"strings"
	"sync"
	"time"

	"github.com/prometheus/client_golang/prometheus"

	internalerrors "github.com/rcourtman/pulse-go-rewrite/internal/errors"
)

// PollMetrics manages Prometheus instrumentation for polling activity.
//
// All collectors are registered with the default registry exactly once via
// getPollMetrics. Methods are nil-safe: calling them on a nil *PollMetrics
// is a no-op, so callers do not need to guard against a disabled metrics
// pipeline.
type PollMetrics struct {
	// Instance-level polling metrics (labels: instance_type, instance).
	pollDuration *prometheus.HistogramVec
	pollResults  *prometheus.CounterVec
	pollErrors   *prometheus.CounterVec
	lastSuccess  *prometheus.GaugeVec
	staleness    *prometheus.GaugeVec
	queueDepth   prometheus.Gauge
	inflight     *prometheus.GaugeVec

	// Node-level polling metrics (labels: instance_type, instance, node).
	nodePollDuration *prometheus.HistogramVec
	nodePollResults  *prometheus.CounterVec
	nodePollErrors   *prometheus.CounterVec
	nodeLastSuccess  *prometheus.GaugeVec
	nodeStaleness    *prometheus.GaugeVec

	// Scheduler queue and circuit-breaker metrics.
	schedulerQueueReady          prometheus.Gauge
	schedulerQueueDepthByType    *prometheus.GaugeVec
	schedulerQueueWait           *prometheus.HistogramVec
	schedulerDeadLetterDepth     *prometheus.GaugeVec
	schedulerBreakerState        *prometheus.GaugeVec
	schedulerBreakerFailureCount *prometheus.GaugeVec
	schedulerBreakerRetrySeconds *prometheus.GaugeVec

	// mu guards every field below it.
	mu sync.RWMutex
	// lastSuccessByKey remembers the last successful poll per instance so
	// staleness can be computed on later failures.
	lastSuccessByKey map[metricKey]time.Time
	// nodeLastSuccessByKey is the per-node analogue of lastSuccessByKey.
	nodeLastSuccessByKey map[nodeMetricKey]time.Time
	// lastQueueTypeKeys tracks which queue-depth label values were set in
	// the previous snapshot so vanished types can be zeroed.
	lastQueueTypeKeys map[string]struct{}
	// lastDLQKeys tracks which dead-letter label pairs were set in the
	// previous refresh so drained queues can be zeroed.
	lastDLQKeys map[string]struct{}
	// pending is the number of poll tasks still outstanding this cycle.
	pending int
}

// metricKey identifies an instance-level metric series.
type metricKey struct {
	instanceType string
	instance     string
}

// nodeMetricKey identifies a node-level metric series.
type nodeMetricKey struct {
	instanceType string
	instance     string
	node         string
}

var (
	pollMetricsInstance *PollMetrics
	pollMetricsOnce     sync.Once
)

// getPollMetrics returns the process-wide PollMetrics singleton, constructing
// it on first use. Construction must happen exactly once because
// newPollMetrics registers with the default registry (MustRegister panics on
// duplicates); sync.Once provides that guarantee.
func getPollMetrics() *PollMetrics {
	pollMetricsOnce.Do(func() {
		pollMetricsInstance = newPollMetrics()
	})
	return pollMetricsInstance
}

// newPollMetrics builds every collector and registers it with the default
// Prometheus registry. Only reachable through getPollMetrics — calling it
// twice would panic in MustRegister.
func newPollMetrics() *PollMetrics {
	pm := &PollMetrics{
		pollDuration: prometheus.NewHistogramVec(
			prometheus.HistogramOpts{
				Namespace: "pulse",
				Subsystem: "monitor",
				Name:      "poll_duration_seconds",
				Help:      "Duration of polling operations per instance.",
				Buckets:   []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10, 15, 20, 30},
			},
			[]string{"instance_type", "instance"},
		),
		pollResults: prometheus.NewCounterVec(
			prometheus.CounterOpts{
				Namespace: "pulse",
				Subsystem: "monitor",
				Name:      "poll_total",
				Help:      "Total polling attempts partitioned by result.",
			},
			[]string{"instance_type", "instance", "result"},
		),
		pollErrors: prometheus.NewCounterVec(
			prometheus.CounterOpts{
				Namespace: "pulse",
				Subsystem: "monitor",
				Name:      "poll_errors_total",
				Help:      "Polling failures grouped by error type.",
			},
			[]string{"instance_type", "instance", "error_type"},
		),
		lastSuccess: prometheus.NewGaugeVec(
			prometheus.GaugeOpts{
				Namespace: "pulse",
				Subsystem: "monitor",
				Name:      "poll_last_success_timestamp",
				Help:      "Unix timestamp of the last successful poll.",
			},
			[]string{"instance_type", "instance"},
		),
		staleness: prometheus.NewGaugeVec(
			prometheus.GaugeOpts{
				Namespace: "pulse",
				Subsystem: "monitor",
				Name:      "poll_staleness_seconds",
				Help:      "Seconds since the last successful poll. -1 indicates no successes yet.",
			},
			[]string{"instance_type", "instance"},
		),
		queueDepth: prometheus.NewGauge(
			prometheus.GaugeOpts{
				Namespace: "pulse",
				Subsystem: "monitor",
				Name:      "poll_queue_depth",
				Help:      "Approximate number of poll tasks waiting to complete in the current cycle.",
			},
		),
		inflight: prometheus.NewGaugeVec(
			prometheus.GaugeOpts{
				Namespace: "pulse",
				Subsystem: "monitor",
				Name:      "poll_inflight",
				Help:      "Current number of poll operations executing per instance type.",
			},
			[]string{"instance_type"},
		),
		nodePollDuration: prometheus.NewHistogramVec(
			prometheus.HistogramOpts{
				Namespace: "pulse",
				Subsystem: "monitor",
				Name:      "node_poll_duration_seconds",
				Help:      "Duration of polling operations per node.",
				Buckets:   []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10, 15, 20, 30},
			},
			[]string{"instance_type", "instance", "node"},
		),
		nodePollResults: prometheus.NewCounterVec(
			prometheus.CounterOpts{
				Namespace: "pulse",
				Subsystem: "monitor",
				Name:      "node_poll_total",
				Help:      "Total polling attempts per node partitioned by result.",
			},
			[]string{"instance_type", "instance", "node", "result"},
		),
		nodePollErrors: prometheus.NewCounterVec(
			prometheus.CounterOpts{
				Namespace: "pulse",
				Subsystem: "monitor",
				Name:      "node_poll_errors_total",
				Help:      "Polling failures per node grouped by error type.",
			},
			[]string{"instance_type", "instance", "node", "error_type"},
		),
		nodeLastSuccess: prometheus.NewGaugeVec(
			prometheus.GaugeOpts{
				Namespace: "pulse",
				Subsystem: "monitor",
				Name:      "node_poll_last_success_timestamp",
				Help:      "Unix timestamp of the last successful poll for a node.",
			},
			[]string{"instance_type", "instance", "node"},
		),
		nodeStaleness: prometheus.NewGaugeVec(
			prometheus.GaugeOpts{
				Namespace: "pulse",
				Subsystem: "monitor",
				Name:      "node_poll_staleness_seconds",
				Help:      "Seconds since the last successful poll for a node. -1 indicates no successes yet.",
			},
			[]string{"instance_type", "instance", "node"},
		),
		schedulerQueueReady: prometheus.NewGauge(
			prometheus.GaugeOpts{
				Namespace: "pulse",
				Subsystem: "scheduler",
				Name:      "queue_due_soon",
				Help:      "Number of tasks due to run within the immediate window (12s).",
			},
		),
		schedulerQueueDepthByType: prometheus.NewGaugeVec(
			prometheus.GaugeOpts{
				Namespace: "pulse",
				Subsystem: "scheduler",
				Name:      "queue_depth",
				Help:      "Current scheduler queue depth partitioned by instance type.",
			},
			[]string{"instance_type"},
		),
		schedulerQueueWait: prometheus.NewHistogramVec(
			prometheus.HistogramOpts{
				Namespace: "pulse",
				Subsystem: "scheduler",
				Name:      "queue_wait_seconds",
				Help:      "Observed wait time between task readiness and execution.",
				Buckets:   []float64{0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60},
			},
			[]string{"instance_type"},
		),
		schedulerDeadLetterDepth: prometheus.NewGaugeVec(
			prometheus.GaugeOpts{
				Namespace: "pulse",
				Subsystem: "scheduler",
				Name:      "dead_letter_depth",
				Help:      "Number of tasks currently parked in the dead-letter queue per instance.",
			},
			[]string{"instance_type", "instance"},
		),
		schedulerBreakerState: prometheus.NewGaugeVec(
			prometheus.GaugeOpts{
				Namespace: "pulse",
				Subsystem: "scheduler",
				Name:      "breaker_state",
				Help:      "Circuit breaker state encoded as 0=closed, 1=half-open, 2=open, -1=unknown.",
			},
			[]string{"instance_type", "instance"},
		),
		schedulerBreakerFailureCount: prometheus.NewGaugeVec(
			prometheus.GaugeOpts{
				Namespace: "pulse",
				Subsystem: "scheduler",
				Name:      "breaker_failure_count",
				Help:      "Current consecutive failure count tracked by the circuit breaker.",
			},
			[]string{"instance_type", "instance"},
		),
		schedulerBreakerRetrySeconds: prometheus.NewGaugeVec(
			prometheus.GaugeOpts{
				Namespace: "pulse",
				Subsystem: "scheduler",
				Name:      "breaker_retry_seconds",
				Help:      "Seconds until the circuit breaker will allow another attempt.",
			},
			[]string{"instance_type", "instance"},
		),
		lastSuccessByKey:     make(map[metricKey]time.Time),
		nodeLastSuccessByKey: make(map[nodeMetricKey]time.Time),
		lastQueueTypeKeys:    make(map[string]struct{}),
		lastDLQKeys:          make(map[string]struct{}),
	}
	prometheus.MustRegister(
		pm.pollDuration,
		pm.pollResults,
		pm.pollErrors,
		pm.lastSuccess,
		pm.staleness,
		pm.queueDepth,
		pm.inflight,
		pm.nodePollDuration,
		pm.nodePollResults,
		pm.nodePollErrors,
		pm.nodeLastSuccess,
		pm.nodeStaleness,
		pm.schedulerQueueReady,
		pm.schedulerQueueDepthByType,
		pm.schedulerQueueWait,
		pm.schedulerDeadLetterDepth,
		pm.schedulerBreakerState,
		pm.schedulerBreakerFailureCount,
		pm.schedulerBreakerRetrySeconds,
	)
	return pm
}

// NodePollResult captures timing and outcome for a specific node within a poll cycle.
type NodePollResult struct {
	InstanceName string
	InstanceType string
	NodeName     string
	Success      bool
	Error        error // inspected by classifyError when Success is false
	StartTime    time.Time
	EndTime      time.Time
}

// RecordNodeResult records metrics for an individual node poll.
func (pm *PollMetrics) RecordNodeResult(result NodePollResult) { if pm == nil { return } instType, inst := sanitizeInstanceLabels(result.InstanceType, result.InstanceName) nodeLabel := normalizeNodeLabel(result.NodeName) duration := result.EndTime.Sub(result.StartTime).Seconds() if duration < 0 { duration = 0 } pm.nodePollDuration.WithLabelValues(instType, inst, nodeLabel).Observe(duration) resultValue := "success" if !result.Success { resultValue = "error" } pm.nodePollResults.WithLabelValues(instType, inst, nodeLabel, resultValue).Inc() if result.Success { pm.nodeLastSuccess.WithLabelValues(instType, inst, nodeLabel).Set(float64(result.EndTime.Unix())) pm.storeNodeLastSuccess(instType, inst, nodeLabel, result.EndTime) pm.updateNodeStaleness(instType, inst, nodeLabel, 0) return } errType := pm.classifyError(result.Error) pm.nodePollErrors.WithLabelValues(instType, inst, nodeLabel, errType).Inc() if last, ok := pm.lastNodeSuccessFor(instType, inst, nodeLabel); ok && !last.IsZero() { staleness := result.EndTime.Sub(last).Seconds() if staleness < 0 { staleness = 0 } pm.updateNodeStaleness(instType, inst, nodeLabel, staleness) } else { pm.updateNodeStaleness(instType, inst, nodeLabel, -1) } } // RecordQueueWait observes the time a task spent waiting in the scheduler queue. func (pm *PollMetrics) RecordQueueWait(instanceType string, wait time.Duration) { if pm == nil { return } if wait < 0 { wait = 0 } label := normalizeLabel(instanceType) pm.schedulerQueueWait.WithLabelValues(label).Observe(wait.Seconds()) } // UpdateQueueSnapshot updates scheduler queue depth metrics. 
func (pm *PollMetrics) UpdateQueueSnapshot(snapshot QueueSnapshot) { if pm == nil { return } pm.schedulerQueueReady.Set(float64(snapshot.DueWithinSeconds)) current := make(map[string]struct{}, len(snapshot.PerType)) for instanceType, depth := range snapshot.PerType { key := normalizeLabel(instanceType) pm.schedulerQueueDepthByType.WithLabelValues(key).Set(float64(depth)) current[key] = struct{}{} } pm.mu.Lock() for key := range pm.lastQueueTypeKeys { if _, ok := current[key]; !ok { pm.schedulerQueueDepthByType.WithLabelValues(key).Set(0) } } pm.lastQueueTypeKeys = current pm.mu.Unlock() } // UpdateDeadLetterCounts refreshes dead-letter queue gauges based on the provided tasks. func (pm *PollMetrics) UpdateDeadLetterCounts(tasks []DeadLetterTask) { if pm == nil { return } current := make(map[string]float64) for _, task := range tasks { instType := normalizeLabel(task.Type) inst := normalizeLabel(task.Instance) key := instType + "::" + inst current[key] = current[key] + 1 } pm.mu.Lock() prev := pm.lastDLQKeys pm.lastDLQKeys = make(map[string]struct{}, len(current)) pm.mu.Unlock() for key, count := range current { instType, inst := splitInstanceKey(key) pm.schedulerDeadLetterDepth.WithLabelValues(instType, inst).Set(count) } pm.mu.Lock() for key := range current { pm.lastDLQKeys[key] = struct{}{} } for key := range prev { if _, ok := current[key]; !ok { instType, inst := splitInstanceKey(key) pm.schedulerDeadLetterDepth.WithLabelValues(instType, inst).Set(0) } } pm.mu.Unlock() } // SetBreakerState updates circuit breaker metrics for a specific instance. 
func (pm *PollMetrics) SetBreakerState(instanceType, instance, state string, failures int, retryAt time.Time) { if pm == nil { return } instType, inst := sanitizeInstanceLabels(instanceType, instance) value := breakerStateToValue(state) pm.schedulerBreakerState.WithLabelValues(instType, inst).Set(value) pm.schedulerBreakerFailureCount.WithLabelValues(instType, inst).Set(float64(failures)) retrySeconds := 0.0 if !retryAt.IsZero() { retrySeconds = time.Until(retryAt).Seconds() if retrySeconds < 0 { retrySeconds = 0 } } pm.schedulerBreakerRetrySeconds.WithLabelValues(instType, inst).Set(retrySeconds) } // RecordResult records metrics for a polling result. func (pm *PollMetrics) RecordResult(result PollResult) { if pm == nil { return } instType, inst := sanitizeInstanceLabels(result.InstanceType, result.InstanceName) duration := result.EndTime.Sub(result.StartTime).Seconds() if duration < 0 { duration = 0 } pm.pollDuration.WithLabelValues(instType, inst).Observe(duration) resultValue := "success" if !result.Success { resultValue = "error" } pm.pollResults.WithLabelValues(instType, inst, resultValue).Inc() if result.Success { pm.lastSuccess.WithLabelValues(instType, inst).Set(float64(result.EndTime.Unix())) pm.storeLastSuccess(instType, inst, result.EndTime) pm.updateStaleness(instType, inst, 0) } else { errType := pm.classifyError(result.Error) pm.pollErrors.WithLabelValues(instType, inst, errType).Inc() if last, ok := pm.lastSuccessFor(instType, inst); ok && !last.IsZero() { staleness := result.EndTime.Sub(last).Seconds() if staleness < 0 { staleness = 0 } pm.updateStaleness(instType, inst, staleness) } else { pm.updateStaleness(instType, inst, -1) } } pm.decrementPending() } // ResetQueueDepth sets the pending queue depth for the next polling cycle. 
func (pm *PollMetrics) ResetQueueDepth(total int) { if pm == nil { return } if total < 0 { total = 0 } pm.mu.Lock() pm.pending = total pm.mu.Unlock() pm.queueDepth.Set(float64(total)) } // SetQueueDepth allows direct gauge control when needed. func (pm *PollMetrics) SetQueueDepth(depth int) { if pm == nil { return } if depth < 0 { depth = 0 } pm.queueDepth.Set(float64(depth)) } // IncInFlight increments the in-flight gauge for the given instance type. func (pm *PollMetrics) IncInFlight(instanceType string) { if pm == nil { return } pm.inflight.WithLabelValues(instanceType).Inc() } // DecInFlight decrements the in-flight gauge for the given instance type. func (pm *PollMetrics) DecInFlight(instanceType string) { if pm == nil { return } pm.inflight.WithLabelValues(instanceType).Dec() } func (pm *PollMetrics) decrementPending() { if pm == nil { return } pm.mu.Lock() if pm.pending > 0 { pm.pending-- } current := pm.pending pm.mu.Unlock() pm.queueDepth.Set(float64(current)) } func (pm *PollMetrics) storeLastSuccess(instanceType, instance string, ts time.Time) { pm.mu.Lock() pm.lastSuccessByKey[makeMetricKey(instanceType, instance)] = ts pm.mu.Unlock() } func (pm *PollMetrics) lastSuccessFor(instanceType, instance string) (time.Time, bool) { pm.mu.RLock() ts, ok := pm.lastSuccessByKey[makeMetricKey(instanceType, instance)] pm.mu.RUnlock() return ts, ok } func (pm *PollMetrics) updateStaleness(instanceType, instance string, value float64) { instType, inst := sanitizeInstanceLabels(instanceType, instance) pm.staleness.WithLabelValues(instType, inst).Set(value) } func (pm *PollMetrics) storeNodeLastSuccess(instanceType, instance, node string, ts time.Time) { pm.mu.Lock() pm.nodeLastSuccessByKey[makeNodeMetricKey(instanceType, instance, node)] = ts pm.mu.Unlock() } func (pm *PollMetrics) lastNodeSuccessFor(instanceType, instance, node string) (time.Time, bool) { pm.mu.RLock() ts, ok := pm.nodeLastSuccessByKey[makeNodeMetricKey(instanceType, instance, node)] pm.mu.RUnlock() 
return ts, ok } func (pm *PollMetrics) updateNodeStaleness(instanceType, instance, node string, value float64) { instType, inst := sanitizeInstanceLabels(instanceType, instance) nodeLabel := normalizeLabel(node) pm.nodeStaleness.WithLabelValues(instType, inst, nodeLabel).Set(value) } func splitInstanceKey(key string) (string, string) { parts := strings.SplitN(key, "::", 2) if len(parts) == 2 { return normalizeLabel(parts[0]), normalizeLabel(parts[1]) } if key == "" { return "unknown", "unknown" } return "unknown", normalizeLabel(key) } func breakerStateToValue(state string) float64 { switch strings.ToLower(state) { case "closed": return 0 case "half_open", "half-open": return 1 case "open": return 2 default: return -1 } } func sanitizeInstanceLabels(instanceType, instance string) (string, string) { return normalizeLabel(instanceType), normalizeLabel(instance) } func makeMetricKey(instanceType, instance string) metricKey { instType, inst := sanitizeInstanceLabels(instanceType, instance) return metricKey{ instanceType: instType, instance: inst, } } func makeNodeMetricKey(instanceType, instance, node string) nodeMetricKey { instType, inst := sanitizeInstanceLabels(instanceType, instance) return nodeMetricKey{ instanceType: instType, instance: inst, node: normalizeLabel(node), } } func normalizeLabel(value string) string { v := strings.TrimSpace(value) if v == "" { return "unknown" } return v } func normalizeNodeLabel(value string) string { label := normalizeLabel(value) if label == "unknown" { return "unknown-node" } return label } func (pm *PollMetrics) classifyError(err error) string { if err == nil { return "none" } var monitorErr *internalerrors.MonitorError if stdErrors.As(err, &monitorErr) { return string(monitorErr.Type) } return "unknown" }