package metrics

import (
	"fmt"
	"path/filepath"
	"testing"
	"time"

	"github.com/rs/zerolog"
	"github.com/rs/zerolog/log"
)
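
// The benchmarks in this file run with the standard Go tooling, e.g.:
//
//	go test -bench=. -benchmem -run='^$'
//
// from this package's directory. -run='^$' skips the package's unit tests so
// only the benchmarks execute.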

// suppressLogs disables the zerolog global logger for the duration of a
// benchmark/test and restores it on cleanup. This prevents log I/O from
// skewing benchmark results while leaving other tests in the package unaffected.
func suppressLogs(tb testing.TB) {
	tb.Helper()
	orig := log.Logger
	log.Logger = zerolog.Nop()
	tb.Cleanup(func() { log.Logger = orig })
}

// newBenchStore creates an ephemeral metrics store suitable for benchmarks.
// It disables automatic background flushes so callers control timing.
func newBenchStore(b *testing.B) *Store {
	b.Helper()
	dir := b.TempDir()
	cfg := DefaultConfig(dir)
	cfg.DBPath = filepath.Join(dir, "bench.db")
	cfg.FlushInterval = time.Hour // prevent background flushes
	cfg.WriteBufferSize = 10_000  // large buffer so Write() doesn't auto-flush
	store, err := NewStore(cfg)
	if err != nil {
		b.Fatalf("NewStore: %v", err)
	}
	b.Cleanup(func() { store.Close() })
	return store
}

// BenchmarkWriteBatchSync measures raw SQLite insert throughput via the
// synchronous batch-write path (the hot path during metrics recording).
// Batch sizes bracket real-world usage of ~100 metrics per flush.
func BenchmarkWriteBatchSync(b *testing.B) {
	suppressLogs(b)
	for _, batchSize := range []int{10, 100, 500} {
		b.Run(fmt.Sprintf("batch=%d", batchSize), func(b *testing.B) {
			store := newBenchStore(b)
			base := time.Now()

			// Precompute all batches so timestamp construction is outside the timed loop.
			batches := make([][]WriteMetric, b.N)
			for n := range batches {
				batch := make([]WriteMetric, batchSize)
				offset := time.Duration(n*batchSize) * time.Second
				for i := range batch {
					batch[i] = WriteMetric{
						ResourceType: "vm",
						ResourceID:   fmt.Sprintf("vm-%d", i%50),
						MetricType:   "cpu",
						Value:        float64(i % 100),
						Timestamp:    base.Add(offset + time.Duration(i)*time.Second),
						Tier:         TierRaw,
					}
				}
				batches[n] = batch
			}

			b.ResetTimer()
			b.ReportAllocs()
			for i := 0; i < b.N; i++ {
				store.WriteBatchSync(batches[i])
			}
		})
	}
}
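
// Note that b.N counts batches, not individual metrics, so the per-metric
// cost is ns/op divided by the batch size. A single batch size can be
// isolated with the standard sub-benchmark filter, e.g.:
//
//	go test -bench='BenchmarkWriteBatchSync/batch=100' -benchmem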

// BenchmarkQuery measures single-metric query latency over a pre-populated
// dataset of 1000 raw data points — representative of a 2-hour window at
// ~7-second intervals.
func BenchmarkQuery(b *testing.B) {
	suppressLogs(b)
	store := newBenchStore(b)
	base := time.Now().Add(-2 * time.Hour)

	// Seed 1000 raw points for one resource.
	const numPoints = 1000
	batch := make([]WriteMetric, numPoints)
	for i := range batch {
		batch[i] = WriteMetric{
			ResourceType: "vm",
			ResourceID:   "vm-bench",
			MetricType:   "cpu",
			Value:        float64(i % 100),
			Timestamp:    base.Add(time.Duration(i) * 7 * time.Second),
			Tier:         TierRaw,
		}
	}
	store.WriteBatchSync(batch)

	start := base.Add(-time.Second)
	end := base.Add(time.Duration(numPoints) * 7 * time.Second)

	b.Run("no-downsample", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			pts, err := store.Query("vm", "vm-bench", "cpu", start, end, 0)
			if err != nil {
				b.Fatalf("Query: %v", err)
			}
			if len(pts) != numPoints {
				b.Fatalf("expected %d points, got %d", numPoints, len(pts))
			}
		}
	})

	b.Run("downsample-60s", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			_, err := store.Query("vm", "vm-bench", "cpu", start, end, 60)
			if err != nil {
				b.Fatalf("Query: %v", err)
			}
		}
	})
}
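
// For scale: assuming the final Query argument is a bucket width in seconds
// (as the downsample-60s sub-benchmark name suggests), the 60s case collapses
// the ~117 minutes of seeded data into roughly 117 buckets, so it measures
// aggregation cost rather than row-copy cost.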

// BenchmarkQueryAll measures multi-metric query latency. It seeds 4 metric
// types with 500 points each — representative of a dashboard loading all
// metrics for one resource.
func BenchmarkQueryAll(b *testing.B) {
	suppressLogs(b)
	store := newBenchStore(b)
	base := time.Now().Add(-2 * time.Hour)

	metricTypes := []string{"cpu", "memory", "disk_read", "disk_write"}
	const pointsPerMetric = 500

	batch := make([]WriteMetric, 0, len(metricTypes)*pointsPerMetric)
	for _, mt := range metricTypes {
		for i := 0; i < pointsPerMetric; i++ {
			batch = append(batch, WriteMetric{
				ResourceType: "vm",
				ResourceID:   "vm-bench",
				MetricType:   mt,
				Value:        float64(i % 100),
				Timestamp:    base.Add(time.Duration(i) * 14 * time.Second),
				Tier:         TierRaw,
			})
		}
	}
	store.WriteBatchSync(batch)

	start := base.Add(-time.Second)
	end := base.Add(time.Duration(pointsPerMetric) * 14 * time.Second)

	b.Run("4-metrics", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			result, err := store.QueryAll("vm", "vm-bench", start, end, 0)
			if err != nil {
				b.Fatalf("QueryAll: %v", err)
			}
			if len(result) != len(metricTypes) {
				b.Fatalf("expected %d metric types, got %d", len(metricTypes), len(result))
			}
			for _, mt := range metricTypes {
				if len(result[mt]) != pointsPerMetric {
					b.Fatalf("expected %d points for %s, got %d", pointsPerMetric, mt, len(result[mt]))
				}
			}
		}
	})

	b.Run("4-metrics-downsample-60s", func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			_, err := store.QueryAll("vm", "vm-bench", start, end, 60)
			if err != nil {
				b.Fatalf("QueryAll: %v", err)
			}
		}
	})
}

// BenchmarkWriteBuffered measures the in-memory buffer path (Write → buffer
// append). This isolates the mutex-protected hot path that every live metrics
// tick goes through. The buffer capacity is set high enough that writes don't
// trigger flushes, and the buffer is periodically drained (outside timing) to
// bound memory usage.
func BenchmarkWriteBuffered(b *testing.B) {
	suppressLogs(b)

	const drainEvery = 100_000 // drain buffer every 100k writes to bound memory

	dir := b.TempDir()
	cfg := DefaultConfig(dir)
	cfg.DBPath = filepath.Join(dir, "bench.db")
	cfg.FlushInterval = time.Hour
	cfg.WriteBufferSize = drainEvery + 1 // larger than drain interval → no auto-flush
	store, err := NewStore(cfg)
	if err != nil {
		b.Fatalf("NewStore: %v", err)
	}
	b.Cleanup(func() {
		store.bufferMu.Lock()
		store.buffer = store.buffer[:0]
		store.bufferMu.Unlock()
		store.Close()
	})

	base := time.Now()

	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		if i > 0 && i%drainEvery == 0 {
			b.StopTimer()
			store.bufferMu.Lock()
			store.buffer = store.buffer[:0]
			store.bufferMu.Unlock()
			b.StartTimer()
		}
		store.Write("vm", "vm-101", "cpu", 42.0, base.Add(time.Duration(i)*time.Second))
	}
}
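
// The drain above runs under StopTimer/StartTimer so buffer truncation is
// excluded from the measurement. Toggling the timer has its own small cost,
// but at once per 100k iterations it is negligible here.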

// BenchmarkRollupCandidate measures the core rollup aggregation path — an
// INSERT SELECT that reads raw-tier data, aggregates into minute buckets, and
// writes the result. This is the inner loop called once per resource/metric
// pair every 5 minutes. At 500-node scale (4 metrics × ~500 resources), this
// runs ~2000 times per rollup cycle.
//
// After the first iteration, INSERT OR IGNORE detects existing minute-tier rows
// and skips re-insertion. The SELECT + GROUP BY aggregation (the expensive part)
// still executes every iteration, so this accurately measures the scan/aggregation
// cost that dominates rollup latency.
func BenchmarkRollupCandidate(b *testing.B) {
	suppressLogs(b)

	for _, numPoints := range []int{100, 1000} {
		b.Run(fmt.Sprintf("points=%d", numPoints), func(b *testing.B) {
			store := newBenchStore(b)

			// Place all data points well in the past so even the largest
			// sub-benchmark (1000 points at 1-second spacing = ~17 minutes)
			// never extends past "now". 30 minutes of headroom is sufficient.
			// Floor to a minute boundary so startTs alignment doesn't trim
			// leading points, keeping the effective input size deterministic.
			raw := time.Now().Add(-30 * time.Minute).Unix()
			base := time.Unix((raw/60)*60, 0)
			batch := make([]WriteMetric, numPoints)
			for i := range batch {
				batch[i] = WriteMetric{
					ResourceType: "vm",
					ResourceID:   "vm-rollup",
					MetricType:   "cpu",
					Value:        float64(i % 100),
					Timestamp:    base.Add(time.Duration(i) * time.Second),
					Tier:         TierRaw,
				}
			}
			store.WriteBatchSync(batch)

			// Bucket-align boundaries to match production rollupTier behavior.
			// startTs is floored (base is already aligned). endTs is ceiled so
			// all seeded points fall within the [startTs, endTs) window.
			bucketSecs := int64(60)
			startTs := (base.Unix() / bucketSecs) * bucketSecs
			rawEnd := base.Add(time.Duration(numPoints) * time.Second).Unix()
			endTs := ((rawEnd + bucketSecs - 1) / bucketSecs) * bucketSecs
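
			// Worked example of the alignment (illustrative numbers): with
			// base.Unix() = 1_700_000_040 and numPoints = 100, rawEnd is
			// 1_700_000_140 and endTs = ((1_700_000_140+59)/60)*60 =
			// 1_700_000_160, so the last point at 1_700_000_139 still falls
			// inside [startTs, endTs).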

			// Sanity check: verify rollup actually produces data on first call.
			store.rollupCandidate("vm", "vm-rollup", "cpu", TierRaw, TierMinute, bucketSecs, startTs, endTs)
			var minuteCount int
			if err := store.db.QueryRow(
				`SELECT COUNT(*) FROM metrics WHERE tier = 'minute' AND resource_id = 'vm-rollup'`,
			).Scan(&minuteCount); err != nil {
				b.Fatalf("sanity check query: %v", err)
			}
			if minuteCount == 0 {
				b.Fatal("sanity check: rollupCandidate produced no minute-tier rows")
			}

			b.ResetTimer()
			b.ReportAllocs()
			for i := 0; i < b.N; i++ {
				store.rollupCandidate("vm", "vm-rollup", "cpu", TierRaw, TierMinute, bucketSecs, startTs, endTs)
			}
		})
	}
}
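
// For orientation, the statement rollupCandidate measures has roughly this
// shape (a sketch inferred from the comments and sanity query above, not the
// exact production SQL; column names and the AVG aggregate are assumptions):
//
//	INSERT OR IGNORE INTO metrics (resource_type, resource_id, metric_type, tier, timestamp, value)
//	SELECT resource_type, resource_id, metric_type, 'minute',
//	       (timestamp / 60) * 60 AS bucket, AVG(value)
//	FROM metrics
//	WHERE tier = 'raw' AND resource_id = ? AND timestamp >= ? AND timestamp < ?
//	GROUP BY bucket;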

// BenchmarkRollupTierBatched measures the batched rollupTier path that
// aggregates ALL resource/metric combinations in a single INSERT...SELECT
// statement. This is the production path; it contrasts with
// BenchmarkRollupCandidate, which measures per-candidate performance. With
// 50 resources × 2 metrics, the batched approach issues 1 SQL statement
// instead of 100 individual per-candidate transactions. The rollup checkpoint
// is reset between iterations so each iteration performs actual rollup work
// (INSERT OR IGNORE is idempotent).
func BenchmarkRollupTierBatched(b *testing.B) {
	suppressLogs(b)
	store := newBenchStore(b)

	const numResources = 50
	const metricsPerResource = 2
	const pointsPerMetric = 20

	rawBase := time.Now().Add(-30 * time.Minute).Unix()
	base := time.Unix((rawBase/60)*60, 0)

	metricTypes := []string{"cpu", "mem"}
	batch := make([]WriteMetric, 0, numResources*metricsPerResource*pointsPerMetric)
	for r := 0; r < numResources; r++ {
		for _, mt := range metricTypes[:metricsPerResource] {
			for p := 0; p < pointsPerMetric; p++ {
				batch = append(batch, WriteMetric{
					ResourceType: "vm",
					ResourceID:   fmt.Sprintf("vm-%d", r),
					MetricType:   mt,
					Value:        float64((r + p) % 100),
					Timestamp:    base.Add(time.Duration(p) * time.Second),
					Tier:         TierRaw,
				})
			}
		}
	}
	store.WriteBatchSync(batch)

	metaKey := "rollup:raw:minute"

	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		// Reset the rollup checkpoint so each iteration does real work.
		// INSERT OR IGNORE makes re-rollup idempotent.
		_ = store.setMetaInt(metaKey, 0)
		store.rollupTier(TierRaw, TierMinute, time.Minute, 0)
	}
}

// BenchmarkConcurrentReadWrite measures query latency under worst-case write
// contention. A background goroutine continuously writes batches while the
// benchmark loop queries historical data. Since the production Store uses
// MaxOpenConns(1) (SQLite best practice), this measures single-connection
// pool contention — the same bottleneck production hits when live metric
// writes overlap with dashboard chart queries. It catches regressions in
// connection pool fairness, write transaction duration, and query plan
// efficiency under contention.
func BenchmarkConcurrentReadWrite(b *testing.B) {
	suppressLogs(b)
	store := newBenchStore(b)
	base := time.Now().Add(-time.Hour)

	// Seed initial data so queries return results from the start.
	const seedPoints = 500
	seed := make([]WriteMetric, seedPoints)
	for i := range seed {
		seed[i] = WriteMetric{
			ResourceType: "vm",
			ResourceID:   "vm-crw",
			MetricType:   "cpu",
			Value:        float64(i % 100),
			Timestamp:    base.Add(time.Duration(i) * 7 * time.Second),
			Tier:         TierRaw,
		}
	}
	store.WriteBatchSync(seed)

	start := base.Add(-time.Second)
	end := base.Add(time.Duration(seedPoints) * 7 * time.Second)

	// Launch a background writer that continuously appends batches at
	// maximum throughput (worst-case stress, not paced like production 2s
	// ticks). It writes to different resource IDs than the reader queries,
	// mirroring production where ingestion targets many resources while a
	// user views one.
	stop := make(chan struct{})
	writerDone := make(chan struct{})
	started := make(chan struct{})
	go func() {
		defer close(writerDone)
		writeBase := end
		tick := 0
		for {
			select {
			case <-stop:
				return
			default:
			}
			batch := make([]WriteMetric, 10)
			for j := range batch {
				batch[j] = WriteMetric{
					ResourceType: "vm",
					ResourceID:   fmt.Sprintf("vm-crw-live-%d", tick%50),
					MetricType:   "cpu",
					Value:        float64((tick + j) % 100),
					Timestamp:    writeBase.Add(time.Duration(tick*10+j) * 2 * time.Second),
					Tier:         TierRaw,
				}
			}
			store.WriteBatchSync(batch)
			if tick == 0 {
				close(started) // Signal after first write completes.
			}
			tick++
		}
	}()

	// Wait until the writer has completed at least one write so contention
	// is guaranteed when timing begins.
	<-started

	// Clean up the writer on any exit path (including b.Fatalf).
	b.Cleanup(func() {
		close(stop)
		<-writerDone
	})

	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		pts, err := store.Query("vm", "vm-crw", "cpu", start, end, 0)
		if err != nil {
			b.Fatalf("Query: %v", err)
		}
		if len(pts) < seedPoints {
			b.Fatalf("expected at least %d points, got %d", seedPoints, len(pts))
		}
	}
}

// BenchmarkQueryAllBatch measures batch multi-resource query latency —
// the hot path for dashboard chart loading after the N+1 fix. Sub-benchmarks
// vary the resource count to show how batch performance scales with the number
// of resources queried in a single call.
func BenchmarkQueryAllBatch(b *testing.B) {
	suppressLogs(b)
	store := newBenchStore(b)
	base := time.Now().Add(-2 * time.Hour)

	const totalResources = 100
	metricTypes := []string{"cpu", "memory", "disk_read", "disk_write"}
	const pointsPerMetric = 100

	// Seed data for 100 resources × 4 metrics × 100 points.
	batch := make([]WriteMetric, 0, totalResources*len(metricTypes)*pointsPerMetric)
	for r := 0; r < totalResources; r++ {
		for _, mt := range metricTypes {
			for p := 0; p < pointsPerMetric; p++ {
				batch = append(batch, WriteMetric{
					ResourceType: "vm",
					ResourceID:   fmt.Sprintf("vm-%d", r),
					MetricType:   mt,
					Value:        float64((r + p) % 100),
					Timestamp:    base.Add(time.Duration(p) * 72 * time.Second),
					Tier:         TierRaw,
				})
			}
		}
	}
	store.WriteBatchSync(batch)

	start := base.Add(-time.Second)
	end := base.Add(time.Duration(pointsPerMetric) * 72 * time.Second)

	for _, numResources := range []int{1, 10, 50, 100} {
		ids := make([]string, numResources)
		for i := range ids {
			ids[i] = fmt.Sprintf("vm-%d", i)
		}

		b.Run(fmt.Sprintf("%d-resources", numResources), func(b *testing.B) {
			b.ReportAllocs()
			for i := 0; i < b.N; i++ {
				result, err := store.QueryAllBatch("vm", ids, start, end, 0)
				if err != nil {
					b.Fatalf("QueryAllBatch: %v", err)
				}
				if len(result) != numResources {
					b.Fatalf("expected %d resources, got %d", numResources, len(result))
				}
				for _, id := range ids {
					resMetrics := result[id]
					if len(resMetrics) != len(metricTypes) {
						b.Fatalf("resource %s: expected %d metric types, got %d", id, len(metricTypes), len(resMetrics))
					}
					for _, mt := range metricTypes {
						if len(resMetrics[mt]) != pointsPerMetric {
							b.Fatalf("resource %s metric %s: expected %d points, got %d", id, mt, pointsPerMetric, len(resMetrics[mt]))
						}
					}
				}
			}
		})
	}

	b.Run("50-resources-downsample-60s", func(b *testing.B) {
		ids := make([]string, 50)
		for i := range ids {
			ids[i] = fmt.Sprintf("vm-%d", i)
		}

		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			result, err := store.QueryAllBatch("vm", ids, start, end, 60)
			if err != nil {
				b.Fatalf("QueryAllBatch: %v", err)
			}
			if len(result) != len(ids) {
				b.Fatalf("expected %d resources, got %d", len(ids), len(result))
			}
			for _, id := range ids {
				resMetrics := result[id]
				if len(resMetrics) != len(metricTypes) {
					b.Fatalf("resource %s: expected %d metric types, got %d", id, len(metricTypes), len(resMetrics))
				}
			}
		}
	})
}
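
// For contrast, the N+1 shape that QueryAllBatch replaces would issue one
// QueryAll per resource ID (a sketch inferred from the doc comment above,
// not code from this repository):
//
//	for _, id := range ids {
//		_, _ = store.QueryAll("vm", id, start, end, 0) // len(ids) round-trips
//	}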

// BenchmarkQueryManyResources measures query latency when the metrics table
// contains data for many distinct resources — simulating a 100-resource
// deployment where index isolation matters.
func BenchmarkQueryManyResources(b *testing.B) {
	suppressLogs(b)
	store := newBenchStore(b)
	base := time.Now().Add(-time.Hour)

	const numResources = 100
	const pointsPerResource = 20
	batch := make([]WriteMetric, 0, numResources*pointsPerResource)
	for r := 0; r < numResources; r++ {
		for p := 0; p < pointsPerResource; p++ {
			batch = append(batch, WriteMetric{
				ResourceType: "node",
				ResourceID:   fmt.Sprintf("node-%d", r),
				MetricType:   "cpu",
				Value:        float64(p * 5),
				Timestamp:    base.Add(time.Duration(p) * 3 * time.Minute),
				Tier:         TierRaw,
			})
		}
	}
	store.WriteBatchSync(batch)

	// Precompute resource IDs to avoid fmt.Sprintf in the timed loop.
	resourceIDs := make([]string, numResources)
	for r := 0; r < numResources; r++ {
		resourceIDs[r] = fmt.Sprintf("node-%d", r)
	}

	start := base.Add(-time.Second)
	end := base.Add(time.Duration(pointsPerResource) * 3 * time.Minute)

	b.ResetTimer()
	b.ReportAllocs()
	for i := 0; i < b.N; i++ {
		// Query a single resource — the index should isolate it from the 2000 total rows.
		pts, err := store.Query("node", resourceIDs[i%numResources], "cpu", start, end, 0)
		if err != nil {
			b.Fatalf("Query: %v", err)
		}
		if len(pts) != pointsPerResource {
			b.Fatalf("expected %d points, got %d", pointsPerResource, len(pts))
		}
	}
}