Reduce metrics store transaction churn (#1124)

2026-04-28 03:20:11 +00:00 · 2026-03-25 12:06:28 +00:00 · 2026-03-25 12:06:28 +00:00 · 2acf2e9ef9
commit 2acf2e9ef9
parent 73bebf2f4f
2 changed files with 74 additions and 5 deletions
--- a/pkg/metrics/store.go
+++ b/pkg/metrics/store.go
@ -49,8 +49,11 @@ type StoreConfig struct {
 // DefaultConfig returns sensible defaults for metrics storage
 func DefaultConfig(dataDir string) StoreConfig {
 	return StoreConfig{
-		DBPath:          filepath.Join(dataDir, "metrics.db"),
+		DBPath: filepath.Join(dataDir, "metrics.db"),
-		WriteBufferSize: 100,
+		// Large installs can enqueue hundreds of metric points per poll cycle.
 		// A larger buffer keeps those writes inside a single SQLite transaction
 		// more often, which materially reduces WAL churn on SSD-backed setups.
 		WriteBufferSize: 500,
 		FlushInterval:   5 * time.Second,
 		RetentionRaw:    2 * time.Hour,
 		RetentionMinute: 24 * time.Hour,
@ -110,7 +113,9 @@ func NewStore(config StoreConfig) (*Store, error) {
 			"journal_mode(WAL)",
 			"synchronous(NORMAL)",
 			"auto_vacuum(INCREMENTAL)",
-			"wal_autocheckpoint(500)",
+			// Checkpoint less aggressively so high-cardinality installs don't
 			// keep rewriting tiny WAL segments back into the main DB file.
 			"wal_autocheckpoint(4000)",
 		},
 	}.Encode()
 	db, err := sql.Open("sqlite", dsn)
@ -406,6 +411,32 @@ func (s *Store) writeBatch(metrics []bufferedMetric) {
 	log.Debug().Int("count", len(metrics)).Msg("Wrote metrics batch")
 }
 // coalesceQueuedBatches drains any already-queued write batches so the worker
 // can commit them in a single SQLite transaction. This reduces WAL write
 // amplification when the in-memory buffer flushes multiple times during one
 // poll cycle.
 func (s *Store) coalesceQueuedBatches(initial []bufferedMetric) []bufferedMetric {
 	if len(initial) == 0 {
 		return nil
 	}
 	combined := initial
 	for {
 		select {
 		case next, ok := <-s.writeCh:
 			if !ok {
 				return combined
 			}
 			if len(next) == 0 {
 				continue
 			}
 			combined = append(combined, next...)
 		default:
 			return combined
 		}
 	}
 }
 // Query retrieves metrics for a resource within a time range, with optional downsampling
 func (s *Store) Query(resourceType, resourceID, metricType string, start, end time.Time, stepSecs int64) ([]MetricPoint, error) {
 	tiers := s.tierFallbacks(end.Sub(start))
@ -673,13 +704,20 @@ func (s *Store) backgroundWorker() {
 			s.Flush()
 			// Process remaining writes
 			close(s.writeCh)
 			var remaining []bufferedMetric
 			for batch := range s.writeCh {
-				s.writeBatch(batch)
+				if len(batch) == 0 {
 					continue
 				}
 				remaining = append(remaining, batch...)
 			}
 			if len(remaining) > 0 {
 				s.writeBatch(remaining)
 			}
 			return
 		case batch := <-s.writeCh:
-			s.writeBatch(batch)
+			s.writeBatch(s.coalesceQueuedBatches(batch))
 		case <-flushTicker.C:
 			s.Flush()
--- a/pkg/metrics/store_additional_test.go
+++ b/pkg/metrics/store_additional_test.go
@ -6,6 +6,37 @@ import (
 	"time"
 )
 func TestDefaultConfig_UsesLargerWriteBuffer(t *testing.T) {
 	cfg := DefaultConfig("/tmp/pulse")
 	if cfg.WriteBufferSize != 500 {
 		t.Fatalf("WriteBufferSize = %d, want 500", cfg.WriteBufferSize)
 	}
 }
 func TestStoreCoalesceQueuedBatches(t *testing.T) {
 	store := &Store{
 		writeCh: make(chan []bufferedMetric, 4),
 	}
 	initial := []bufferedMetric{
 		{resourceType: "vm", resourceID: "vm-1", metricType: "cpu", value: 10},
 	}
 	store.writeCh <- []bufferedMetric{
 		{resourceType: "vm", resourceID: "vm-1", metricType: "memory", value: 20},
 	}
 	store.writeCh <- []bufferedMetric{
 		{resourceType: "vm", resourceID: "vm-2", metricType: "cpu", value: 30},
 	}
 	combined := store.coalesceQueuedBatches(initial)
 	if len(combined) != 3 {
 		t.Fatalf("expected 3 combined metrics, got %d", len(combined))
 	}
 	if len(store.writeCh) != 0 {
 		t.Fatalf("expected queued batches to be drained, got %d remaining", len(store.writeCh))
 	}
 }
 func TestStoreWriteBatchSync(t *testing.T) {
 	dir := t.TempDir()
 	store, err := NewStore(DefaultConfig(dir))