mirror of
https://github.com/rcourtman/Pulse.git
synced 2026-04-28 03:20:11 +00:00
Reduce metrics store transaction churn (#1124)
This commit is contained in:
parent
73bebf2f4f
commit
2acf2e9ef9
2 changed files with 74 additions and 5 deletions
|
|
@ -49,8 +49,11 @@ type StoreConfig struct {
|
||||||
// DefaultConfig returns sensible defaults for metrics storage
|
// DefaultConfig returns sensible defaults for metrics storage
|
||||||
func DefaultConfig(dataDir string) StoreConfig {
|
func DefaultConfig(dataDir string) StoreConfig {
|
||||||
return StoreConfig{
|
return StoreConfig{
|
||||||
DBPath: filepath.Join(dataDir, "metrics.db"),
|
DBPath: filepath.Join(dataDir, "metrics.db"),
|
||||||
WriteBufferSize: 100,
|
// Large installs can enqueue hundreds of metric points per poll cycle.
|
||||||
|
// A larger buffer keeps those writes inside a single SQLite transaction
|
||||||
|
// more often, which materially reduces WAL churn on SSD-backed setups.
|
||||||
|
WriteBufferSize: 500,
|
||||||
FlushInterval: 5 * time.Second,
|
FlushInterval: 5 * time.Second,
|
||||||
RetentionRaw: 2 * time.Hour,
|
RetentionRaw: 2 * time.Hour,
|
||||||
RetentionMinute: 24 * time.Hour,
|
RetentionMinute: 24 * time.Hour,
|
||||||
|
|
@ -110,7 +113,9 @@ func NewStore(config StoreConfig) (*Store, error) {
|
||||||
"journal_mode(WAL)",
|
"journal_mode(WAL)",
|
||||||
"synchronous(NORMAL)",
|
"synchronous(NORMAL)",
|
||||||
"auto_vacuum(INCREMENTAL)",
|
"auto_vacuum(INCREMENTAL)",
|
||||||
"wal_autocheckpoint(500)",
|
// Checkpoint less aggressively so high-cardinality installs don't
|
||||||
|
// keep rewriting tiny WAL segments back into the main DB file.
|
||||||
|
"wal_autocheckpoint(4000)",
|
||||||
},
|
},
|
||||||
}.Encode()
|
}.Encode()
|
||||||
db, err := sql.Open("sqlite", dsn)
|
db, err := sql.Open("sqlite", dsn)
|
||||||
|
|
@ -406,6 +411,32 @@ func (s *Store) writeBatch(metrics []bufferedMetric) {
|
||||||
log.Debug().Int("count", len(metrics)).Msg("Wrote metrics batch")
|
log.Debug().Int("count", len(metrics)).Msg("Wrote metrics batch")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// coalesceQueuedBatches drains any already-queued write batches so the worker
|
||||||
|
// can commit them in a single SQLite transaction. This reduces WAL write
|
||||||
|
// amplification when the in-memory buffer flushes multiple times during one
|
||||||
|
// poll cycle.
|
||||||
|
func (s *Store) coalesceQueuedBatches(initial []bufferedMetric) []bufferedMetric {
|
||||||
|
if len(initial) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
combined := initial
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case next, ok := <-s.writeCh:
|
||||||
|
if !ok {
|
||||||
|
return combined
|
||||||
|
}
|
||||||
|
if len(next) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
combined = append(combined, next...)
|
||||||
|
default:
|
||||||
|
return combined
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Query retrieves metrics for a resource within a time range, with optional downsampling
|
// Query retrieves metrics for a resource within a time range, with optional downsampling
|
||||||
func (s *Store) Query(resourceType, resourceID, metricType string, start, end time.Time, stepSecs int64) ([]MetricPoint, error) {
|
func (s *Store) Query(resourceType, resourceID, metricType string, start, end time.Time, stepSecs int64) ([]MetricPoint, error) {
|
||||||
tiers := s.tierFallbacks(end.Sub(start))
|
tiers := s.tierFallbacks(end.Sub(start))
|
||||||
|
|
@ -673,13 +704,20 @@ func (s *Store) backgroundWorker() {
|
||||||
s.Flush()
|
s.Flush()
|
||||||
// Process remaining writes
|
// Process remaining writes
|
||||||
close(s.writeCh)
|
close(s.writeCh)
|
||||||
|
var remaining []bufferedMetric
|
||||||
for batch := range s.writeCh {
|
for batch := range s.writeCh {
|
||||||
s.writeBatch(batch)
|
if len(batch) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
remaining = append(remaining, batch...)
|
||||||
|
}
|
||||||
|
if len(remaining) > 0 {
|
||||||
|
s.writeBatch(remaining)
|
||||||
}
|
}
|
||||||
return
|
return
|
||||||
|
|
||||||
case batch := <-s.writeCh:
|
case batch := <-s.writeCh:
|
||||||
s.writeBatch(batch)
|
s.writeBatch(s.coalesceQueuedBatches(batch))
|
||||||
|
|
||||||
case <-flushTicker.C:
|
case <-flushTicker.C:
|
||||||
s.Flush()
|
s.Flush()
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,37 @@ import (
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
func TestDefaultConfig_UsesLargerWriteBuffer(t *testing.T) {
|
||||||
|
cfg := DefaultConfig("/tmp/pulse")
|
||||||
|
if cfg.WriteBufferSize != 500 {
|
||||||
|
t.Fatalf("WriteBufferSize = %d, want 500", cfg.WriteBufferSize)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestStoreCoalesceQueuedBatches(t *testing.T) {
|
||||||
|
store := &Store{
|
||||||
|
writeCh: make(chan []bufferedMetric, 4),
|
||||||
|
}
|
||||||
|
|
||||||
|
initial := []bufferedMetric{
|
||||||
|
{resourceType: "vm", resourceID: "vm-1", metricType: "cpu", value: 10},
|
||||||
|
}
|
||||||
|
store.writeCh <- []bufferedMetric{
|
||||||
|
{resourceType: "vm", resourceID: "vm-1", metricType: "memory", value: 20},
|
||||||
|
}
|
||||||
|
store.writeCh <- []bufferedMetric{
|
||||||
|
{resourceType: "vm", resourceID: "vm-2", metricType: "cpu", value: 30},
|
||||||
|
}
|
||||||
|
|
||||||
|
combined := store.coalesceQueuedBatches(initial)
|
||||||
|
if len(combined) != 3 {
|
||||||
|
t.Fatalf("expected 3 combined metrics, got %d", len(combined))
|
||||||
|
}
|
||||||
|
if len(store.writeCh) != 0 {
|
||||||
|
t.Fatalf("expected queued batches to be drained, got %d remaining", len(store.writeCh))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestStoreWriteBatchSync(t *testing.T) {
|
func TestStoreWriteBatchSync(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
store, err := NewStore(DefaultConfig(dir))
|
store, err := NewStore(DefaultConfig(dir))
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue