From 7949923e3bb4a00ca0efbf263e4b249b2c249505 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 26 Dec 2025 19:08:38 +0000 Subject: [PATCH] feat(mincut-transformer): Add comprehensive criterion benchmarks Add kernel benchmark suite (benches/kernel.rs) covering: - INT8 GEMM scalar vs SIMD comparison (64x64, 128x128, 256x256) - INT8 GEMV matrix-vector multiplication - INT4 quantization pack/unpack operations - INT4 weights creation and memory comparison - INT4 GEMV and GEMM operations - Layer norm and RMS norm comparison - Arena allocator creation and allocation patterns - Benchmark utilities (Timer, BenchStats, compute_gflops) - Full transformer layer simulation (QKV projection, FFN forward) Update Cargo.toml with kernel benchmark target. Existing benchmarks (latency.rs, gate.rs) remain for: - Inference latency across all 4 tiers - Gate evaluation overhead and policies - Spike scheduler and drop ratio calculations --- .../Cargo.toml | 4 + .../benches/kernel.rs | 527 ++++++++++++++++++ 2 files changed, 531 insertions(+) create mode 100644 crates/ruvector-mincut-gated-transformer/benches/kernel.rs diff --git a/crates/ruvector-mincut-gated-transformer/Cargo.toml b/crates/ruvector-mincut-gated-transformer/Cargo.toml index fa29bbbc7..c36be0adf 100644 --- a/crates/ruvector-mincut-gated-transformer/Cargo.toml +++ b/crates/ruvector-mincut-gated-transformer/Cargo.toml @@ -64,6 +64,10 @@ harness = false name = "gate" harness = false +[[bench]] +name = "kernel" +harness = false + [[example]] name = "scorer" path = "examples/scorer.rs" diff --git a/crates/ruvector-mincut-gated-transformer/benches/kernel.rs b/crates/ruvector-mincut-gated-transformer/benches/kernel.rs new file mode 100644 index 000000000..7d9bb5445 --- /dev/null +++ b/crates/ruvector-mincut-gated-transformer/benches/kernel.rs @@ -0,0 +1,527 @@ +//! Kernel benchmarks for low-level operations. +//! +//! Tests GEMM, INT4 quantization, arena allocation, and SIMD operations. + +use criterion::{black_box, criterion_group, criterion_main, Criterion, BenchmarkId, Throughput}; +use ruvector_mincut_gated_transformer::kernel::{ + qgemm_i8, qgemm_i8_simd, + layer_norm, rms_norm, + Timer, BenchStats, compute_gflops, + pack_int4, unpack_int4, quantize_f32_to_int4, dequantize_int4_to_f32, + Int4Weights, int4_gemv, int4_gemm, +}; +use ruvector_mincut_gated_transformer::arena::{WeightArena, calculate_arena_size}; + +// ============================================================================ +// INT8 GEMM Benchmarks +// ============================================================================ + +fn bench_qgemm_i8_sizes(c: &mut Criterion) { + let mut group = c.benchmark_group("qgemm_i8"); + + for size in [64, 128, 256].iter() { + let m = *size; + let n = *size; + let k = *size; + + let a: Vec = (0..m * k).map(|i| ((i as i16 % 256 - 128) as i8)).collect(); + let b: Vec = (0..n * k).map(|i| ((i as i16 % 256 - 128) as i8)).collect(); + let b_scales: Vec = vec![1.0 / 128.0; n]; + + let ops = 2 * m * n * k; + group.throughput(Throughput::Elements(ops as u64)); + + group.bench_with_input( + BenchmarkId::new("scalar", size), + size, + |bench, _| { + let mut c_out = vec![0i32; m * n]; + bench.iter(|| { + qgemm_i8(m, n, k, &a, 1.0 / 128.0, &b, &b_scales, None, &mut c_out); + black_box(c_out[0]) + }) + }, + ); + + group.bench_with_input( + BenchmarkId::new("simd", size), + size, + |bench, _| { + let mut c_out = vec![0i32; m * n]; + bench.iter(|| { + qgemm_i8_simd(m, n, k, &a, 1.0 / 128.0, &b, &b_scales, None, &mut c_out); + black_box(c_out[0]) + }) + }, + ); + } + + group.finish(); +} + +fn bench_qgemv(c: &mut Criterion) { + let mut group = c.benchmark_group("qgemv"); + + for size in [128, 256, 512].iter() { + let n = *size; + let k = *size; + + let a: Vec = (0..k).map(|i| ((i as i16 % 256 - 128) as i8)).collect(); + let b: Vec = (0..n * k).map(|i| ((i as i16 % 256 - 128) as i8)).collect(); + let b_scales: Vec = vec![1.0 / 128.0; n]; + + group.throughput(Throughput::Elements((2 * n * k) as u64)); + + group.bench_with_input( + BenchmarkId::from_parameter(size), + size, + |bench, _| { + let mut c_out = vec![0i32; n]; + bench.iter(|| { + qgemm_i8_simd(1, n, k, &a, 1.0 / 128.0, &b, &b_scales, None, &mut c_out); + black_box(c_out[0]) + }) + }, + ); + } + + group.finish(); +} + +// ============================================================================ +// INT4 Quantization Benchmarks +// ============================================================================ + +fn bench_int4_pack_unpack(c: &mut Criterion) { + let mut group = c.benchmark_group("int4_pack_unpack"); + + group.bench_function("pack_single", |b| { + b.iter(|| { + let packed = pack_int4(black_box(5), black_box(-3)); + black_box(packed) + }) + }); + + group.bench_function("unpack_single", |b| { + let packed = pack_int4(5, -3); + b.iter(|| { + let (v0, v1) = unpack_int4(black_box(packed)); + black_box((v0, v1)) + }) + }); + + // Bulk operations + for count in [256, 1024, 4096].iter() { + let values: Vec = (0..*count).map(|i| (i as f32 - *count as f32 / 2.0) / 100.0).collect(); + group.throughput(Throughput::Elements(*count as u64)); + + group.bench_with_input( + BenchmarkId::new("quantize", count), + count, + |bench, cnt| { + let mut packed = vec![0u8; (*cnt + 1) / 2]; + bench.iter(|| { + let scale = quantize_f32_to_int4(&values, &mut packed); + black_box(scale) + }) + }, + ); + + group.bench_with_input( + BenchmarkId::new("dequantize", count), + count, + |bench, cnt| { + let mut packed = vec![0u8; (*cnt + 1) / 2]; + let scale = quantize_f32_to_int4(&values, &mut packed); + let mut output = vec![0.0f32; *cnt]; + bench.iter(|| { + dequantize_int4_to_f32(&packed, scale, *cnt, &mut output); + black_box(output[0]) + }) + }, + ); + } + + group.finish(); +} + +fn bench_int4_weights(c: &mut Criterion) { + let mut group = c.benchmark_group("int4_weights"); + + for (rows, cols) in [(256, 256), (512, 512), (768, 768)].iter() { + let weights: Vec = (0..rows * cols) + .map(|i| ((i % 200) as f32 - 100.0) / 100.0) + .collect(); + + group.throughput(Throughput::Bytes((*rows * *cols * 4) as u64)); + + group.bench_with_input( + BenchmarkId::new("from_f32", format!("{}x{}", rows, cols)), + &(*rows, *cols), + |bench, (r, c)| { + bench.iter(|| { + let int4_w = Int4Weights::from_f32(&weights, *r, *c); + black_box(int4_w.memory_bytes()) + }) + }, + ); + } + + group.finish(); +} + +fn bench_int4_gemv(c: &mut Criterion) { + let mut group = c.benchmark_group("int4_gemv"); + + for size in [256, 512, 768].iter() { + let n = *size; + let k = *size; + + let weights: Vec = (0..n * k) + .map(|i| ((i % 200) as f32 - 100.0) / 100.0) + .collect(); + let int4_w = Int4Weights::from_f32(&weights, n, k); + let x: Vec = (0..k).map(|i| (i as f32) / k as f32).collect(); + + let ops = 2 * n * k; + group.throughput(Throughput::Elements(ops as u64)); + + group.bench_with_input( + BenchmarkId::from_parameter(size), + size, + |bench, sz| { + let mut y = vec![0.0f32; *sz]; + bench.iter(|| { + int4_gemv(&int4_w, &x, 1.0, &mut y); + black_box(y[0]) + }) + }, + ); + } + + group.finish(); +} + +fn bench_int4_gemm(c: &mut Criterion) { + let mut group = c.benchmark_group("int4_gemm"); + + for (m, n, k) in [(32, 256, 256), (64, 512, 512)].iter() { + let weights: Vec = (0..n * k) + .map(|i| ((i % 200) as f32 - 100.0) / 100.0) + .collect(); + let int4_w = Int4Weights::from_f32(&weights, *n, *k); + let a: Vec = (0..m * k).map(|i| (i as f32) / (m * k) as f32).collect(); + + let ops = 2 * m * n * k; + group.throughput(Throughput::Elements(ops as u64)); + + group.bench_with_input( + BenchmarkId::from_parameter(format!("{}x{}x{}", m, n, k)), + &(*m, *n, *k), + |bench, (batch, nn, _)| { + let mut c_out = vec![0.0f32; *batch * *nn]; + bench.iter(|| { + int4_gemm(&int4_w, &a, 1.0, *batch, &mut c_out); + black_box(c_out[0]) + }) + }, + ); + } + + group.finish(); +} + +fn bench_int4_memory_comparison(c: &mut Criterion) { + let mut group = c.benchmark_group("int4_vs_int8_memory"); + + for size in [256, 512].iter() { + let n = *size; + let k = *size; + let total_weights = n * k; + + // INT8 baseline + let weights_i8: Vec = (0..total_weights).map(|i| (i as i16 % 256 - 128) as i8).collect(); + let b_scales: Vec = vec![1.0 / 128.0; n]; + let x_i8: Vec = (0..k).map(|i| (i as i16 % 256 - 128) as i8).collect(); + + // INT4 + let weights_f32: Vec = (0..total_weights) + .map(|i| ((i % 200) as f32 - 100.0) / 100.0) + .collect(); + let int4_w = Int4Weights::from_f32(&weights_f32, n, k); + let x_f32: Vec = (0..k).map(|i| i as f32 / k as f32).collect(); + + group.bench_with_input( + BenchmarkId::new("int8_gemv", size), + size, + |bench, sz| { + let mut y_i8 = vec![0i32; *sz]; + bench.iter(|| { + qgemm_i8_simd(1, n, k, &x_i8, 1.0 / 128.0, &weights_i8, &b_scales, None, &mut y_i8); + black_box(y_i8[0]) + }) + }, + ); + + group.bench_with_input( + BenchmarkId::new("int4_gemv", size), + size, + |bench, sz| { + let mut y_f32 = vec![0.0f32; *sz]; + bench.iter(|| { + int4_gemv(&int4_w, &x_f32, 1.0, &mut y_f32); + black_box(y_f32[0]) + }) + }, + ); + } + + group.finish(); +} + +// ============================================================================ +// Normalization Benchmarks +// ============================================================================ + +fn bench_layer_norm(c: &mut Criterion) { + let mut group = c.benchmark_group("layer_norm"); + + for size in [128, 256, 512, 768].iter() { + let input: Vec = (0..*size).map(|i| (i as f32 - *size as f32 / 2.0) / 100.0).collect(); + let gamma: Vec = vec![1.0f32; *size]; + let beta: Vec = vec![0.0f32; *size]; + + group.throughput(Throughput::Elements(*size as u64)); + + group.bench_with_input( + BenchmarkId::new("layer_norm", size), + size, + |bench, sz| { + let mut output = vec![0.0f32; *sz]; + bench.iter(|| { + layer_norm(&input, &gamma, &beta, 1e-5, &mut output); + black_box(output[0]) + }) + }, + ); + + group.bench_with_input( + BenchmarkId::new("rms_norm", size), + size, + |bench, sz| { + let mut output = vec![0.0f32; *sz]; + bench.iter(|| { + rms_norm(&input, &gamma, 1e-5, &mut output); + black_box(output[0]) + }) + }, + ); + } + + group.finish(); +} + +// ============================================================================ +// Arena Allocator Benchmarks +// ============================================================================ + +fn bench_arena_allocation(c: &mut Criterion) { + let mut group = c.benchmark_group("arena_alloc"); + + for size_kb in [64, 256, 1024].iter() { + let size = size_kb * 1024; + + group.bench_with_input( + BenchmarkId::new("create", format!("{}KB", size_kb)), + &size, + |bench, sz| { + bench.iter(|| { + let arena = WeightArena::new(black_box(*sz)); + black_box(arena.capacity()) + }) + }, + ); + } + + // Allocation patterns + let arena_size = 1024 * 1024; // 1MB + + group.bench_function("alloc_i8_1024", |b| { + b.iter(|| { + let mut arena = WeightArena::new(arena_size); + for _ in 0..100 { + let _ = arena.alloc_i8(black_box(1024)); + } + black_box(arena.offset()) + }) + }); + + group.bench_function("alloc_f32_256", |b| { + b.iter(|| { + let mut arena = WeightArena::new(arena_size); + for _ in 0..100 { + let _ = arena.alloc_f32(black_box(256)); + } + black_box(arena.offset()) + }) + }); + + group.bench_function("alloc_mixed", |b| { + b.iter(|| { + let mut arena = WeightArena::new(arena_size); + for i in 0..50 { + if i % 2 == 0 { + let _ = arena.alloc_i8(black_box(1024)); + } else { + let _ = arena.alloc_f32(black_box(256)); + } + } + black_box(arena.offset()) + }) + }); + + group.bench_function("reset_reuse", |b| { + let mut arena = WeightArena::new(arena_size); + b.iter(|| { + arena.reset(); + for _ in 0..100 { + let _ = arena.alloc_i8(black_box(1024)); + } + black_box(arena.offset()) + }) + }); + + group.finish(); +} + +fn bench_arena_size_calculation(c: &mut Criterion) { + let mut group = c.benchmark_group("arena_size_calc"); + + for (layers, hidden) in [(4, 256), (12, 768), (24, 1024)].iter() { + group.bench_with_input( + BenchmarkId::from_parameter(format!("{}L_{}H", layers, hidden)), + &(*layers, *hidden), + |bench, (l, h)| { + bench.iter(|| { + let size = calculate_arena_size(black_box(*l), black_box(*h), 4, 8); + black_box(size) + }) + }, + ); + } + + group.finish(); +} + +// ============================================================================ +// Timer/Stats Utilities Benchmarks +// ============================================================================ + +fn bench_timer_overhead(c: &mut Criterion) { + let mut group = c.benchmark_group("timer_overhead"); + + group.bench_function("timer_start_stop", |b| { + b.iter(|| { + let mut timer = Timer::new(); + timer.start(); + timer.stop(); + black_box(timer.elapsed_ns()) + }) + }); + + group.bench_function("bench_stats_record", |b| { + let mut stats = BenchStats::new(1000); + b.iter(|| { + stats.add_sample(black_box(100)); + black_box(stats.min_ns()) + }) + }); + + group.bench_function("compute_gflops", |b| { + b.iter(|| { + let gflops = compute_gflops(black_box(2_000_000_000), black_box(1_000_000)); + black_box(gflops) + }) + }); + + group.finish(); +} + +// ============================================================================ +// Combined Workload Benchmarks +// ============================================================================ + +fn bench_transformer_layer_simulation(c: &mut Criterion) { + let mut group = c.benchmark_group("layer_simulation"); + + let hidden = 256; + let ffn_hidden = hidden * 4; + + let q_weights: Vec = (0..hidden * hidden).map(|i| ((i as i16 % 256 - 128) as i8)).collect(); + let k_weights: Vec = (0..hidden * hidden).map(|i| ((i as i16 % 256 - 128) as i8)).collect(); + let v_weights: Vec = (0..hidden * hidden).map(|i| ((i as i16 % 256 - 128) as i8)).collect(); + let ffn_up: Vec = (0..hidden * ffn_hidden).map(|i| ((i as i16 % 256 - 128) as i8)).collect(); + let ffn_down: Vec = (0..ffn_hidden * hidden).map(|i| ((i as i16 % 256 - 128) as i8)).collect(); + + let q_scales: Vec = vec![1.0 / 128.0; hidden]; + let k_scales: Vec = vec![1.0 / 128.0; hidden]; + let v_scales: Vec = vec![1.0 / 128.0; hidden]; + let ffn_up_scales: Vec = vec![1.0 / 128.0; ffn_hidden]; + let ffn_down_scales: Vec = vec![1.0 / 128.0; hidden]; + + let input: Vec = (0..hidden).map(|i| ((i as i16 % 256 - 128) as i8)).collect(); + + group.bench_function("qkv_projection", |b| { + let mut q_out = vec![0i32; hidden]; + let mut k_out = vec![0i32; hidden]; + let mut v_out = vec![0i32; hidden]; + b.iter(|| { + qgemm_i8_simd(1, hidden, hidden, &input, 1.0/128.0, &q_weights, &q_scales, None, &mut q_out); + qgemm_i8_simd(1, hidden, hidden, &input, 1.0/128.0, &k_weights, &k_scales, None, &mut k_out); + qgemm_i8_simd(1, hidden, hidden, &input, 1.0/128.0, &v_weights, &v_scales, None, &mut v_out); + black_box((q_out[0], k_out[0], v_out[0])) + }) + }); + + group.bench_function("ffn_forward", |b| { + let mut ffn_mid = vec![0i32; ffn_hidden]; + let mut out = vec![0i32; hidden]; + b.iter(|| { + qgemm_i8_simd(1, ffn_hidden, hidden, &input, 1.0/128.0, &ffn_up, &ffn_up_scales, None, &mut ffn_mid); + let ffn_mid_i8: Vec = ffn_mid.iter().map(|&x| (x >> 8).clamp(-128, 127) as i8).collect(); + qgemm_i8_simd(1, hidden, ffn_hidden, &ffn_mid_i8, 1.0/128.0, &ffn_down, &ffn_down_scales, None, &mut out); + black_box(out[0]) + }) + }); + + group.bench_function("layer_norm_overhead", |b| { + let input_f32: Vec = input.iter().map(|&x| x as f32 / 128.0).collect(); + let gamma_f32: Vec = vec![1.0f32; hidden]; + let beta_f32: Vec = vec![0.0f32; hidden]; + let mut output = vec![0.0f32; hidden]; + b.iter(|| { + layer_norm(&input_f32, &gamma_f32, &beta_f32, 1e-5, &mut output); + black_box(output[0]) + }) + }); + + group.finish(); +} + +criterion_group!( + benches, + bench_qgemm_i8_sizes, + bench_qgemv, + bench_int4_pack_unpack, + bench_int4_weights, + bench_int4_gemv, + bench_int4_gemm, + bench_int4_memory_comparison, + bench_layer_norm, + bench_arena_allocation, + bench_arena_size_calculation, + bench_timer_overhead, + bench_transformer_layer_simulation, +); + +criterion_main!(benches);