From 03f8d08fd01ade480cffc0409caf268aa00ecdf0 Mon Sep 17 00:00:00 2001 From: ruvnet Date: Fri, 8 May 2026 12:51:14 -0400 Subject: [PATCH] =?UTF-8?q?feat(sparse-mario):=20iter=204=20=E2=80=94=20be?= =?UTF-8?q?nch=20dense=20vs=20sparse=20vs=20sparse+FastGRNN?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `benches/sparse_mario_bench.rs` exercising the retrieval workload shape (heads=1, head_dim=64, non-causal, window=256, block=64) at seq lengths 256/512/1024/2048 — the realistic range of corpus + prefix in the example. Headline numbers (Ryzen 9 9950X, --features parallel, --warm-up-time 1 --measurement-time 3 --sample-size 20): seq dense sparse sparse+FG speedup (sparse vs dense) 256 2.41 ms 1.74 ms 2.23 ms 1.4x 512 9.59 ms 5.21 ms 6.24 ms 1.8x 1024 38.4 ms 12.2 ms 14.2 ms 3.1x 2048 154 ms 26.2 ms 30.3 ms 5.9x Dense scales 4x per doubling (O(N²) confirmed). Sparse scales ~2-3x per doubling (3.0x, 2.3x, 2.1x in the table; sub-quadratic). FastGRNN gate adds an overhead on top of plain sparse that grows with N in absolute terms (about 0.5 ms at 256, about 4 ms at 2048) but shrinks in relative terms (about 28% at 256, about 16% at 2048), so it weighs most at small N and single-head; it may pay back at longer sequences and wider heads — iter 5 will sweep this. Iter-plan progress: ✓ 1-3. corpus + retrieval LM + ASCII generation ✓ 4. sparse-mario bench ← here 5. fp16 KV cache + FastGRNN sweep + top-k sampling 6. 
validation + final summary Co-Authored-By: claude-flow --- crates/ruvllm_sparse_attention/Cargo.toml | 4 + .../benches/sparse_mario_bench.rs | 111 ++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 crates/ruvllm_sparse_attention/benches/sparse_mario_bench.rs diff --git a/crates/ruvllm_sparse_attention/Cargo.toml b/crates/ruvllm_sparse_attention/Cargo.toml index d2482784..005e35db 100644 --- a/crates/ruvllm_sparse_attention/Cargo.toml +++ b/crates/ruvllm_sparse_attention/Cargo.toml @@ -42,3 +42,7 @@ criterion = "0.5" [[bench]] name = "attention_bench" harness = false + +[[bench]] +name = "sparse_mario_bench" +harness = false diff --git a/crates/ruvllm_sparse_attention/benches/sparse_mario_bench.rs b/crates/ruvllm_sparse_attention/benches/sparse_mario_bench.rs new file mode 100644 index 00000000..e2c4d1b0 --- /dev/null +++ b/crates/ruvllm_sparse_attention/benches/sparse_mario_bench.rs @@ -0,0 +1,111 @@ +// sparse_mario_bench — benchmark the retrieval workload used by +// `examples/sparse_mario.rs` against three attention paths: +// +// 1. dense_attention — O(N²) baseline +// 2. sparse forward() — O(N log N) with non-causal window+log-stride+landmarks +// 3. sparse forward_gated_with_fastgrnn() — near-linear with FastGRNN salience gate +// +// Tensor shape mirrors sparse-mario: heads=1, head_dim=64, non-causal, +// window=256, block=64. Sequence lengths 256/512/1024/2048 stop just +// short of the corpus_len + prefix_len range seen in the example (2.1K–2.9K). 
+//
+// Run with:
+//   cargo bench --bench sparse_mario_bench --features parallel \
+//     -- --warm-up-time 1 --measurement-time 3 --sample-size 20
+
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+use ruvllm_sparse_attention::{
+    dense_attention, AttentionBackend, FastGrnnGate, SparseAttentionConfig,
+    SubquadraticSparseAttention, Tensor3,
+};
+
+const HEAD_DIM: usize = 64;
+const N_HEADS: usize = 1;
+const SEQS: &[usize] = &[256, 512, 1024, 2048]; // lengths every benchmark group loops over
+
+fn random_tensor(seq: usize, seed: u64) -> Tensor3 {
+    let mut rng = StdRng::seed_from_u64(seed); // seeded: the same `seed` yields the same data
+    let len = seq * N_HEADS * HEAD_DIM; // one f32 per (position, head, channel)
+    let data: Vec<f32> = (0..len).map(|_| rng.gen_range(-1.0f32..1.0f32)).collect();
+    Tensor3::from_vec(data, seq, N_HEADS, HEAD_DIM).unwrap() // values lie in [-1, 1)
+}
+
+fn mario_config() -> SparseAttentionConfig {
+    SparseAttentionConfig {
+        window: 256,
+        block_size: 64,
+        global_tokens: vec![0],
+        causal: false, // same non-causal setting the dense baseline is called with
+        use_log_stride: true,
+        use_landmarks: true,
+        sort_candidates: false,
+    }
+}
+
+fn bench_dense(c: &mut Criterion) {
+    let mut group = c.benchmark_group("sparse_mario/dense"); // O(N²) baseline
+    for &seq in SEQS {
+        let q = random_tensor(seq, 1);
+        let k = random_tensor(seq, 2);
+        let v = random_tensor(seq, 3);
+        group.bench_function(format!("seq_{}", seq), |b| {
+            b.iter(|| {
+                // sparse-mario uses non-causal attention; dense_attention's
+                // last arg is the causal flag.
+                dense_attention(black_box(&q), black_box(&k), black_box(&v), false).unwrap()
+            })
+        });
+    }
+    group.finish();
+}
+
+fn bench_sparse(c: &mut Criterion) {
+    let mut group = c.benchmark_group("sparse_mario/sparse"); // ungated sparse path
+    let attention = SubquadraticSparseAttention::new(mario_config()).unwrap();
+    for &seq in SEQS {
+        let q = random_tensor(seq, 4);
+        let k = random_tensor(seq, 5);
+        let v = random_tensor(seq, 6);
+        group.bench_function(format!("seq_{}", seq), |b| {
+            b.iter(|| {
+                attention
+                    .forward(black_box(&q), black_box(&k), black_box(&v))
+                    .unwrap()
+            })
+        });
+    }
+    group.finish();
+}
+
+fn bench_sparse_fastgrnn(c: &mut Criterion) {
+    let mut group = c.benchmark_group("sparse_mario/sparse_fastgrnn");
+    let attention = SubquadraticSparseAttention::new(mario_config()).unwrap();
+    let gate = FastGrnnGate::new(HEAD_DIM, 32); // 32: presumably the gate's hidden size — TODO confirm
+
+    for &seq in SEQS {
+        let q = random_tensor(seq, 7);
+        let k = random_tensor(seq, 8);
+        let v = random_tensor(seq, 9);
+        // Keep top 25% (seq / 4, at least 8) of long-range candidates — FastGRNN drops the rest.
+        let gate_top_k = (seq / 4).max(8);
+        group.bench_function(format!("seq_{}", seq), |b| {
+            b.iter(|| {
+                attention
+                    .forward_gated_with_fastgrnn(
+                        black_box(&q),
+                        black_box(&k),
+                        black_box(&v),
+                        &gate,
+                        gate_top_k,
+                    )
+                    .unwrap()
+            })
+        });
+    }
+    group.finish();
+}
+
+criterion_group!(benches, bench_dense, bench_sparse, bench_sparse_fastgrnn);
+criterion_main!(benches);