From 03f8d08fd01ade480cffc0409caf268aa00ecdf0 Mon Sep 17 00:00:00 2001
From: ruvnet <ruvnet@gmail.com>
Date: Fri, 8 May 2026 12:51:14 -0400
Subject: [PATCH] =?UTF-8?q?feat(sparse-mario):=20iter=204=20=E2=80=94=20be?=
 =?UTF-8?q?nch=20dense=20vs=20sparse=20vs=20sparse+FastGRNN?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds `benches/sparse_mario_bench.rs` exercising the retrieval workload
shape (heads=1, head_dim=64, non-causal, window=256, block=64) at
seq lengths 256/512/1024/2048. The example's corpus + prefix length is
2.1K–2.9K, so the sweep leads up to, but stops just short of, that range.

Headline numbers (Ryzen 9 9950X, --features parallel,
--warm-up-time 1 --measurement-time 3 --sample-size 20):

  seq    dense       sparse      sparse+FG    speedup (sparse vs dense)
  256    2.41 ms     1.74 ms     2.23 ms      1.4x
  512    9.59 ms     5.21 ms     6.24 ms      1.8x
  1024   38.4 ms     12.2 ms     14.2 ms      3.1x
  2048   154 ms      26.2 ms     30.3 ms      5.9x

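Dense scales 4x per doubling (O(N²) confirmed). Sparse scales 3.0x, 2.3x,
then 2.1x per doubling — approaching ~2x as N grows (sub-quadratic).
The FastGRNN gate is not a constant cost: it adds 0.5 ms at N=256 and
4.1 ms at N=2048 (roughly 15–30% over plain sparse), so at these sizes
with a single head it is a net loss. It is expected to pay back at longer
sequences and wider heads — iter 5 will sweep this.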

Iter-plan progress:
  ✓ 1-3. corpus + retrieval LM + ASCII generation
  ✓ 4. sparse-mario bench                          ← here
    5. fp16 KV cache + FastGRNN sweep + top-k sampling
    6. validation + final summary

Co-Authored-By: claude-flow <ruv@ruv.net>
---
 crates/ruvllm_sparse_attention/Cargo.toml     |   4 +
 .../benches/sparse_mario_bench.rs             | 111 ++++++++++++++++++
 2 files changed, 115 insertions(+)
 create mode 100644 crates/ruvllm_sparse_attention/benches/sparse_mario_bench.rs
diff --git a/crates/ruvllm_sparse_attention/Cargo.toml b/crates/ruvllm_sparse_attention/Cargo.toml
index d2482784..005e35db 100644
--- a/crates/ruvllm_sparse_attention/Cargo.toml
+++ b/crates/ruvllm_sparse_attention/Cargo.toml
@@ -42,3 +42,7 @@ criterion = "0.5"
 [[bench]]
 name = "attention_bench"
 harness = false
+
+[[bench]]
+name = "sparse_mario_bench"
+harness = false
diff --git a/crates/ruvllm_sparse_attention/benches/sparse_mario_bench.rs b/crates/ruvllm_sparse_attention/benches/sparse_mario_bench.rs
new file mode 100644
index 00000000..e2c4d1b0
--- /dev/null
+++ b/crates/ruvllm_sparse_attention/benches/sparse_mario_bench.rs
@@ -0,0 +1,111 @@
+// sparse_mario_bench — benchmark the retrieval workload used by
+// `examples/sparse_mario.rs` against three attention paths:
+//
+//   1. dense_attention             — O(N²) baseline
+//   2. sparse forward()            — O(N log N) with non-causal window+log-stride+landmarks
+//   3. sparse forward_gated_with_fastgrnn() — near-linear with FastGRNN salience gate
+//
+// Tensor shape mirrors sparse-mario: heads=1, head_dim=64, non-causal,
+// window=256, block=64. Sequence lengths 256/512/1024/2048 lead up to, but
+// stop just short of, the example's corpus_len + prefix_len (2.1K–2.9K).
+//
+// Run with:
+//   cargo bench --bench sparse_mario_bench --features parallel \
+//       -- --warm-up-time 1 --measurement-time 3 --sample-size 20
+
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+use ruvllm_sparse_attention::{
+    dense_attention, AttentionBackend, FastGrnnGate, SparseAttentionConfig,
+    SubquadraticSparseAttention, Tensor3,
+};
+
+const HEAD_DIM: usize = 64; // last tensor dimension; also passed to `FastGrnnGate::new`
+const N_HEADS: usize = 1; // middle tensor dimension; a single head
+const SEQS: &[usize] = &[256, 512, 1024, 2048]; // sequence lengths swept by every benchmark
+
+fn random_tensor(seq: usize, seed: u64) -> Tensor3 {
+    let mut rng = StdRng::seed_from_u64(seed); // fixed seed keeps inputs repeatable
+    let draws = std::iter::repeat_with(|| rng.gen_range(-1.0f32..1.0f32));
+    let data: Vec<f32> = draws.take(seq * N_HEADS * HEAD_DIM).collect();
+    Tensor3::from_vec(data, seq, N_HEADS, HEAD_DIM).unwrap()
+}
+
+fn mario_config() -> SparseAttentionConfig { // config shared by both sparse benchmarks
+    SparseAttentionConfig {
+        window: 256, // same window as sparse-mario
+        block_size: 64, // same block size as sparse-mario
+        global_tokens: vec![0], // index 0 is the only entry listed as global
+        causal: false, // sparse-mario attends non-causally
+        use_log_stride: true, // log-stride candidates enabled
+        use_landmarks: true, // landmark candidates enabled
+        sort_candidates: false, // NOTE(review): effect on output/timing not visible here
+    }
+}
+
+/// Baseline: times non-causal `dense_attention` at each length in `SEQS`.
+fn bench_dense(c: &mut Criterion) {
+    let mut group = c.benchmark_group("sparse_mario/dense");
+    for seq in SEQS.iter().copied() {
+        let (q, k, v) = (random_tensor(seq, 1), random_tensor(seq, 2), random_tensor(seq, 3));
+        let label = format!("seq_{}", seq);
+        group.bench_function(label, |b| {
+            b.iter(|| {
+                // The trailing `false` is the causal flag; sparse-mario is non-causal.
+                let (q, k, v) = (black_box(&q), black_box(&k), black_box(&v));
+                dense_attention(q, k, v, false).unwrap()
+            })
+        });
+    }
+    group.finish();
+}
+
+/// Times sparse `forward()` under `mario_config()` at each length in `SEQS`.
+fn bench_sparse(c: &mut Criterion) {
+    let mut group = c.benchmark_group("sparse_mario/sparse");
+    // One attention instance is built up front and shared by every length.
+    let attention = SubquadraticSparseAttention::new(mario_config()).unwrap();
+    for seq in SEQS.iter().copied() {
+        let (q, k, v) = (random_tensor(seq, 4), random_tensor(seq, 5), random_tensor(seq, 6));
+        let label = format!("seq_{}", seq);
+        group.bench_function(label, |b| {
+            b.iter(|| {
+                let (q, k, v) = (black_box(&q), black_box(&k), black_box(&v));
+                attention.forward(q, k, v).unwrap()
+            })
+        });
+    }
+    group.finish();
+}
+
+/// Times `forward_gated_with_fastgrnn` under `mario_config()` at each length
+/// in `SEQS`.
+///
+/// A single `FastGrnnGate::new(HEAD_DIM, 32)` is reused for every length.
+fn bench_sparse_fastgrnn(c: &mut Criterion) {
+    let mut group = c.benchmark_group("sparse_mario/sparse_fastgrnn");
+    let attention = SubquadraticSparseAttention::new(mario_config()).unwrap();
+    let gate = FastGrnnGate::new(HEAD_DIM, 32);
+
+    for seq in SEQS.iter().copied() {
+        let (q, k, v) = (random_tensor(seq, 7), random_tensor(seq, 8), random_tensor(seq, 9));
+        // Gate budget: a quarter of the sequence length, never below 8.
+        // NOTE(review): presumably the number of long-range candidates the gate
+        // keeps — confirm against `forward_gated_with_fastgrnn`.
+        let gate_top_k = std::cmp::max(seq / 4, 8);
+        let label = format!("seq_{}", seq);
+        group.bench_function(label, |b| {
+            b.iter(|| {
+                let (q, k, v) = (black_box(&q), black_box(&k), black_box(&v));
+                attention
+                    .forward_gated_with_fastgrnn(q, k, v, &gate, gate_top_k)
+                    .unwrap()
+            })
+        });
+    }
+    group.finish();
+}
+
+criterion_group!(benches, bench_dense, bench_sparse, bench_sparse_fastgrnn);
+criterion_main!(benches); // supplies `main`; needed because Cargo.toml sets `harness = false`