diff --git a/crates/ruvllm_sparse_attention/Cargo.toml b/crates/ruvllm_sparse_attention/Cargo.toml
index d2482784..005e35db 100644
--- a/crates/ruvllm_sparse_attention/Cargo.toml
+++ b/crates/ruvllm_sparse_attention/Cargo.toml
@@ -42,3 +42,7 @@ criterion = "0.5"
 [[bench]]
 name = "attention_bench"
 harness = false
+
+[[bench]]
+name = "sparse_mario_bench"
+harness = false
diff --git a/crates/ruvllm_sparse_attention/benches/sparse_mario_bench.rs b/crates/ruvllm_sparse_attention/benches/sparse_mario_bench.rs
new file mode 100644
index 00000000..e2c4d1b0
--- /dev/null
+++ b/crates/ruvllm_sparse_attention/benches/sparse_mario_bench.rs
@@ -0,0 +1,111 @@
+// sparse_mario_bench — benchmark the retrieval workload used by
+// `examples/sparse_mario.rs` against three attention paths:
+//
+//   1. dense_attention             — O(N²) baseline
+//   2. sparse forward()            — O(N log N) with non-causal window+log-stride+landmarks
+//   3. sparse forward_gated_with_fastgrnn() — near-linear with FastGRNN salience gate
+//
+// Tensor shape mirrors sparse-mario: heads=1, head_dim=64, non-causal,
+// window=256, block=64. Sequence lengths 256/512/1024/2048 cover the
+// realistic range of corpus_len + prefix_len in the example (2.1K–2.9K).
+//
+// Run with:
+//   cargo bench --bench sparse_mario_bench --features parallel \
+//       -- --warm-up-time 1 --measurement-time 3 --sample-size 20
+
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+use ruvllm_sparse_attention::{
+    dense_attention, AttentionBackend, FastGrnnGate, SparseAttentionConfig,
+    SubquadraticSparseAttention, Tensor3,
+};
+
+const HEAD_DIM: usize = 64;
+const N_HEADS: usize = 1;
+const SEQS: &[usize] = &[256, 512, 1024, 2048];
+
+fn random_tensor(seq: usize, seed: u64) -> Tensor3 {
+    let mut rng = StdRng::seed_from_u64(seed);
+    let len = seq * N_HEADS * HEAD_DIM;
+    let data: Vec<f32> = (0..len).map(|_| rng.gen_range(-1.0f32..1.0f32)).collect();
+    Tensor3::from_vec(data, seq, N_HEADS, HEAD_DIM).unwrap()
+}
+
+fn mario_config() -> SparseAttentionConfig {
+    SparseAttentionConfig {
+        window: 256,
+        block_size: 64,
+        global_tokens: vec![0],
+        causal: false,
+        use_log_stride: true,
+        use_landmarks: true,
+        sort_candidates: false,
+    }
+}
+
+fn bench_dense(c: &mut Criterion) {
+    let mut group = c.benchmark_group("sparse_mario/dense");
+    for &seq in SEQS {
+        let q = random_tensor(seq, 1);
+        let k = random_tensor(seq, 2);
+        let v = random_tensor(seq, 3);
+        group.bench_function(format!("seq_{}", seq), |b| {
+            b.iter(|| {
+                // sparse-mario uses non-causal attention; dense_attention's
+                // last arg is the causal flag.
+                dense_attention(black_box(&q), black_box(&k), black_box(&v), false).unwrap()
+            })
+        });
+    }
+    group.finish();
+}
+
+fn bench_sparse(c: &mut Criterion) {
+    let mut group = c.benchmark_group("sparse_mario/sparse");
+    let attention = SubquadraticSparseAttention::new(mario_config()).unwrap();
+    for &seq in SEQS {
+        let q = random_tensor(seq, 4);
+        let k = random_tensor(seq, 5);
+        let v = random_tensor(seq, 6);
+        group.bench_function(format!("seq_{}", seq), |b| {
+            b.iter(|| {
+                attention
+                    .forward(black_box(&q), black_box(&k), black_box(&v))
+                    .unwrap()
+            })
+        });
+    }
+    group.finish();
+}
+
+fn bench_sparse_fastgrnn(c: &mut Criterion) {
+    let mut group = c.benchmark_group("sparse_mario/sparse_fastgrnn");
+    let attention = SubquadraticSparseAttention::new(mario_config()).unwrap();
+    let gate = FastGrnnGate::new(HEAD_DIM, 32);
+
+    for &seq in SEQS {
+        let q = random_tensor(seq, 7);
+        let k = random_tensor(seq, 8);
+        let v = random_tensor(seq, 9);
+        // Keep top 25% of long-range candidates — FastGRNN drops the rest.
+        let gate_top_k = (seq / 4).max(8);
+        group.bench_function(format!("seq_{}", seq), |b| {
+            b.iter(|| {
+                attention
+                    .forward_gated_with_fastgrnn(
+                        black_box(&q),
+                        black_box(&k),
+                        black_box(&v),
+                        &gate,
+                        gate_top_k,
+                    )
+                    .unwrap()
+            })
+        });
+    }
+    group.finish();
+}
+
+criterion_group!(benches, bench_dense, bench_sparse, bench_sparse_fastgrnn);
+criterion_main!(benches);