diff --git a/crates/ruvector-rulake/BENCHMARK.md b/crates/ruvector-rulake/BENCHMARK.md
index 03f89a02..98ae6bdc 100644
--- a/crates/ruvector-rulake/BENCHMARK.md
+++ b/crates/ruvector-rulake/BENCHMARK.md
@@ -59,6 +59,37 @@ The QPS drop with shard count under this single-thread benchmark is
 *not* pure `par_iter` startup overhead — see the concurrent-client
 numbers below for the honest picture.
 
+### search_batch vs per-query loop (n = 100 k, warm cache, single-threaded)
+
+`RuLake::search_batch(queries, k)` amortizes `ensure_fresh` and the
+cache mutex across N queries. Measured speedup on an already-primed
+`LocalBackend` under `Consistency::Eventual` (the hot path):
+
+| batch size |     QPS | speedup vs per-query |
+|-----------:|--------:|---------------------:|
+|         8  |   2,874 |              1.01×   |
+|        32  |   2,961 |              1.04×   |
+|       128  |   2,943 |              1.03×   |
+|       300  |   2,986 |              1.05×   |
+| per-query  |   2,855 | baseline             |
+
+The speedup is modest on this workload: the warm-cache path is already
+uncontended (single-threaded; with an Eventual TTL, `ensure_fresh` is a
+HashMap lookup, not a backend RTT). The bigger wins for batching are latent:
+
+- **`Consistency::Fresh`** — each per-query `ensure_fresh` is a
+  backend round-trip. A batch of 300 on Fresh collapses 300 RTTs
+  into 1, which is a dramatic difference at network latency.
+- **Concurrent contention** — fewer mutex acquires under heavy
+  multi-client load. Not measured in this single-threaded bench.
+- **Kernel dispatch (ADR-157)** — GPU / SIMD kernels only beat the
+  baseline CPU path above their `min_batch`. `search_batch` is the
+  plug-point that makes dispatch tractable; a per-query API would
+  never let the GPU win.
+
+Test `search_batch_acquires_cache_lock_once` proves the amortization
+mechanically: a batch of 32 registers as 1 coherence check, not 32.
+
 ### Concurrent clients × shard count (n = 100 k, 8 clients × 300 queries)
 
 With the **adaptive per-shard rerank** introduced via
diff --git a/crates/ruvector-rulake/src/bin/rulake-demo.rs b/crates/ruvector-rulake/src/bin/rulake-demo.rs
index fd79e9a4..8d98cfc6 100644
--- a/crates/ruvector-rulake/src/bin/rulake-demo.rs
+++ b/crates/ruvector-rulake/src/bin/rulake-demo.rs
@@ -302,6 +302,45 @@ fn main() {
         println!();
     }
     if !fast {
+        println!("── search_batch vs per-query loop (n=100k) ──");
+        let n = 100_000;
+        let data = clustered(n, d, 100, seed);
+        let queries = clustered(300, d, 100, seed ^ 0xdead_beef);
+
+        let backend = Arc::new(LocalBackend::new("bench"));
+        backend
+            .put_collection("c", d, (0..n as u64).collect(), data.clone())
+            .unwrap();
+        let lake =
+            RuLake::new(rerank, seed).with_consistency(Consistency::Eventual { ttl_ms: 60_000 });
+        lake.register_backend(backend).unwrap();
+        // Prime.
+        lake.search_one("bench", "c", &queries[0], 10).unwrap();
+
+        // Per-query loop over the full 300-query set.
+        let t = Instant::now();
+        for q in &queries {
+            let _ = lake.search_one("bench", "c", q, 10).unwrap();
+        }
+        let loop_qps = queries.len() as f64 / t.elapsed().as_secs_f64();
+
+        // Batch the same 300 queries at several chunk sizes (8, 32, 128, 300).
+        for &batch_size in &[8usize, 32, 128, 300] {
+            let t = Instant::now();
+            for chunk in queries.chunks(batch_size) {
+                let _ = lake.search_batch("bench", "c", chunk, 10).unwrap();
+            }
+            let batch_qps = queries.len() as f64 / t.elapsed().as_secs_f64();
+            println!(
+                "  batch={:>3}   qps={:>8.0}   speedup vs per-query {:.2}×",
+                batch_size,
+                batch_qps,
+                batch_qps / loop_qps
+            );
+        }
+        println!("  per-query loop   qps={:>8.0}   (baseline)", loop_qps);
+        println!();
+
         println!("── concurrent clients × federation (n=100k, 8 clients × 300 queries) ──");
         let n = 100_000;
         let queries = clustered(300, d, 100, seed ^ 0xdead_beef);