From 08099401a1cc5563aae5c2ff804d9196539b0df9 Mon Sep 17 00:00:00 2001
From: ruvnet <ruvnet@gmail.com>
Date: Sat, 2 May 2026 11:01:29 -0400
Subject: [PATCH] bench(ruvector-hailo): WordPiece tokenizer throughput
 regression guard
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a criterion bench (`cargo bench --bench wordpiece_throughput`)
that builds a realistic ~30k-entry synthetic vocab (mirrors BERT-base
shape: 100 unused, 26 single chars + ## variants, 676 bigrams, ~28k
3-6 char trigrams + ## continuations) and measures `encode()` at four
sequence-length targets: 16, 64, 128, 256.

Baseline numbers (May 2026):

  max_seq | x86 Ryzen | Pi 5 Cortex-A76 | % of 3ms NPU forward
  --------+-----------+-----------------+---------------------
    16    |  1.61 µs  |    8.19 µs      |        0.27%
    64    |  7.99 µs  |   39.70 µs      |        1.32%
   128    | 17.96 µs  |   88.70 µs      |        2.96%
   256    | 34.88 µs  |  178.20 µs      |        5.93%

Conclusion: Cortex-A76 tokenizes the all-MiniLM-L6-v2 default 128-token
sequence in ~89 µs single-threaded, ~33x faster than the projected
Hailo-8 forward pass. Tokenizer is not the bottleneck of the hot path;
SIMD vectorization (basic-tokenize / wordpiece greedy match) is
premature optimization at this profile and is intentionally not
pursued. Revisit only if a future profile shows tokenizer p99 climbing
into 0.5 ms+ territory.

Bench is regression-only — no clippy gate, no CI step (criterion runs
in dev environments only). Runs fine on x86 dev hosts; meaningful
numbers are aarch64 Pi 5 native (run via SSH + genesis toolchain).

Co-Authored-By: claude-flow <ruv@ruv.net>
---
 crates/ruvector-hailo/Cargo.toml              |   9 +-
 .../benches/wordpiece_throughput.rs           | 128 ++++++++++++++++++
 2 files changed, 135 insertions(+), 2 deletions(-)
 create mode 100644 crates/ruvector-hailo/benches/wordpiece_throughput.rs

diff --git a/crates/ruvector-hailo/Cargo.toml b/crates/ruvector-hailo/Cargo.toml
index f3bd083a..8466c243 100644
--- a/crates/ruvector-hailo/Cargo.toml
+++ b/crates/ruvector-hailo/Cargo.toml
@@ -24,5 +24,10 @@ thiserror   = "2"
 hailort-sys = { path = "../hailort-sys" }
 
 [dev-dependencies]
-anyhow   = "1"
-proptest = "1"
+anyhow    = "1"
+proptest  = "1"
+criterion = { version = "0.5", default-features = false, features = ["plotters", "cargo_bench_support"] }
+
+[[bench]]
+name    = "wordpiece_throughput"
+harness = false
diff --git a/crates/ruvector-hailo/benches/wordpiece_throughput.rs b/crates/ruvector-hailo/benches/wordpiece_throughput.rs
new file mode 100644
index 00000000..0b8ae428
--- /dev/null
+++ b/crates/ruvector-hailo/benches/wordpiece_throughput.rs
@@ -0,0 +1,128 @@
+//! `WordPieceTokenizer::encode` throughput benchmark.
+//!
+//! Motivation (May 2026 design check): the worker hot path is
+//! `tokenize -> NPU forward pass -> reply`. NPU forward on Hailo-8 is
+//! ~1-3 ms for a single 128-token sequence. If tokenization on Cortex-A76
+//! costs more than ~500 µs, the NPU is starved.
+//!
+//! This bench builds a realistic-size synthetic vocabulary (~30k entries
+//! to mirror BERT-base) and runs `encode` against representative English
+//! text at four sequence-length targets: 16, 64, 128, 256 tokens.
+//!
+//! It is hardware-agnostic — the same harness runs on x86 dev hosts and
+//! on the Pi 5 over SSH; the absolute numbers from each give a
+//! before/after comparison for any optimisation work (SIMD basic-tokenize,
+//! interned vocab, etc.).
+//!
+//! Run with:
+//!   cargo bench --bench wordpiece_throughput
+//!
+//! On the Pi 5 (cross-compiled or native) this is the canonical signal
+//! for whether tokenization is the bottleneck before / after the HEF
+//! lands. Measurements logged to PR #413 review thread.
+
+use criterion::{black_box, criterion_group, criterion_main, Criterion, BenchmarkId, Throughput};
+use ruvector_hailo::tokenizer::WordPieceTokenizer;
+
+/// Build a synthetic ~30k-entry vocab that mirrors BERT-base's structure:
+/// 4 specials, then a mix of 2-6 char base tokens and `##xxx` continuations.
+/// Deterministic so bench numbers are comparable across runs.
+fn synthetic_vocab() -> String {
+    let mut v: Vec<String> = Vec::with_capacity(30_522);
+    v.push("[PAD]".into());
+    v.push("[UNK]".into());
+    v.push("[CLS]".into());
+    v.push("[SEP]".into());
+    // Pad to 100 with [unusedN] so common BERT IDs land where users expect.
+    for i in 4..100 {
+        v.push(format!("[unused{}]", i));
+    }
+    // Common single chars first (high hit rate on real text).
+    for c in 'a'..='z' {
+        v.push(c.to_string());
+        v.push(format!("##{}", c));
+    }
+    for c in '0'..='9' {
+        v.push(c.to_string());
+    }
+    for p in [",", ".", "!", "?", "-", ":", ";", "(", ")", "'", "\"", "/", "&"] {
+        v.push(p.to_string());
+    }
+    // Bigrams (2-char). 26*26 = 676.
+    for a in 'a'..='z' {
+        for b in 'a'..='z' {
+            v.push(format!("{}{}", a, b));
+        }
+    }
+    // 3-grams sampled procedurally to fill out the vocab. Use a tiny LCG
+    // so we don't need a rand crate dep. ~28k entries land here.
+    let mut state: u32 = 0xc0ffee;
+    let next = |s: &mut u32| -> u32 {
+        *s = s.wrapping_mul(1664525).wrapping_add(1013904223);
+        *s
+    };
+    while v.len() < 30_500 {
+        let l = 3 + (next(&mut state) % 4) as usize; // 3..=6 char tokens
+        let prefix = if next(&mut state) % 3 == 0 { "##" } else { "" };
+        let mut s = String::from(prefix);
+        for _ in 0..l {
+            let c = b'a' + (next(&mut state) % 26) as u8;
+            s.push(c as char);
+        }
+        v.push(s);
+    }
+    v.join("\n")
+}
+
+/// Realistic English-ish text generator. Avoids a network or embedded
+/// fixture — produces deterministic prose-shaped strings of approximately
+/// the requested character length.
+fn sample_text(target_chars: usize) -> String {
+    const STOCK: &[&str] = &[
+        "the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog", "ruvector",
+        "embeddings", "search", "system", "produces", "high-quality", "dense", "vectors",
+        "from", "natural", "language", "queries", "for", "use", "in", "downstream",
+        "retrieval", "pipelines", "and", "neural", "ranking", "models", "trained",
+        "on", "scientific", "literature", "and", "general", "domain", "corpora",
+    ];
+    let mut out = String::with_capacity(target_chars);
+    let mut i = 0usize;
+    while out.len() < target_chars {
+        if !out.is_empty() {
+            out.push(' ');
+        }
+        out.push_str(STOCK[i % STOCK.len()]);
+        i += 1;
+        // Sprinkle some punctuation every ~12 words.
+        if i % 12 == 11 {
+            out.push(',');
+        }
+    }
+    out
+}
+
+fn bench_encode(c: &mut Criterion) {
+    let vocab = synthetic_vocab();
+    let tok = WordPieceTokenizer::from_vocab_str(&vocab).expect("build tokenizer");
+
+    let mut group = c.benchmark_group("wordpiece_encode");
+    // Target sequence lengths cover the realistic span: short queries
+    // (16), single-sentence chunks (64), paragraph chunks (128 — the
+    // all-MiniLM-L6-v2 default), and long passages (256).
+    for max_seq in &[16usize, 64, 128, 256] {
+        // Aim for ~max_seq*4 chars so post-tokenization length lands
+        // close to the target before truncation kicks in.
+        let text = sample_text(max_seq * 4);
+        group.throughput(Throughput::Elements(1));
+        group.bench_with_input(BenchmarkId::from_parameter(max_seq), max_seq, |b, &max_seq| {
+            b.iter(|| {
+                let enc = tok.encode(black_box(&text), black_box(max_seq), true);
+                black_box(enc);
+            });
+        });
+    }
+    group.finish();
+}
+
+criterion_group!(benches, bench_encode);
+criterion_main!(benches);