bench(ruvector-hailo): WordPiece tokenizer throughput regression guard

Adds a criterion bench (`cargo bench --bench wordpiece_throughput`)
that builds a realistic ~30k-entry synthetic vocab (mirrors BERT-base
shape: 4 specials + 96 unused, 26 single chars + ## variants, 10 digits,
13 punctuation marks, 676 bigrams, and ~29.6k procedurally generated
3-6 char tokens, roughly a third of them ## continuations) and measures
`encode()` at four sequence-length targets: 16, 64, 128, 256.

Baseline numbers (May 2026):

  max_seq | x86 Ryzen | Pi 5 Cortex-A76 | % of 3ms NPU forward
  --------+-----------+-----------------+---------------------
    16    |  1.61 µs  |    8.19 µs      |        0.27%
    64    |  7.99 µs  |   39.70 µs      |        1.32%
   128    | 17.96 µs  |   88.70 µs      |        2.96%
   256    | 34.88 µs  |  178.20 µs      |        5.94%

Conclusion: Cortex-A76 tokenizes the all-MiniLM-L6-v2 default 128-token
sequence in ~89 µs single-threaded, ~33x faster than the projected
Hailo-8 forward pass. Tokenizer is not the bottleneck of the hot path;
SIMD vectorization (basic-tokenize / wordpiece greedy match) is
premature optimization at this profile and is intentionally not
pursued. Revisit only if a future profile shows tokenizer p99 climbing
into 0.5 ms+ territory.

Bench is for manual regression checks only — no clippy gate and no CI
step (criterion runs in dev environments only). It runs fine on x86 dev
hosts, but the meaningful numbers come from a native aarch64 run on the
Pi 5 (via SSH + genesis toolchain).

Co-Authored-By: claude-flow <ruv@ruv.net>
This commit is contained in:
ruvnet 2026-05-02 11:01:29 -04:00
parent ea91065e47
commit 08099401a1
2 changed files with 135 additions and 2 deletions

View file

@ -24,5 +24,10 @@ thiserror = "2"
hailort-sys = { path = "../hailort-sys" }
[dev-dependencies]
anyhow = "1"
proptest = "1"
anyhow = "1"
proptest = "1"
criterion = { version = "0.5", default-features = false, features = ["plotters", "cargo_bench_support"] }
[[bench]]
name = "wordpiece_throughput"
harness = false

View file

@ -0,0 +1,128 @@
//! `WordPieceTokenizer::encode` throughput benchmark.
//!
//! Motivation (May 2026 design check): the worker hot path is
//! `tokenize -> NPU forward pass -> reply`. NPU forward on Hailo-8 is
//! ~1-3 ms for a single 128-token sequence. If tokenization on Cortex-A76
//! costs more than ~500 µs, the NPU is starved.
//!
//! This bench builds a realistic-size synthetic vocabulary (~30k entries
//! to mirror BERT-base) and runs `encode` against representative English
//! text at four sequence-length targets: 16, 64, 128, 256 tokens.
//!
//! It is hardware-agnostic — the same harness runs on x86 dev hosts and
//! on the Pi 5 over SSH; the absolute numbers from each give a
//! before/after comparison for any optimisation work (SIMD basic-tokenize,
//! interned vocab, etc.).
//!
//! Run with:
//! cargo bench --bench wordpiece_throughput
//!
//! On the Pi 5 (cross-compiled or native) this is the canonical signal
//! for whether tokenization is the bottleneck before / after the HEF
//! lands. Measurements logged to PR #413 review thread.
use criterion::{black_box, criterion_group, criterion_main, Criterion, BenchmarkId, Throughput};
use ruvector_hailo::tokenizer::WordPieceTokenizer;
/// Build a synthetic ~30k-entry vocab that mirrors BERT-base's structure:
/// 4 specials and 96 `[unusedN]` placeholders, single characters (plus `##`
/// continuations for letters), digits, punctuation, all 676 lowercase
/// bigrams, then procedurally generated 3-6 char base tokens and `##xxx`
/// continuations.
///
/// Returns the entries joined with `\n` (one token per line, 30_500 lines).
/// Every entry is unique and generation is fully deterministic, so bench
/// numbers are comparable across runs.
fn synthetic_vocab() -> String {
    // Total number of vocab lines to emit.
    const TARGET_LEN: usize = 30_500;

    let mut v: Vec<String> = Vec::with_capacity(TARGET_LEN);
    v.push("[PAD]".into());
    v.push("[UNK]".into());
    v.push("[CLS]".into());
    v.push("[SEP]".into());
    // Pad to 100 with [unusedN] so common BERT IDs land where users expect.
    for i in 4..100 {
        v.push(format!("[unused{}]", i));
    }
    // Common single chars first (high hit rate on real text).
    for c in 'a'..='z' {
        v.push(c.to_string());
        v.push(format!("##{}", c));
    }
    for c in '0'..='9' {
        v.push(c.to_string());
    }
    for p in [",", ".", "!", "?", "-", ":", ";", "(", ")", "'", "\"", "/", "&"] {
        v.push(p.to_string());
    }
    // Bigrams (2-char). 26*26 = 676.
    for a in 'a'..='z' {
        for b in 'a'..='z' {
            v.push(format!("{}{}", a, b));
        }
    }

    // Procedural 3-6 char tokens fill out the rest of the vocab (~29.6k
    // entries). Use a tiny LCG so we don't need a rand crate dep.
    //
    // Only the HIGH 16 bits of the state are handed out. With a power-of-two
    // modulus the low bits of an LCG cycle with very short periods (the low
    // two bits repeat every 4 calls), so drawing `% 4` / `% 26` straight from
    // the raw state pins every token length to a single value and restricts
    // each character position to half of the alphabet.
    let mut state: u32 = 0xc0ffee;
    let next = |s: &mut u32| -> u32 {
        *s = s.wrapping_mul(1664525).wrapping_add(1013904223);
        *s >> 16
    };

    // Track emitted tokens so the procedural section never repeats an entry;
    // the vocab then has exactly TARGET_LEN distinct lines.
    let mut seen: std::collections::HashSet<String> = v.iter().cloned().collect();
    while v.len() < TARGET_LEN {
        let l = 3 + (next(&mut state) % 4) as usize; // 3..=6 char tokens
        // Roughly one token in three is a `##` continuation piece.
        let prefix = if next(&mut state) % 3 == 0 { "##" } else { "" };
        let mut s = String::with_capacity(prefix.len() + l);
        s.push_str(prefix);
        for _ in 0..l {
            let c = b'a' + (next(&mut state) % 26) as u8;
            s.push(char::from(c));
        }
        if seen.insert(s.clone()) {
            v.push(s);
        }
    }
    v.join("\n")
}
/// Deterministic, prose-shaped filler text for the tokenizer bench.
///
/// Cycles through a fixed word list, separating words with single spaces and
/// appending a comma after the 11th, 23rd, 35th, ... word, until the string
/// is at least `target_chars` bytes long. The final word is never cut, so the
/// result may overshoot the target slightly. No network access or embedded
/// fixture is needed. `target_chars == 0` yields an empty string.
fn sample_text(target_chars: usize) -> String {
    const STOCK: &[&str] = &[
        "the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog", "ruvector",
        "embeddings", "search", "system", "produces", "high-quality", "dense", "vectors",
        "from", "natural", "language", "queries", "for", "use", "in", "downstream",
        "retrieval", "pipelines", "and", "neural", "ranking", "models", "trained",
        "on", "scientific", "literature", "and", "general", "domain", "corpora",
    ];

    let mut text = String::with_capacity(target_chars);
    for (idx, word) in STOCK.iter().cycle().enumerate() {
        // Length is checked before each word, never mid-word.
        if text.len() >= target_chars {
            break;
        }
        if idx > 0 {
            text.push(' ');
        }
        text.push_str(word);
        // `idx + 1` is the number of words emitted so far; sprinkle some
        // punctuation every ~12 words.
        if (idx + 1) % 12 == 11 {
            text.push(',');
        }
    }
    text
}
/// Criterion entry point: times `WordPieceTokenizer::encode` against the
/// synthetic vocab at each target sequence length.
///
/// Panics with "build tokenizer" if the tokenizer cannot be constructed from
/// the synthetic vocab.
fn bench_encode(c: &mut Criterion) {
    // Target sequence lengths cover the realistic span: short queries (16),
    // single-sentence chunks (64), paragraph chunks (128 — the
    // all-MiniLM-L6-v2 default), and long passages (256).
    const SEQ_LENS: [usize; 4] = [16, 64, 128, 256];

    let vocab_text = synthetic_vocab();
    let tokenizer = WordPieceTokenizer::from_vocab_str(&vocab_text).expect("build tokenizer");

    let mut group = c.benchmark_group("wordpiece_encode");
    // Each iteration encodes exactly one text, so throughput is reported as
    // one element per iteration for every benchmark in the group.
    group.throughput(Throughput::Elements(1));

    for &seq_len in &SEQ_LENS {
        // Aim for ~seq_len*4 chars so post-tokenization length lands close to
        // the target before truncation kicks in.
        let input = sample_text(seq_len * 4);
        let id = BenchmarkId::from_parameter(seq_len);
        group.bench_with_input(id, &seq_len, |bencher, &limit| {
            bencher.iter(|| {
                // NOTE(review): the trailing `true` is presumably the
                // "add special tokens" flag — confirm against the tokenizer API.
                black_box(tokenizer.encode(black_box(&input), black_box(limit), true));
            });
        });
    }
    group.finish();
}
// Register `bench_encode` in a criterion group named `benches`, then let
// criterion generate the bench binary's `main` (the `[[bench]]` target is
// declared with `harness = false`, so no default test harness is linked).
criterion_group!(benches, bench_encode);
criterion_main!(benches);