From 08099401a1cc5563aae5c2ff804d9196539b0df9 Mon Sep 17 00:00:00 2001
From: ruvnet
Date: Sat, 2 May 2026 11:01:29 -0400
Subject: [PATCH] bench(ruvector-hailo): WordPiece tokenizer throughput regression guard
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a criterion bench (`cargo bench --bench wordpiece_throughput`) that
builds a realistic ~30k-entry synthetic vocab (mirrors BERT-base shape:
4 specials + 96 [unusedN] fillers, 26 single chars + ## variants, digits
and punctuation, 676 bigrams, ~29.6k generated 3-6 char tokens incl. ##
continuations) and measures `encode()` at four sequence-length targets:
16, 64, 128, 256.

Baseline numbers (May 2026):

max_seq | x86 Ryzen | Pi 5 Cortex-A76 | % of 3ms NPU forward
--------+-----------+-----------------+---------------------
     16 |   1.61 µs |         8.19 µs | 0.27%
     64 |   7.99 µs |        39.70 µs | 1.32%
    128 |  17.96 µs |        88.70 µs | 2.96%
    256 |  34.88 µs |       178.20 µs | 5.93%

Conclusion: Cortex-A76 tokenizes the all-MiniLM-L6-v2 default 128-token
sequence in ~89 µs single-threaded, ~33x faster than the projected
Hailo-8 forward pass. Tokenizer is not the bottleneck of the hot path;
SIMD vectorization (basic-tokenize / wordpiece greedy match) is
premature optimization at this profile and is intentionally not
pursued. Revisit only if a future profile shows tokenizer p99 climbing
into 0.5 ms+ territory.

Bench is regression-only — no clippy gate, no CI step (criterion runs
in dev environments only). Runs fine on x86 dev hosts; meaningful
numbers are aarch64 Pi 5 native (run via SSH + genesis toolchain).

Co-Authored-By: claude-flow --- crates/ruvector-hailo/Cargo.toml | 9 +- .../benches/wordpiece_throughput.rs | 128 ++++++++++++++++++ 2 files changed, 135 insertions(+), 2 deletions(-) create mode 100644 crates/ruvector-hailo/benches/wordpiece_throughput.rs diff --git a/crates/ruvector-hailo/Cargo.toml b/crates/ruvector-hailo/Cargo.toml index f3bd083a..8466c243 100644 --- a/crates/ruvector-hailo/Cargo.toml +++ b/crates/ruvector-hailo/Cargo.toml @@ -24,5 +24,10 @@ thiserror = "2" hailort-sys = { path = "../hailort-sys" } [dev-dependencies] -anyhow = "1" -proptest = "1" +anyhow = "1" +proptest = "1" +criterion = { version = "0.5", default-features = false, features = ["plotters", "cargo_bench_support"] } + +[[bench]] +name = "wordpiece_throughput" +harness = false diff --git a/crates/ruvector-hailo/benches/wordpiece_throughput.rs b/crates/ruvector-hailo/benches/wordpiece_throughput.rs new file mode 100644 index 00000000..0b8ae428 --- /dev/null +++ b/crates/ruvector-hailo/benches/wordpiece_throughput.rs @@ -0,0 +1,128 @@ +//! `WordPieceTokenizer::encode` throughput benchmark. +//! +//! Motivation (May 2026 design check): the worker hot path is +//! `tokenize -> NPU forward pass -> reply`. NPU forward on Hailo-8 is +//! ~1-3 ms for a single 128-token sequence. If tokenization on Cortex-A76 +//! costs more than ~500 µs, the NPU is starved. +//! +//! This bench builds a realistic-size synthetic vocabulary (~30k entries +//! to mirror BERT-base) and runs `encode` against representative English +//! text at four sequence-length targets: 16, 64, 128, 256 tokens. +//! +//! It is hardware-agnostic — the same harness runs on x86 dev hosts and +//! on the Pi 5 over SSH; the absolute numbers from each give a +//! before/after comparison for any optimisation work (SIMD basic-tokenize, +//! interned vocab, etc.). +//! +//! Run with: +//! cargo bench --bench wordpiece_throughput +//! +//! On the Pi 5 (cross-compiled or native) this is the canonical signal +//! 
for whether tokenization is the bottleneck before / after the HEF
+//! lands. Measurements logged to PR #413 review thread.
+
+use criterion::{black_box, criterion_group, criterion_main, Criterion, BenchmarkId, Throughput};
+use ruvector_hailo::tokenizer::WordPieceTokenizer;
+
+/// Build a deterministic 30,500-entry vocab, one token per line, laid out like BERT-base:
+/// 4 specials, `[unusedN]` fillers up to id 99, single chars, digits, punctuation, all 676
+/// bigrams, then LCG-generated 3-6 letter tokens (some `##`-prefixed continuations).
+fn synthetic_vocab() -> String {
+    let mut v: Vec<String> = Vec::with_capacity(30_522);
+    v.push("[PAD]".into());
+    v.push("[UNK]".into());
+    v.push("[CLS]".into());
+    v.push("[SEP]".into());
+    // Pad to 100 with [unusedN] so common BERT IDs land where users expect.
+    for i in 4..100 {
+        v.push(format!("[unused{}]", i));
+    }
+    // Common single chars first (high hit rate on real text).
+    for c in 'a'..='z' {
+        v.push(c.to_string());
+        v.push(format!("##{}", c));
+    }
+    for c in '0'..='9' {
+        v.push(c.to_string());
+    }
+    for p in [",", ".", "!", "?", "-", ":", ";", "(", ")", "'", "\"", "/", "&"] {
+        v.push(p.to_string());
+    }
+    // Bigrams (2-char). 26*26 = 676.
+    for a in 'a'..='z' {
+        for b in 'a'..='z' {
+            v.push(format!("{}{}", a, b));
+        }
+    }
+    // Fill the rest (~29.6k entries) with procedurally generated 3-6 letter tokens. A tiny
+    // LCG with a fixed seed avoids a rand crate dep and keeps the vocab identical every run.
+    let mut state: u32 = 0xc0ffee;
+    let next = |s: &mut u32| -> u32 {
+        *s = s.wrapping_mul(1664525).wrapping_add(1013904223);
+        *s
+    };
+    while v.len() < 30_500 {
+        let l = 3 + (next(&mut state) % 4) as usize; // 3..=6 char tokens
+        let prefix = if next(&mut state) % 3 == 0 { "##" } else { "" };
+        let mut s = String::from(prefix);
+        for _ in 0..l {
+            let c = b'a' + (next(&mut state) % 26) as u8;
+            s.push(c as char);
+        }
+        v.push(s); // generated tokens are not de-duplicated
+    }
+    v.join("\n")
+}
+
+/// Realistic English-ish text generator.
Cycles a fixed word list, so no network or
+/// embedded fixture is needed and the output is deterministic. Stops once the length reaches
+/// `target_chars` bytes, so the result may overshoot by up to one word (plus a comma).
+fn sample_text(target_chars: usize) -> String {
+    const STOCK: &[&str] = &[
+        "the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog", "ruvector",
+        "embeddings", "search", "system", "produces", "high-quality", "dense", "vectors",
+        "from", "natural", "language", "queries", "for", "use", "in", "downstream",
+        "retrieval", "pipelines", "and", "neural", "ranking", "models", "trained",
+        "on", "scientific", "literature", "and", "general", "domain", "corpora",
+    ];
+    let mut out = String::with_capacity(target_chars);
+    let mut i = 0usize;
+    while out.len() < target_chars {
+        if !out.is_empty() {
+            out.push(' ');
+        }
+        out.push_str(STOCK[i % STOCK.len()]);
+        i += 1;
+        // Append a comma after words 11, 23, 35, ... (one comma per 12 words).
+        if i % 12 == 11 {
+            out.push(',');
+        }
+    }
+    out
+}
+
+fn bench_encode(c: &mut Criterion) {
+    let vocab = synthetic_vocab();
+    let tok = WordPieceTokenizer::from_vocab_str(&vocab).expect("build tokenizer");
+
+    let mut group = c.benchmark_group("wordpiece_encode");
+    // Tokenizer setup above runs once, outside the timed closure. Sequence lengths: short
+    // queries (16), single-sentence chunks (64), paragraph chunks (128 — the
+    // all-MiniLM-L6-v2 default), and long passages (256).
+    for max_seq in &[16usize, 64, 128, 256] {
+        // Generate at least max_seq*4 chars of input; the aim is a token count near max_seq
+        // before truncation (assumes ~4 chars per token with this vocab — not verified here).
+        let text = sample_text(max_seq * 4);
+        group.throughput(Throughput::Elements(1)); // one encode call per timed iteration
+        group.bench_with_input(BenchmarkId::from_parameter(max_seq), max_seq, |b, &max_seq| {
+            b.iter(|| {
+                let enc = tok.encode(black_box(&text), black_box(max_seq), true);
+                black_box(enc);
+            });
+        });
+    }
+    group.finish();
+}
+
+criterion_group!(benches, bench_encode);
+criterion_main!(benches);