From 1a563ec661b0c45fb4e0cd173663ba3ec55396ec Mon Sep 17 00:00:00 2001 From: ruvnet Date: Sun, 3 May 2026 15:20:25 -0400 Subject: [PATCH] =?UTF-8?q?feat(hailo):=20P4=20=E2=80=94=20HailoEmbedder?= =?UTF-8?q?=20routes=20HEF=20>=20cpu-fallback=20(iter=20162)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ADR-176 P4. HailoEmbedder::open now picks the best available inference path: 1. NPU HEF (hailo + cpu-fallback features ON, model.hef + safetensors trio present in dir) 2. cpu-fallback (cpu-fallback feature ON, safetensors only) 3. NoModelLoaded (worker still serves health probes) 4. FeatureDisabled (no relevant features built in) embed() dispatches in the same order; has_model() returns true if either HEF or cpu-fallback is loaded. The dimensions() value comes from the HEF output shape when available, then cpu-fallback's BERT config, then the MINI_LM_DIM constant. cpu-fallback loads only if the HEF didn't (avoids a duplicate 90 MB safetensors mmap when both candidates could load). The cluster's iter-143 fingerprint already keys off the artifacts present, so HEF-equipped workers and cpu-fallback workers automatically end up in distinct fleet groups (their vectors differ slightly due to INT8 quantization vs FP32, so mixing would break dispatch invariants). All 4 feature combos clippy-clean (-D warnings): default ✓ --features cpu-fallback ✓ --features hailo ✓ --features hailo,cpu-fallback ✓ ruvector-hailo: 15 lib tests pass (was 14, +host_embeddings test). ruvector-hailo-cluster: 99 tests pass, worker builds clean. Iter 163 next: deploy iter-162 worker to Pi 5 + drop the iter-156b HEF into /var/lib/ruvector-hailo/models/all-minilm-l6-v2/, restart systemd, verify startup self-test fires through the HEF path, benchmark vs cpu-fallback (target ≥5x throughput per ADR-176 acceptance criteria). 
Co-Authored-By: claude-flow --- crates/ruvector-hailo/src/lib.rs | 106 +++++++++++++++++++++++-------- 1 file changed, 79 insertions(+), 27 deletions(-) diff --git a/crates/ruvector-hailo/src/lib.rs b/crates/ruvector-hailo/src/lib.rs index 693828578..43c806339 100644 --- a/crates/ruvector-hailo/src/lib.rs +++ b/crates/ruvector-hailo/src/lib.rs @@ -77,6 +77,13 @@ pub struct HailoEmbedder { /// `--features cpu-fallback`. #[cfg(feature = "cpu-fallback")] cpu_fallback: Option, + /// Iter 162 (ADR-176 P4) — NPU acceleration via the iter-156b HEF + /// plus iter-160 host-side embeddings plus iter-161 end-to-end + /// pipeline. `Some(_)` when both `model.hef` and the safetensors + /// trio are present in `model_dir`. Takes precedence over + /// `cpu_fallback` in `embed()` dispatch. + #[cfg(all(feature = "hailo", feature = "cpu-fallback"))] + hef_embedder: Option, } impl HailoEmbedder { @@ -128,25 +135,65 @@ impl HailoEmbedder { #[cfg(all(not(feature = "hailo"), feature = "cpu-fallback"))] let device_id = "cpu-fallback:no-hailo-feature".to_string(); - // Iter 133 path-C: load CPU fallback when the feature is on - // and the model dir has the HF safetensors trio. When there's - // no HEF (always true today — model surgery pending) the CPU - // fallback is the sole inference path. + // Iter 162 (ADR-176 P4) — open priority: + // 1. HEF + safetensors trio (NPU acceleration) + // 2. safetensors trio only (cpu-fallback) + // 3. neither (NoModelLoaded — health probe still serves) + // + // HEF requires both `hailo` (for HefPipeline) and `cpu-fallback` + // (for HostEmbeddings + tokenizer). When the feature lattice + // doesn't enable both, we fall straight through to cpu-fallback + // (or no model). + // Both paths are only consulted under `feature = "cpu-fallback"` + // (HEF requires it for HostEmbeddings, cpu-fallback obviously); + // gate to silence unused-var warnings on `--features hailo` alone. 
#[cfg(feature = "cpu-fallback")] - let cpu_fallback = { - let safetensors = model_dir.join("model.safetensors"); - let hef_path = model_dir.join("model.hef"); - if !hef_path.exists() && safetensors.exists() { - Some(crate::cpu_embedder::CpuEmbedder::open(model_dir)?) + let hef_path = model_dir.join("model.hef"); + #[cfg(feature = "cpu-fallback")] + let safetensors = model_dir.join("model.safetensors"); + + #[cfg(all(feature = "hailo", feature = "cpu-fallback"))] + let hef_embedder = { + if hef_path.exists() && safetensors.exists() { + if let Some(dev) = device_opt.as_ref() { + Some(crate::hef_embedder::HefEmbedder::open(dev, model_dir)?) + } else { + None + } } else { None } }; - // Dimension comes from the CPU fallback's BERT config when - // available, otherwise the MINI_LM constant. Future HEF path - // reads it from the network group's output shape. - #[cfg(feature = "cpu-fallback")] + // cpu-fallback: load only if HEF wasn't loaded (avoid duplicate + // 90 MB safetensors mmap when both could load). + #[cfg(all(feature = "hailo", feature = "cpu-fallback"))] + let cpu_fallback = if hef_embedder.is_some() { + None + } else if !hef_path.exists() && safetensors.exists() { + Some(crate::cpu_embedder::CpuEmbedder::open(model_dir)?) + } else { + None + }; + + #[cfg(all(not(feature = "hailo"), feature = "cpu-fallback"))] + let cpu_fallback = if !hef_path.exists() && safetensors.exists() { + Some(crate::cpu_embedder::CpuEmbedder::open(model_dir)?) + } else { + None + }; + + // Dimension priority: HEF output dim > cpu-fallback BERT config + // > MINI_LM_DIM constant. The HEF was compiled for hidden_size + // 384 in iter-156b; this gate makes any future HEF with a + // different hidden_size automatically picked up. 
+ #[cfg(all(feature = "hailo", feature = "cpu-fallback"))] + let dimensions = hef_embedder + .as_ref() + .map(|h| h.output_dim()) + .or_else(|| cpu_fallback.as_ref().map(|c| c.output_dim())) + .unwrap_or(crate::inference::MINI_LM_DIM); + #[cfg(all(not(feature = "hailo"), feature = "cpu-fallback"))] let dimensions = cpu_fallback .as_ref() .map(|c| c.output_dim()) @@ -168,6 +215,8 @@ impl HailoEmbedder { device: device_opt.map(Mutex::new), #[cfg(feature = "cpu-fallback")] cpu_fallback, + #[cfg(all(feature = "hailo", feature = "cpu-fallback"))] + hef_embedder, }) } @@ -220,11 +269,16 @@ impl HailoEmbedder { /// the ModelLoaded gate trips and `embed` starts dispatching to /// the NPU's vstream API. pub fn embed(&self, text: &str) -> Result> { - // Iter 137: dispatch order: - // 1. CPU fallback if loaded (real semantic vectors today) - // 2. NPU HEF inference (only path that exercises the device, - // currently NoModelLoaded — pending HEF model surgery) - // 3. FeatureDisabled if neither feature is built in + // Iter 162 (ADR-176 P4): dispatch order: + // 1. NPU HEF (real NPU acceleration, ~73 FPS encoder) + // 2. CPU fallback (host CPU BERT-6, ~7 FPS / Pi worker) + // 3. NoModelLoaded — health probes still serve + // 4. FeatureDisabled if neither feature is built in + #[cfg(all(feature = "hailo", feature = "cpu-fallback"))] + if let Some(hef) = &self.hef_embedder { + return hef.embed(text); + } + #[cfg(feature = "cpu-fallback")] if let Some(cpu) = &self.cpu_fallback { return cpu.embed(text); @@ -233,12 +287,6 @@ impl HailoEmbedder { #[cfg(feature = "hailo")] { let _ = text; - // Hold the device lock briefly — preserves the contract - // that the real HEF-based inference path needs - // single-writer access to the vstream descriptors. - if let Some(dev) = &self.device { - let _guard = dev.lock().unwrap_or_else(|p| p.into_inner()); - } return Err(HailoError::NoModelLoaded); } @@ -278,9 +326,13 @@ impl HailoEmbedder { /// configured into the vdevice. 
No callers need to change — the /// signal flips automatically. pub fn has_model(&self) -> bool { - // Iter 133 path-C: CPU fallback counts as a loaded model. - // The cluster's `validate_fleet` flow correctly marks workers - // ready=true when CPU fallback is wired even with no HEF. + // Iter 162 (ADR-176 P4): HEF + cpu-fallback both count. + #[cfg(all(feature = "hailo", feature = "cpu-fallback"))] + { + if self.hef_embedder.is_some() { + return true; + } + } #[cfg(feature = "cpu-fallback")] { if self.cpu_fallback.is_some() {