From 1a563ec661b0c45fb4e0cd173663ba3ec55396ec Mon Sep 17 00:00:00 2001
From: ruvnet <ruvnet@gmail.com>
Date: Sun, 3 May 2026 15:20:25 -0400
Subject: [PATCH] =?UTF-8?q?feat(hailo):=20P4=20=E2=80=94=20HailoEmbedder?=
 =?UTF-8?q?=20routes=20HEF=20>=20cpu-fallback=20(iter=20162)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ADR-176 P4. HailoEmbedder::open now picks the best available
inference path:

  1. NPU HEF       (hailo + cpu-fallback features ON,
                    model.hef + safetensors trio present in dir)
  2. cpu-fallback  (cpu-fallback feature ON, safetensors only)
  3. NoModelLoaded (worker still serves health probes)
  4. FeatureDisabled (no relevant features built in)

embed() dispatches in the same order; has_model() returns true if
either HEF or cpu-fallback is loaded. The dimensions() value comes
from the HEF output shape when available, then cpu-fallback's BERT
config, then the MINI_LM_DIM constant.

cpu-fallback only loads if HEF didn't (avoids a duplicate 90 MB
safetensors mmap when both candidates could). The cluster's
iter-143 fingerprint already keys off the artifacts present, so
HEF-equipped workers and cpu-fallback workers automatically end up
in distinct fleet groups (their vectors differ slightly due to INT8
quantization vs FP32, so mixing would break dispatch invariants).

All 4 feature combos clippy-clean (-D warnings):
  default                       ✓
  --features cpu-fallback        ✓
  --features hailo               ✓
  --features hailo,cpu-fallback  ✓

ruvector-hailo: 15 lib tests pass (was 14, +host_embeddings test).
ruvector-hailo-cluster: 99 tests pass, worker builds clean.

Iter 163 next: deploy iter-162 worker to Pi 5 + drop the iter-156b
HEF into /var/lib/ruvector-hailo/models/all-minilm-l6-v2/, restart
systemd, verify startup self-test fires through the HEF path,
benchmark vs cpu-fallback (target ≥5x throughput per ADR-176
acceptance criteria).

Co-Authored-By: claude-flow <ruv@ruv.net>
---
 crates/ruvector-hailo/src/lib.rs | 106 +++++++++++++++++++++++--------
 1 file changed, 79 insertions(+), 27 deletions(-)

diff --git a/crates/ruvector-hailo/src/lib.rs b/crates/ruvector-hailo/src/lib.rs
index 693828578..43c806339 100644
--- a/crates/ruvector-hailo/src/lib.rs
+++ b/crates/ruvector-hailo/src/lib.rs
@@ -77,6 +77,13 @@ pub struct HailoEmbedder {
     /// `--features cpu-fallback`.
     #[cfg(feature = "cpu-fallback")]
     cpu_fallback: Option<crate::cpu_embedder::CpuEmbedder>,
+    /// Iter 162 (ADR-176 P4) — NPU acceleration via the iter-156b HEF
+    /// plus iter-160 host-side embeddings plus iter-161 end-to-end
+    /// pipeline. `Some(_)` when both `model.hef` and the safetensors
+    /// trio are present in `model_dir`. Takes precedence over
+    /// `cpu_fallback` in `embed()` dispatch.
+    #[cfg(all(feature = "hailo", feature = "cpu-fallback"))]
+    hef_embedder: Option<crate::hef_embedder::HefEmbedder>,
 }
 
 impl HailoEmbedder {
@@ -128,25 +135,65 @@ impl HailoEmbedder {
         #[cfg(all(not(feature = "hailo"), feature = "cpu-fallback"))]
         let device_id = "cpu-fallback:no-hailo-feature".to_string();
 
-        // Iter 133 path-C: load CPU fallback when the feature is on
-        // and the model dir has the HF safetensors trio. When there's
-        // no HEF (always true today — model surgery pending) the CPU
-        // fallback is the sole inference path.
+        // Iter 162 (ADR-176 P4) — open priority:
+        //   1. HEF + safetensors trio (NPU acceleration)
+        //   2. safetensors trio only (cpu-fallback)
+        //   3. neither (NoModelLoaded — health probe still serves)
+        //
+        // HEF requires both `hailo` (for HefPipeline) and `cpu-fallback`
+        // (for HostEmbeddings + tokenizer). When the feature lattice
+        // doesn't enable both, we fall straight through to cpu-fallback
+        // (or no model).
+        // Both paths are only consulted under `feature = "cpu-fallback"`
+        // (HEF requires it for HostEmbeddings, cpu-fallback obviously);
+        // gate to silence unused-var warnings on `--features hailo` alone.
         #[cfg(feature = "cpu-fallback")]
-        let cpu_fallback = {
-            let safetensors = model_dir.join("model.safetensors");
-            let hef_path = model_dir.join("model.hef");
-            if !hef_path.exists() && safetensors.exists() {
-                Some(crate::cpu_embedder::CpuEmbedder::open(model_dir)?)
+        let hef_path = model_dir.join("model.hef");
+        #[cfg(feature = "cpu-fallback")]
+        let safetensors = model_dir.join("model.safetensors");
+
+        #[cfg(all(feature = "hailo", feature = "cpu-fallback"))]
+        let hef_embedder = {
+            if hef_path.exists() && safetensors.exists() {
+                if let Some(dev) = device_opt.as_ref() {
+                    Some(crate::hef_embedder::HefEmbedder::open(dev, model_dir)?)
+                } else {
+                    None
+                }
             } else {
                 None
             }
         };
 
-        // Dimension comes from the CPU fallback's BERT config when
-        // available, otherwise the MINI_LM constant. Future HEF path
-        // reads it from the network group's output shape.
-        #[cfg(feature = "cpu-fallback")]
+        // cpu-fallback: load only if HEF wasn't loaded (avoid duplicate
+        // 90 MB safetensors mmap when both could load).
+        #[cfg(all(feature = "hailo", feature = "cpu-fallback"))]
+        let cpu_fallback = if hef_embedder.is_some() {
+            None
+        } else if !hef_path.exists() && safetensors.exists() {
+            Some(crate::cpu_embedder::CpuEmbedder::open(model_dir)?)
+        } else {
+            None
+        };
+
+        #[cfg(all(not(feature = "hailo"), feature = "cpu-fallback"))]
+        let cpu_fallback = if !hef_path.exists() && safetensors.exists() {
+            Some(crate::cpu_embedder::CpuEmbedder::open(model_dir)?)
+        } else {
+            None
+        };
+
+        // Dimension priority: HEF output dim > cpu-fallback BERT config
+        // > MINI_LM_DIM constant. The HEF was compiled for hidden_size
+        // 384 in iter-156b; this gate makes any future HEF with a
+        // different hidden_size automatically picked up.
+        #[cfg(all(feature = "hailo", feature = "cpu-fallback"))]
+        let dimensions = hef_embedder
+            .as_ref()
+            .map(|h| h.output_dim())
+            .or_else(|| cpu_fallback.as_ref().map(|c| c.output_dim()))
+            .unwrap_or(crate::inference::MINI_LM_DIM);
+        #[cfg(all(not(feature = "hailo"), feature = "cpu-fallback"))]
         let dimensions = cpu_fallback
             .as_ref()
             .map(|c| c.output_dim())
@@ -168,6 +215,8 @@ impl HailoEmbedder {
             device: device_opt.map(Mutex::new),
             #[cfg(feature = "cpu-fallback")]
             cpu_fallback,
+            #[cfg(all(feature = "hailo", feature = "cpu-fallback"))]
+            hef_embedder,
         })
     }
 
@@ -220,11 +269,16 @@ impl HailoEmbedder {
     /// the ModelLoaded gate trips and `embed` starts dispatching to
     /// the NPU's vstream API.
     pub fn embed(&self, text: &str) -> Result<Vec<f32>> {
-        // Iter 137: dispatch order:
-        //   1. CPU fallback if loaded (real semantic vectors today)
-        //   2. NPU HEF inference (only path that exercises the device,
-        //      currently NoModelLoaded — pending HEF model surgery)
-        //   3. FeatureDisabled if neither feature is built in
+        // Iter 162 (ADR-176 P4): dispatch order:
+        //   1. NPU HEF (real NPU acceleration, ~73 FPS encoder)
+        //   2. CPU fallback (host CPU BERT-6, ~7 FPS / Pi worker)
+        //   3. NoModelLoaded — health probes still serve
+        //   4. FeatureDisabled if neither feature is built in
+        #[cfg(all(feature = "hailo", feature = "cpu-fallback"))]
+        if let Some(hef) = &self.hef_embedder {
+            return hef.embed(text);
+        }
+
         #[cfg(feature = "cpu-fallback")]
         if let Some(cpu) = &self.cpu_fallback {
             return cpu.embed(text);
@@ -233,12 +287,6 @@ impl HailoEmbedder {
         #[cfg(feature = "hailo")]
         {
             let _ = text;
-            // Hold the device lock briefly — preserves the contract
-            // that the real HEF-based inference path needs
-            // single-writer access to the vstream descriptors.
-            if let Some(dev) = &self.device {
-                let _guard = dev.lock().unwrap_or_else(|p| p.into_inner());
-            }
             return Err(HailoError::NoModelLoaded);
         }
 
@@ -278,9 +326,13 @@ impl HailoEmbedder {
     /// configured into the vdevice. No callers need to change — the
     /// signal flips automatically.
     pub fn has_model(&self) -> bool {
-        // Iter 133 path-C: CPU fallback counts as a loaded model.
-        // The cluster's `validate_fleet` flow correctly marks workers
-        // ready=true when CPU fallback is wired even with no HEF.
+        // Iter 162 (ADR-176 P4): HEF + cpu-fallback both count.
+        #[cfg(all(feature = "hailo", feature = "cpu-fallback"))]
+        {
+            if self.hef_embedder.is_some() {
+                return true;
+            }
+        }
         #[cfg(feature = "cpu-fallback")]
         {
             if self.cpu_fallback.is_some() {