diff --git a/crates/ruvector-hailo-cluster/src/bin/worker.rs b/crates/ruvector-hailo-cluster/src/bin/worker.rs index 97cf426aa..9abab3373 100644 --- a/crates/ruvector-hailo-cluster/src/bin/worker.rs +++ b/crates/ruvector-hailo-cluster/src/bin/worker.rs @@ -195,13 +195,13 @@ impl Embedding for WorkerService { version: self.version.clone(), device_id: self.device_id.clone(), model_fingerprint: self.fingerprint.clone(), - // Worker is "ready" iff embedder.dimensions() returned a real - // dim. Iter 87+: open() pre-declares MINI_LM_DIM = 384 so the - // worker reports ready=true and the coordinator dispatches - // even before the .hef lands (FNV-1a placeholder vectors). - // When HEF wiring lands the dim will come from the loaded - // network group's output shape instead. - ready: self.embedder.dimensions() > 0, + // Iter 130: ready iff a real model graph is loaded. The + // dimension pre-declaration (384) is no longer enough — + // it lied while the placeholder embed path was active. + // Now `has_model()` returns false until HEF support lands, + // so coordinators correctly see model-less workers as + // not-ready and skip them in validate_fleet / dispatch. + ready: self.embedder.dimensions() > 0 && self.embedder.has_model(), // Iter-96 (ADR-174 §93): live NPU temperature read on every // health probe. 0.0 if read fails (older firmware variants // don't expose the opcode); coordinator side maps 0.0 → None. diff --git a/crates/ruvector-hailo/src/error.rs b/crates/ruvector-hailo/src/error.rs index 7a4e33f78..bb4bb7cef 100644 --- a/crates/ruvector-hailo/src/error.rs +++ b/crates/ruvector-hailo/src/error.rs @@ -47,4 +47,22 @@ pub enum HailoError { /// Output vector shape didn't match the configured `dim`. 
#[error("output shape mismatch: expected {expected}, got {actual}")] Shape { expected: usize, actual: usize }, + + /// `HailoEmbedder::open` succeeded (vdevice is alive) but no + /// HEF / model graph has been loaded into it yet — the worker + /// can't perform inference. Iter 130: replaces the previous + /// "FNV-1a content-hash placeholder" path with an honest error + /// so the cluster surfaces "no model" instead of pretending to + /// embed. + /// + /// Resolution: none on the operator side yet — with the `hailo` + /// feature `embed` returns this error unconditionally and + /// `HailoEmbedder::has_model` is hard-coded to `false`. The `model.hef` + /// step named in the message (Hailo Dataflow Compiler output for + /// `sentence-transformers/all-MiniLM-L6-v2.onnx`) only takes effect once HEF loading is wired in. + #[error( + "no Hailo model graph loaded — drop a compiled `model.hef` into \ + the worker's model dir and restart" + )] + NoModelLoaded, } diff --git a/crates/ruvector-hailo/src/lib.rs b/crates/ruvector-hailo/src/lib.rs index b00128ae0..e963e848a 100644 --- a/crates/ruvector-hailo/src/lib.rs +++ b/crates/ruvector-hailo/src/lib.rs @@ -127,17 +127,31 @@ impl HailoEmbedder { /// Embed a single piece of text into a `dimensions()`-element f32 vector. /// - /// **Current implementation (iter 88, "no-stubs" pass):** content-derived - /// deterministic 384-d vector. Same input → same output, dimension matches - /// declared `dimensions`, vector is L2-normalised. NOT a real semantic - /// embedding (that lands when the .hef binary loads the actual MiniLM - /// weights into the NPU) — but the API contract is real, the path is - /// real, and the cluster integration is fully exercisable end-to-end. + /// With the `hailo` feature this currently always returns `HailoError::NoModelLoaded`. /// - /// The hashing scheme: bin every UTF-8 byte of the text into one of the - /// `dim` output positions via a multiplicative hash, accumulate counts, - /// then L2-normalise. Trivially differentiates inputs while staying - /// dependency-free and FPU-cheap. 
+ /// **Iter 130 — placeholder removed.** Previous iters returned an + /// FNV-1a content-hash vector ("real path, fake math") so the + /// dispatch chain could be exercised end-to-end before the HEF + /// compile pipeline landed. That was misleading — operators saw + /// vectors come back and reasonably assumed they were embeddings. + /// Now `embed` returns `HailoError::NoModelLoaded` until a real + /// model graph is wired in, so the cluster's failure mode honestly + /// reflects "no inference happening." + /// + /// **What still works without a model:** open / dimensions / device + /// id / chip_temperature / the entire gRPC stack. The worker boots + /// and reports ready=false because `has_model()` is false. The + /// iter-87 pre-declared dimension (384) is kept, so health probes + /// still succeed and the operator-side `--validate-fleet` flow can + /// detect "model missing" via the ready flag or a clean embed + /// failure rather than a connection-refused. + /// + /// **To make `embed` work end-to-end:** see the iter-130 commit + /// message and ADR-167's "What's still unimplemented" section. + /// A compiled `model.hef` in the worker's model dir is the intended + /// input, but it is not yet sufficient: HEF loading still has to be + /// wired in, at which point `has_model()` returns true and `embed` + /// can dispatch to the NPU's vstream API. pub fn embed(&self, text: &str) -> Result<Vec<f32>> { #[cfg(not(feature = "hailo"))] { @@ -146,40 +160,12 @@ impl HailoEmbedder { } #[cfg(feature = "hailo")] { - // Hold the lock for the duration of one embed — preserves the - // contract that future HEF-based inference will need single- - // writer access to the vstream descriptors. 
- // Hold the device lock — preserves the contract that future - // HEF-based inference will need single-writer access to the - // vstream descriptors (currently the placeholder hash path - // doesn't strictly need it but the lock acquisition is - // cheap and keeps the API contract stable across the swap). + let _ = text; + // Hold the device lock briefly — preserves the contract + // that the real HEF-based inference path needs + // single-writer access to the vstream descriptors. let _guard = self.device.lock().unwrap_or_else(|p| p.into_inner()); - - let dim = self.dimensions.max(1); - let mut v = vec![0.0_f32; dim]; - - // FNV-1a hash, walked byte-by-byte. Each byte contributes - // (hash % dim) → +1 to that bin. Cheap, deterministic, well- - // distributed enough for a placeholder. - let mut hash: u64 = 0xcbf2_9ce4_8422_2325; - for &b in text.as_bytes() { - hash ^= b as u64; - hash = hash.wrapping_mul(0x100_0000_01b3); - let bin = (hash as usize) % dim; - v[bin] += 1.0; - } - - // L2-normalise so consumers see a unit vector, matching what - // a real all-MiniLM-L6-v2 NPU output would produce. - let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt(); - if norm > 0.0 { - for x in &mut v { - *x /= norm; - } - } - - Ok(v) + Err(HailoError::NoModelLoaded) } } @@ -199,6 +185,22 @@ impl HailoEmbedder { self.dimensions } + /// Iter 130: honest "is a model graph actually loaded?" gate. + /// Returns `true` only when `embed()` would do real NPU inference. + /// Today this is **always false** — HEF loading isn't wired in yet + /// (the Hailo Dataflow Compiler step that produces `model.hef` is a + /// vendor-tool blocker outside this repo). The worker's `health()` + /// uses this to set the `ready` flag so the cluster's + /// `validate_fleet` correctly identifies model-less workers as + /// not-ready instead of false-healthy. + /// + /// When HEF support lands, this becomes `true` once a graph is + /// configured into the vdevice. 
No callers need to change — the + /// signal flips automatically. + pub fn has_model(&self) -> bool { + false + } + /// Human-readable provider name. Mirrors `EmbeddingProvider::name()`. pub fn name(&self) -> &str { &self.name diff --git a/crates/ruvector-hailo/src/tokenizer.rs b/crates/ruvector-hailo/src/tokenizer.rs index 0e92bc46a..db936694c 100644 --- a/crates/ruvector-hailo/src/tokenizer.rs +++ b/crates/ruvector-hailo/src/tokenizer.rs @@ -109,9 +109,43 @@ impl WordPieceTokenizer { max_seq: usize, pad_to_max_seq: bool, ) -> EncodedInput { + // Iter 130 fix: degenerate `max_seq` values used to produce + // outputs that violated the `len <= max_seq` invariant. The + // proptest `output_length_respects_max_seq` flushed it out + // with `max_seq=1, text=""` → `[CLS][SEP]` (length 2). Now: + // + // max_seq == 0 → empty (no room for anything) + // max_seq == 1 → just [CLS] (no room for [SEP]) + // max_seq >= 2 → [CLS] … [SEP] (the normal path) + // + // pad_to_max_seq still honoured at any size. + if max_seq == 0 { + let attention = if pad_to_max_seq { Vec::new() } else { Vec::new() }; + return EncodedInput { + input_ids: Vec::new(), + attention_mask: attention, + actual_len: 0, + }; + } + let mut ids = Vec::with_capacity(max_seq); ids.push(self.cls_id); + if max_seq == 1 { + // Only room for [CLS]. Skip body + [SEP]. 
+ let actual_len = ids.len(); + let mut attention = vec![1u32; actual_len]; + if pad_to_max_seq { + ids.resize(max_seq, self.pad_id); + attention.resize(max_seq, 0); + } + return EncodedInput { + input_ids: ids, + attention_mask: attention, + actual_len, + }; + } + for basic in basic_tokenize(text) { let pieces = self.wordpiece(&basic); for p in pieces { diff --git a/docs/adr/ADR-167-ruvector-hailo-npu-embedding-backend.md b/docs/adr/ADR-167-ruvector-hailo-npu-embedding-backend.md index 6bbb576ba..17598af2c 100644 --- a/docs/adr/ADR-167-ruvector-hailo-npu-embedding-backend.md +++ b/docs/adr/ADR-167-ruvector-hailo-npu-embedding-backend.md @@ -31,6 +31,7 @@ below for historical context. The current cumulative state: | ULID request IDs | Iter 109 — 26-char Crockford base32 | | Cache TTL exposed in stats | Iter 108 | | HEF compile pipeline (real semantic vectors) | ❌ External blocker — Hailo Dataflow Compiler is proprietary x86-host tooling, runs outside this repo | +| **Placeholder vectors removed (iter 130)** | ✅ `embed()` now returns `HailoError::NoModelLoaded` instead of FNV-1a content hashes; `health.ready` flips false via the new `HailoEmbedder::has_model()` gate so the cluster's `validate_fleet` correctly identifies model-less workers | | ADR-174 thermal subscriber Unix-socket protocol | ❌ Deferred (iter 95-97 plan never built) | | Long-running coordinator daemon | ❌ Not built — CLI bins are stateless | | Native AsyncEmbeddingTransport trait | ❌ Public API change deferred (no consumer demand yet) |