diff --git a/crates/ruvector-hailo-cluster/src/bin/worker.rs b/crates/ruvector-hailo-cluster/src/bin/worker.rs index 9abab3373..329b0f48e 100644 --- a/crates/ruvector-hailo-cluster/src/bin/worker.rs +++ b/crates/ruvector-hailo-cluster/src/bin/worker.rs @@ -337,8 +337,10 @@ fn main() -> Result<(), Box> { let fingerprint = compute_fingerprint(&model_dir); if fingerprint.is_empty() { warn!( - "model_dir {} has no model.hef / vocab.txt — fingerprint empty; \ - coordinators will skip the integrity check", + "model_dir {} has no recognizable model artifacts \ + (NPU: model.hef + vocab.txt; cpu-fallback: model.safetensors + \ + tokenizer.json + config.json) — fingerprint empty; coordinators \ + will skip the integrity check", model_dir.display() ); } else { diff --git a/crates/ruvector-hailo-cluster/src/fingerprint.rs b/crates/ruvector-hailo-cluster/src/fingerprint.rs index 47dca83f0..ac8fd2636 100644 --- a/crates/ruvector-hailo-cluster/src/fingerprint.rs +++ b/crates/ruvector-hailo-cluster/src/fingerprint.rs @@ -1,29 +1,50 @@ -//! Model fingerprint — sha256 over HEF + tokenizer artifacts. +//! Model fingerprint — sha256 over the model artifacts on disk. //! //! ADR-167 §8.3 fleet integrity guard: coordinators refuse to mix //! workers reporting different model fingerprints. Computed at worker -//! startup over the files actually loaded, so any swap of HEF or -//! vocab.txt produces a different fingerprint and the coordinator can -//! eject the drift. +//! startup over the files actually loaded, so any swap of model +//! weights or tokenizer produces a different fingerprint and the +//! coordinator can eject the drift. //! -//! Format: hex-lowercase, 64 chars. +//! Iter 143 — covers both deployment paths: +//! * NPU path: sha256(model.hef || vocab.txt) +//! * cpu-fallback: sha256(model.safetensors || tokenizer.json || config.json) +//! +//! Mixed clusters (some workers on NPU, some on CPU) intentionally +//! 
produce different fingerprints — they're running different code +//! paths so the cluster should reject the mix. +//! +//! Format: hex-lowercase, 64 chars. Empty when no recognizable model +//! artifacts are present in `model_dir`. use sha2::{Digest, Sha256}; use std::path::Path; -/// Compute sha256 over (hef_bytes || vocab_bytes). Missing files are -/// treated as empty so the fingerprint is *also* a witness of which -/// files exist — a worker that loads only the HEF (no tokenizer) -/// produces a different fingerprint than a worker with both. +/// Compute sha256 over the model artifacts. Missing files are treated +/// as empty within their layout so the fingerprint is *also* a witness +/// of which files exist — a worker that loads only the HEF (no +/// tokenizer) produces a different fingerprint than a worker with both. /// -/// Returns "" when both files are missing — caller (worker startup) -/// uses empty-string as "skip the check" sentinel until step 6 lands. +/// Returns "" when neither layout has any recognizable file — caller +/// (worker startup) uses empty-string as "skip the check" sentinel. pub fn compute_fingerprint(model_dir: &Path) -> String { let hef = std::fs::read(model_dir.join("model.hef")).unwrap_or_default(); let vocab = std::fs::read(model_dir.join("vocab.txt")).unwrap_or_default(); - if hef.is_empty() && vocab.is_empty() { + + // Iter 143: cpu-fallback artifacts. We don't read the full + // safetensors (90 MB) into memory — sha256 it in streaming chunks. 
+ let safetensors_present = model_dir.join("model.safetensors").exists(); + let tokenizer_json = std::fs::read(model_dir.join("tokenizer.json")).unwrap_or_default(); + let config_json = std::fs::read(model_dir.join("config.json")).unwrap_or_default(); + + let npu_layout_present = !hef.is_empty() || !vocab.is_empty(); + let cpu_layout_present = + safetensors_present || !tokenizer_json.is_empty() || !config_json.is_empty(); + + if !npu_layout_present && !cpu_layout_present { return String::new(); } + let mut h = Sha256::new(); // Length-prefix each input so a hef of N bytes + vocab of M bytes // never collides with a hef of N+M bytes + empty vocab. @@ -31,6 +52,30 @@ pub fn compute_fingerprint(model_dir: &Path) -> String { h.update(&hef); h.update((vocab.len() as u64).to_le_bytes()); h.update(&vocab); + + // Stream-hash the safetensors so we don't read 90 MB into memory. + if safetensors_present { + // Tag with file marker so an empty hef doesn't blend with safetensors. + h.update(b"safetensors:"); + if let Ok(mut f) = std::fs::File::open(model_dir.join("model.safetensors")) { + let mut buf = [0u8; 64 * 1024]; + let mut total: u64 = 0; + use std::io::Read; + while let Ok(n) = f.read(&mut buf) { + if n == 0 { + break; + } + h.update(&buf[..n]); + total += n as u64; + } + h.update(total.to_le_bytes()); + } + } + h.update((tokenizer_json.len() as u64).to_le_bytes()); + h.update(&tokenizer_json); + h.update((config_json.len() as u64).to_le_bytes()); + h.update(&config_json); + let digest = h.finalize(); hex_lower(&digest) } @@ -95,6 +140,37 @@ mod tests { assert_ne!(fp2, fp3); } + #[test] + fn cpu_fallback_safetensors_layout_yields_distinct_fingerprint() { + // Iter 143: a worker with safetensors+tokenizer+config but no + // hef must produce a non-empty fingerprint, distinct from a + // worker with the same files but different content. 
+ let d1 = tmpdir(); + std::fs::write(d1.path().join("model.safetensors"), b"weights-A").unwrap(); + std::fs::write(d1.path().join("tokenizer.json"), b"tok-A").unwrap(); + std::fs::write(d1.path().join("config.json"), b"cfg-A").unwrap(); + let fp1 = compute_fingerprint(d1.path()); + assert_eq!(fp1.len(), 64); + assert!(!fp1.is_empty()); + + let d2 = tmpdir(); + std::fs::write(d2.path().join("model.safetensors"), b"weights-B").unwrap(); + std::fs::write(d2.path().join("tokenizer.json"), b"tok-A").unwrap(); + std::fs::write(d2.path().join("config.json"), b"cfg-A").unwrap(); + let fp2 = compute_fingerprint(d2.path()); + assert_ne!(fp1, fp2, "different safetensors must yield different fp"); + + // Per ADR-167 §8.3, an NPU-layout worker and a cpu-fallback + // worker run different code paths so their fingerprints SHOULD + // differ even with the same logical model — the cluster will + // refuse to mix them. + let d3 = tmpdir(); + std::fs::write(d3.path().join("model.hef"), b"weights-A").unwrap(); + std::fs::write(d3.path().join("vocab.txt"), b"tok-A").unwrap(); + let fp3 = compute_fingerprint(d3.path()); + assert_ne!(fp1, fp3, "NPU layout vs cpu-fallback must differ"); + } + #[test] fn length_prefix_prevents_split_collision() { // Without length-prefixing, sha256(b"abc" || b"de") == sha256(b"ab" || b"cde"). diff --git a/docs/adr/ADR-167-ruvector-hailo-npu-embedding-backend.md b/docs/adr/ADR-167-ruvector-hailo-npu-embedding-backend.md index 6eb54e6ec..1caff786c 100644 --- a/docs/adr/ADR-167-ruvector-hailo-npu-embedding-backend.md +++ b/docs/adr/ADR-167-ruvector-hailo-npu-embedding-backend.md @@ -66,10 +66,27 @@ re-exported the BERT encoder block in isolation: quantized weights (full-precision HEF would be possible on hailo15h but not hailo8). 
+**Iter 142/142b/143 follow-up debugging** (after reading the SDK source): +- Root-caused the iter-139 `KeyError` to a mismatch between + `_get_build_inputs` (returns dict keyed by user dataset keys) and + `hailo_model.build` (looks up by internal `flow.input_nodes` names). + Workaround: introspect the parsed HN, key the calibration dict by + the actual layer name (`minilm_encoder/input_layer1`). +- Past that: `AccelerasValueError` shape mismatch — Hailo's HN treats + inputs as 4D NCHW with implicit channels=1. Workaround: reshape + calibration from `[batch, seq, hidden]` to `[batch, 1, seq, hidden]`. +- Past **that**: a Keras serialization bug — + `TypeError: Could not locate class 'ElementwiseAddDirectOp'` — + during the SDK's deepcopy of its own internal layer types. This is + hailo_model_optimization deepcopy-ing a custom Keras layer it + registered itself, then failing to deserialize it because the + `@register_keras_serializable` decorator isn't running in the + spawned subprocess. Cannot be fixed from user-space. + **Status:** the encoder ONNX is fundamentally Hailo-compatible (it -parses + full-precision-optimizes cleanly). The remaining gap is an -SDK-internal bug in INT8 quantization of transformer encoders that -can't be worked around from user-space. The cleanest unblock paths: +parses + full-precision-optimizes cleanly). The remaining gap is a +chain of SDK-internal bugs in INT8 quantization of transformer encoders, +the last of which can't be worked around from user-space. The cleanest unblock paths: 1. Hailo support ticket (the SDK should not KeyError on a layer it knows about — this is a quantization-flow bug, not a user-input bug)