diff --git a/crates/ruvector-hailo-cluster/deploy/compile-encoder-hef.py b/crates/ruvector-hailo-cluster/deploy/compile-encoder-hef.py index 280898942..23662e50b 100644 --- a/crates/ruvector-hailo-cluster/deploy/compile-encoder-hef.py +++ b/crates/ruvector-hailo-cluster/deploy/compile-encoder-hef.py @@ -70,29 +70,16 @@ def main(onnx_path: str, out_hef: str) -> None: print(f"==> [parse] {onnx_path}", flush=True) runner = ClientRunner(hw_arch=HW_ARCH) - # Iter 154: explicit input formats. Without these, Hailo's allocator - # treats the rank-4 mask input as an "RGB image" and applies a - # `tf_rgb_to_hailo_rgb` format conversion that requires C aligned - # to 8. Our mask has C=1 → "output features not aligned to 8" hard - # fail at compile-time. Spelling out the dim semantics tells the - # allocator these are pure feature tensors, not images. - from hailo_sdk_client.exposed_definitions import Dims + # Iter 156 — single-input form to avoid the iter-154 RGB conversion + # blocker on the rank-4 mask. Encoder runs full attention; host-side + # mean-pool applies the real attention mask post-NPU. runner.translate_onnx_model( str(onnx_path), net_name=NET_NAME, - start_node_names=["hidden_states", "attention_softmax_mask"], + start_node_names=["hidden_states"], end_node_names=["last_hidden_state"], net_input_shapes={ "hidden_states": [1, SEQ_LEN, HIDDEN], - "attention_softmax_mask": [1, 1, 1, SEQ_LEN], - }, - net_input_format={ - # rank-3 hidden_states: NWC (Hailo default for rank-3) - "hidden_states": [Dims.BATCH, Dims.WIDTH, Dims.CHANNELS], - # rank-4 mask: NCHW with C=1 — explicitly mark as feature - # tensor (not RGB image) so the allocator skips the - # rgb-to-rgb format conversion. - "attention_softmax_mask": [Dims.BATCH, Dims.CHANNELS, Dims.HEIGHT, Dims.WIDTH], }, ) @@ -120,6 +107,9 @@ def main(onnx_path: str, out_hef: str) -> None: # inside a spawned subprocess that doesn't carry the SDK's custom # layer registry. 
Disabling multiproc keeps the optimizer in-process # so the @register_keras_serializable decorations stay loaded. + # Iter 156 — single-input form. Drop iter-155 mask input_conversion + # (no longer needed, no mask input). Keep the rest of Hailo's BERT + # alls recipe + iter-153 multiproc disable. bert_alls = """\ model_optimization_config(calibration, batch_size=8, calibset_size=64) model_optimization_config(globals, multiproc_policy=disabled) diff --git a/crates/ruvector-hailo-cluster/deploy/export-minilm-encoder-onnx.py b/crates/ruvector-hailo-cluster/deploy/export-minilm-encoder-onnx.py index e87a8161b..370123d37 100644 --- a/crates/ruvector-hailo-cluster/deploy/export-minilm-encoder-onnx.py +++ b/crates/ruvector-hailo-cluster/deploy/export-minilm-encoder-onnx.py @@ -38,33 +38,32 @@ HIDDEN = 384 class EncoderOnly(torch.nn.Module): - """Wraps BertEncoder taking hidden_states + softmax mask as inputs. + """Wraps BertEncoder taking only hidden_states (no mask input). - Iter 144 — adopts Hailo Model Zoo's official BERT pattern (see - cfg/networks/bert_base_uncased.yaml). They split the network at - /embeddings/Add_1 (post-embedding hidden states) AND the mask - broadcast intermediate, then use `set_input_mask_to_softmax()` in - the alls script to tell the SDK how to fold the mask into each - softmax. This bypasses the iter-139/142 SDK chain - (Where → KeyError → ElementwiseAddDirectOp deserialize) by going - through the SDK's well-tested transformer codepath. + Iter 156 — single-input form. The dual-input form from iter 144 + fails at the allocator stage with + `tf_rgb_to_hailo_rgb format conversion ... features not aligned to 8` + on the rank-4 mask input (C=1, can't be aligned to 8). Hailo's + `input_conversion` script command only accepts image-color + conversions, not feature passthrough. 
Iter 153 fixed the original + blocker (Keras deserialize on ElementwiseAddDirectOp) so the + single-input form — which iter 144b tried before iter 153 — + should now compile cleanly. - Inputs: - hidden_states [batch, seq, hidden] float32 — host-computed embeddings - attention_softmax_mask [batch, 1, 1, seq] float32 — additive bias 0/-10000 - - The attention_softmax_mask is what gets added to the QK^T scores - pre-softmax in standard self-attention. Host computes it from the - [batch, seq] padding mask once, broadcasts to 4D, sends as input.""" + Trade-off: encoder runs full attention with no padding mask. The + worker pads input to seq=128 with [PAD] tokens, so shorter inputs + just produce meaningless values at PAD positions; the post-NPU + host-side mean-pool applies the real attention mask, zeroing out + those PAD-position contributions. Real tokens still attend to [PAD] keys, so parity with the masked model should be verified.""" def __init__(self, model): super().__init__() self.encoder = model.encoder - def forward(self, hidden_states, attention_softmax_mask): + def forward(self, hidden_states): out = self.encoder( hidden_states=hidden_states, - attention_mask=attention_softmax_mask, + attention_mask=None, return_dict=True, ) return out.last_hidden_state @@ -81,14 +80,13 @@ def main(out_dir: str) -> None: print(f"==> dummy inputs (batch=1, seq={SEQ_LEN}, hidden={HIDDEN})", flush=True) hidden_states = torch.randn(1, SEQ_LEN, HIDDEN) - attention_softmax_mask = torch.zeros(1, 1, 1, SEQ_LEN) print(f"==> torch.onnx.export → {onnx_path}", flush=True) torch.onnx.export( encoder_only, - (hidden_states, attention_softmax_mask), + (hidden_states,), str(onnx_path), - input_names=["hidden_states", "attention_softmax_mask"], + input_names=["hidden_states"], output_names=["last_hidden_state"], opset_version=OPSET, do_constant_folding=True,