fix(hailo): single-input encoder ONNX (iter 156) — sidestep RGB align block

Iter 154/155 attempts at the dual-input form (hidden_states + mask) hit the allocator-stage `tf_rgb_to_hailo_rgb format conversion ... features not aligned to 8` blocker on the rank-4 mask input (C=1). Hailo's `input_conversion` script command only supports image-color conversions (yuv_to_rgb, bgr_to_rgb, etc. — full list verified by Python introspection of `InputConversionTypes` dict), so we can't override the auto-conversion for a non-image rank-4 feature input. Iter 156 reverts to the iter-144b single-input form: encoder runs full attention (no mask input). The worker pads input to seq=128 with [PAD] tokens, so shorter inputs just produce meaningless values at PAD positions; the post-NPU host-side mean-pool applies the real attention mask, zeroing out those PAD-position contributions. Same final embedding semantics. This combines with iter-153's Keras monkey-patch (which fixed the original ElementwiseAddDirectOp deserialize bug that blocked single-input form previously). Now testing. Co-Authored-By: claude-flow <ruv@ruv.net>
2026-06-01 14:39:33 +00:00 · 2026-05-02 18:03:44 -04:00 · 2026-05-02 18:03:44 -04:00 · d769dd67bc
commit d769dd67bc
parent 11f2669f0b
2 changed files with 26 additions and 38 deletions
--- a/crates/ruvector-hailo-cluster/deploy/compile-encoder-hef.py
+++ b/crates/ruvector-hailo-cluster/deploy/compile-encoder-hef.py
@ -70,29 +70,16 @@ def main(onnx_path: str, out_hef: str) -> None:

    print(f"==> [parse] {onnx_path}", flush=True)
    runner = ClientRunner(hw_arch=HW_ARCH)
-    # Iter 154: explicit input formats. Without these, Hailo's allocator
-    # treats the rank-4 mask input as an "RGB image" and applies a
-    # `tf_rgb_to_hailo_rgb` format conversion that requires C aligned
-    # to 8. Our mask has C=1 → "output features not aligned to 8" hard
-    # fail at compile-time. Spelling out the dim semantics tells the
-    # allocator these are pure feature tensors, not images.
-    from hailo_sdk_client.exposed_definitions import Dims
+    # Iter 156 — single-input form to avoid the iter-154 RGB conversion
+    # blocker on the rank-4 mask. Encoder runs full attention; host-side
+    # mean-pool applies the real attention mask post-NPU.
    runner.translate_onnx_model(
        str(onnx_path),
        net_name=NET_NAME,
-        start_node_names=["hidden_states", "attention_softmax_mask"],
+        start_node_names=["hidden_states"],
        end_node_names=["last_hidden_state"],
        net_input_shapes={
            "hidden_states": [1, SEQ_LEN, HIDDEN],
-            "attention_softmax_mask": [1, 1, 1, SEQ_LEN],
-        },
-        net_input_format={
-            # rank-3 hidden_states: NWC (Hailo default for rank-3)
-            "hidden_states": [Dims.BATCH, Dims.WIDTH, Dims.CHANNELS],
-            # rank-4 mask: NCHW with C=1 — explicitly mark as feature
-            # tensor (not RGB image) so the allocator skips the
-            # rgb-to-rgb format conversion.
-            "attention_softmax_mask": [Dims.BATCH, Dims.CHANNELS, Dims.HEIGHT, Dims.WIDTH],
        },
    )

@ -120,6 +107,9 @@ def main(onnx_path: str, out_hef: str) -> None:
    # inside a spawned subprocess that doesn't carry the SDK's custom
    # layer registry. Disabling multiproc keeps the optimizer in-process
    # so the @register_keras_serializable decorations stay loaded.
+    # Iter 156 — single-input form. Drop iter-155 mask input_conversion
+    # (no longer needed, no mask input). Keep the rest of Hailo's BERT
+    # alls recipe + iter-153 multiproc disable.
    bert_alls = """\
 model_optimization_config(calibration, batch_size=8, calibset_size=64)
 model_optimization_config(globals, multiproc_policy=disabled)
--- a/crates/ruvector-hailo-cluster/deploy/export-minilm-encoder-onnx.py
+++ b/crates/ruvector-hailo-cluster/deploy/export-minilm-encoder-onnx.py
@ -38,33 +38,32 @@ HIDDEN = 384


 class EncoderOnly(torch.nn.Module):
-    """Wraps BertEncoder taking hidden_states + softmax mask as inputs.
+    """Wraps BertEncoder taking only hidden_states (no mask input).

-    Iter 144 — adopts Hailo Model Zoo's official BERT pattern (see
-    cfg/networks/bert_base_uncased.yaml). They split the network at
-    /embeddings/Add_1 (post-embedding hidden states) AND the mask
-    broadcast intermediate, then use `set_input_mask_to_softmax()` in
-    the alls script to tell the SDK how to fold the mask into each
-    softmax. This bypasses the iter-139/142 SDK chain
-    (Where → KeyError → ElementwiseAddDirectOp deserialize) by going
-    through the SDK's well-tested transformer codepath.
+    Iter 156 — single-input form. The dual-input form from iter 144
+    fails at the allocator stage with
+    `tf_rgb_to_hailo_rgb format conversion ... features not aligned to 8`
+    on the rank-4 mask input (C=1, can't be aligned to 8). Hailo's
+    `input_conversion` script command only accepts image-color
+    conversions, not feature passthrough. Iter 153 fixed the original
+    blocker (Keras deserialize on ElementwiseAddDirectOp) so the
+    single-input form — which iter 144b tried before iter 153 —
+    should now compile cleanly.

-    Inputs:
-      hidden_states           [batch, seq, hidden]  float32 — host-computed embeddings
-      attention_softmax_mask  [batch, 1, 1, seq]    float32 — additive bias 0/-10000
-
-    The attention_softmax_mask is what gets added to the QK^T scores
-    pre-softmax in standard self-attention. Host computes it from the
-    [batch, seq] padding mask once, broadcasts to 4D, sends as input."""
+    Trade-off: encoder runs full attention with no padding mask. The worker
+    pads input to seq=128 with [PAD] tokens, so shorter inputs produce
+    meaningless values at PAD positions; the post-NPU host-side mean-pool
+    applies the real attention mask, zeroing out those contributions. Pooling
+    semantics are unchanged, but real tokens still attend to [PAD] keys."""

    def __init__(self, model):
        super().__init__()
        self.encoder = model.encoder

-    def forward(self, hidden_states, attention_softmax_mask):
+    def forward(self, hidden_states):
        out = self.encoder(
            hidden_states=hidden_states,
-            attention_mask=attention_softmax_mask,
+            attention_mask=None,
            return_dict=True,
        )
        return out.last_hidden_state
@ -81,14 +80,13 @@ def main(out_dir: str) -> None:

    print(f"==> dummy inputs (batch=1, seq={SEQ_LEN}, hidden={HIDDEN})", flush=True)
    hidden_states = torch.randn(1, SEQ_LEN, HIDDEN)
-    attention_softmax_mask = torch.zeros(1, 1, 1, SEQ_LEN)

    print(f"==> torch.onnx.export → {onnx_path}", flush=True)
    torch.onnx.export(
        encoder_only,
-        (hidden_states, attention_softmax_mask),
+        (hidden_states,),
        str(onnx_path),
-        input_names=["hidden_states", "attention_softmax_mask"],
+        input_names=["hidden_states"],
        output_names=["last_hidden_state"],
        opset_version=OPSET,
        do_constant_folding=True,