diff --git a/crates/ruvector-hailo-cluster/deploy/compile-encoder-hef.py b/crates/ruvector-hailo-cluster/deploy/compile-encoder-hef.py
index 280898942..23662e50b 100644
--- a/crates/ruvector-hailo-cluster/deploy/compile-encoder-hef.py
+++ b/crates/ruvector-hailo-cluster/deploy/compile-encoder-hef.py
@@ -70,29 +70,16 @@ def main(onnx_path: str, out_hef: str) -> None:
 
     print(f"==> [parse] {onnx_path}", flush=True)
     runner = ClientRunner(hw_arch=HW_ARCH)
-    # Iter 154: explicit input formats. Without these, Hailo's allocator
-    # treats the rank-4 mask input as an "RGB image" and applies a
-    # `tf_rgb_to_hailo_rgb` format conversion that requires C aligned
-    # to 8. Our mask has C=1 → "output features not aligned to 8" hard
-    # fail at compile-time. Spelling out the dim semantics tells the
-    # allocator these are pure feature tensors, not images.
-    from hailo_sdk_client.exposed_definitions import Dims
+    # Iter 156 — single-input form to avoid the iter-154 RGB conversion
+    # blocker on the rank-4 mask. Encoder runs full attention; host-side
+    # mean-pool applies the real attention mask post-NPU.
     runner.translate_onnx_model(
         str(onnx_path),
         net_name=NET_NAME,
-        start_node_names=["hidden_states", "attention_softmax_mask"],
+        start_node_names=["hidden_states"],
         end_node_names=["last_hidden_state"],
         net_input_shapes={
             "hidden_states": [1, SEQ_LEN, HIDDEN],
-            "attention_softmax_mask": [1, 1, 1, SEQ_LEN],
-        },
-        net_input_format={
-            # rank-3 hidden_states: NWC (Hailo default for rank-3)
-            "hidden_states": [Dims.BATCH, Dims.WIDTH, Dims.CHANNELS],
-            # rank-4 mask: NCHW with C=1 — explicitly mark as feature
-            # tensor (not RGB image) so the allocator skips the
-            # rgb-to-rgb format conversion.
-            "attention_softmax_mask": [Dims.BATCH, Dims.CHANNELS, Dims.HEIGHT, Dims.WIDTH],
         },
     )
 
@@ -120,6 +107,9 @@ def main(onnx_path: str, out_hef: str) -> None:
     # inside a spawned subprocess that doesn't carry the SDK's custom
     # layer registry. Disabling multiproc keeps the optimizer in-process
     # so the @register_keras_serializable decorations stay loaded.
+    # Iter 156 — single-input form. Drop iter-155 mask input_conversion
+    # (no longer needed, no mask input). Keep the rest of Hailo's BERT
+    # alls recipe + iter-153 multiproc disable.
     bert_alls = """\
 model_optimization_config(calibration, batch_size=8, calibset_size=64)
 model_optimization_config(globals, multiproc_policy=disabled)
diff --git a/crates/ruvector-hailo-cluster/deploy/export-minilm-encoder-onnx.py b/crates/ruvector-hailo-cluster/deploy/export-minilm-encoder-onnx.py
index e87a8161b..370123d37 100644
--- a/crates/ruvector-hailo-cluster/deploy/export-minilm-encoder-onnx.py
+++ b/crates/ruvector-hailo-cluster/deploy/export-minilm-encoder-onnx.py
@@ -38,33 +38,32 @@ HIDDEN = 384
 
 
 class EncoderOnly(torch.nn.Module):
-    """Wraps BertEncoder taking hidden_states + softmax mask as inputs.
+    """Wraps BertEncoder taking only hidden_states (no mask input).
 
-    Iter 144 — adopts Hailo Model Zoo's official BERT pattern (see
-    cfg/networks/bert_base_uncased.yaml). They split the network at
-    /embeddings/Add_1 (post-embedding hidden states) AND the mask
-    broadcast intermediate, then use `set_input_mask_to_softmax()` in
-    the alls script to tell the SDK how to fold the mask into each
-    softmax. This bypasses the iter-139/142 SDK chain
-    (Where → KeyError → ElementwiseAddDirectOp deserialize) by going
-    through the SDK's well-tested transformer codepath.
+    Iter 156 — single-input form. The dual-input form from iter 144
+    fails at the allocator stage with
+    `tf_rgb_to_hailo_rgb format conversion ... features not aligned to 8`
+    on the rank-4 mask input (C=1, can't be aligned to 8). Hailo's
+    `input_conversion` script command only accepts image-color
+    conversions, not feature passthrough. Iter 153 fixed the original
+    blocker (Keras deserialize on ElementwiseAddDirectOp) so the
+    single-input form — which iter 144b tried before iter 153 —
+    should now compile cleanly.
 
-    Inputs:
-      hidden_states           [batch, seq, hidden]  float32 — host-computed embeddings
-      attention_softmax_mask  [batch, 1, 1, seq]    float32 — additive bias 0/-10000
-
-    The attention_softmax_mask is what gets added to the QK^T scores
-    pre-softmax in standard self-attention. Host computes it from the
-    [batch, seq] padding mask once, broadcasts to 4D, sends as input."""
+    Trade-off: the encoder runs full attention with no padding mask. The
+    worker pads input to seq=128 with [PAD] tokens; the post-NPU host-side
+    mean-pool applies the real attention mask, dropping the PAD-position
+    outputs. Real tokens still attend to [PAD] positions, however, so
+    embeddings of short inputs can differ from the masked encoder's."""
 
     def __init__(self, model):
         super().__init__()
         self.encoder = model.encoder
 
-    def forward(self, hidden_states, attention_softmax_mask):
+    def forward(self, hidden_states):
         out = self.encoder(
             hidden_states=hidden_states,
-            attention_mask=attention_softmax_mask,
+            attention_mask=None,
             return_dict=True,
         )
         return out.last_hidden_state
@@ -81,14 +80,13 @@ def main(out_dir: str) -> None:
 
     print(f"==> dummy inputs (batch=1, seq={SEQ_LEN}, hidden={HIDDEN})", flush=True)
     hidden_states = torch.randn(1, SEQ_LEN, HIDDEN)
-    attention_softmax_mask = torch.zeros(1, 1, 1, SEQ_LEN)
 
     print(f"==> torch.onnx.export → {onnx_path}", flush=True)
     torch.onnx.export(
         encoder_only,
-        (hidden_states, attention_softmax_mask),
+        (hidden_states,),
         str(onnx_path),
-        input_names=["hidden_states", "attention_softmax_mask"],
+        input_names=["hidden_states"],
         output_names=["last_hidden_state"],
         opset_version=OPSET,
         do_constant_folding=True,