mirror of
https://github.com/ruvnet/RuVector.git
synced 2026-06-01 14:39:33 +00:00
fix(hailo): single-input encoder ONNX (iter 156) — sidestep RGB align block
Iter 154/155 attempts at the dual-input form (hidden_states + mask) hit the allocator-stage `tf_rgb_to_hailo_rgb format conversion ... features not aligned to 8` blocker on the rank-4 mask input (C=1). Hailo's `input_conversion` script command only supports image-color conversions (yuv_to_rgb, bgr_to_rgb, etc. — full list verified by Python introspection of `InputConversionTypes` dict), so we can't override the auto-conversion for a non-image rank-4 feature input. Iter 156 reverts to the iter-144b single-input form: encoder runs full attention (no mask input). The worker pads input to seq=128 with [PAD] tokens, so shorter inputs just produce spurious values at PAD positions; the post-NPU host-side mean-pool applies the real attention mask, zeroing out those PAD-position contributions. Same final embedding semantics at the pooling step (note that, without a mask, real tokens still attend to PAD positions inside the encoder). This combines with iter-153's Keras monkey-patch (which fixed the original ElementwiseAddDirectOp deserialize bug that blocked the single-input form previously). Now testing. Co-Authored-By: claude-flow <ruv@ruv.net>
This commit is contained in:
parent
11f2669f0b
commit
d769dd67bc
2 changed files with 26 additions and 38 deletions
|
|
@ -70,29 +70,16 @@ def main(onnx_path: str, out_hef: str) -> None:
|
|||
|
||||
print(f"==> [parse] {onnx_path}", flush=True)
|
||||
runner = ClientRunner(hw_arch=HW_ARCH)
|
||||
# Iter 154: explicit input formats. Without these, Hailo's allocator
|
||||
# treats the rank-4 mask input as an "RGB image" and applies a
|
||||
# `tf_rgb_to_hailo_rgb` format conversion that requires C aligned
|
||||
# to 8. Our mask has C=1 → "output features not aligned to 8" hard
|
||||
# fail at compile-time. Spelling out the dim semantics tells the
|
||||
# allocator these are pure feature tensors, not images.
|
||||
from hailo_sdk_client.exposed_definitions import Dims
|
||||
# Iter 156 — single-input form to avoid the iter-154 RGB conversion
|
||||
# blocker on the rank-4 mask. Encoder runs full attention; host-side
|
||||
# mean-pool applies the real attention mask post-NPU.
|
||||
runner.translate_onnx_model(
|
||||
str(onnx_path),
|
||||
net_name=NET_NAME,
|
||||
start_node_names=["hidden_states", "attention_softmax_mask"],
|
||||
start_node_names=["hidden_states"],
|
||||
end_node_names=["last_hidden_state"],
|
||||
net_input_shapes={
|
||||
"hidden_states": [1, SEQ_LEN, HIDDEN],
|
||||
"attention_softmax_mask": [1, 1, 1, SEQ_LEN],
|
||||
},
|
||||
net_input_format={
|
||||
# rank-3 hidden_states: NWC (Hailo default for rank-3)
|
||||
"hidden_states": [Dims.BATCH, Dims.WIDTH, Dims.CHANNELS],
|
||||
# rank-4 mask: NCHW with C=1 — explicitly mark as feature
|
||||
# tensor (not RGB image) so the allocator skips the
|
||||
# rgb-to-rgb format conversion.
|
||||
"attention_softmax_mask": [Dims.BATCH, Dims.CHANNELS, Dims.HEIGHT, Dims.WIDTH],
|
||||
},
|
||||
)
|
||||
|
||||
|
|
@ -120,6 +107,9 @@ def main(onnx_path: str, out_hef: str) -> None:
|
|||
# inside a spawned subprocess that doesn't carry the SDK's custom
|
||||
# layer registry. Disabling multiproc keeps the optimizer in-process
|
||||
# so the @register_keras_serializable decorations stay loaded.
|
||||
# Iter 156 — single-input form. Drop iter-155 mask input_conversion
|
||||
# (no longer needed, no mask input). Keep the rest of Hailo's BERT
|
||||
# alls recipe + iter-153 multiproc disable.
|
||||
bert_alls = """\
|
||||
model_optimization_config(calibration, batch_size=8, calibset_size=64)
|
||||
model_optimization_config(globals, multiproc_policy=disabled)
|
||||
|
|
|
|||
|
|
@ -38,33 +38,32 @@ HIDDEN = 384
|
|||
|
||||
|
||||
class EncoderOnly(torch.nn.Module):
|
||||
"""Wraps BertEncoder taking hidden_states + softmax mask as inputs.
|
||||
"""Wraps BertEncoder taking only hidden_states (no mask input).
|
||||
|
||||
Iter 144 — adopts Hailo Model Zoo's official BERT pattern (see
|
||||
cfg/networks/bert_base_uncased.yaml). They split the network at
|
||||
/embeddings/Add_1 (post-embedding hidden states) AND the mask
|
||||
broadcast intermediate, then use `set_input_mask_to_softmax()` in
|
||||
the alls script to tell the SDK how to fold the mask into each
|
||||
softmax. This bypasses the iter-139/142 SDK chain
|
||||
(Where → KeyError → ElementwiseAddDirectOp deserialize) by going
|
||||
through the SDK's well-tested transformer codepath.
|
||||
Iter 156 — single-input form. The dual-input form from iter 144
|
||||
fails at the allocator stage with
|
||||
`tf_rgb_to_hailo_rgb format conversion ... features not aligned to 8`
|
||||
on the rank-4 mask input (C=1, can't be aligned to 8). Hailo's
|
||||
`input_conversion` script command only accepts image-color
|
||||
conversions, not feature passthrough. Iter 153 fixed the original
|
||||
blocker (Keras deserialize on ElementwiseAddDirectOp) so the
|
||||
single-input form — which iter 144b tried before iter 153 —
|
||||
should now compile cleanly.
|
||||
|
||||
Inputs:
|
||||
hidden_states [batch, seq, hidden] float32 — host-computed embeddings
|
||||
attention_softmax_mask [batch, 1, 1, seq] float32 — additive bias 0/-10000
|
||||
|
||||
The attention_softmax_mask is what gets added to the QK^T scores
|
||||
pre-softmax in standard self-attention. Host computes it from the
|
||||
[batch, seq] padding mask once, broadcasts to 4D, sends as input."""
|
||||
Trade-off: encoder runs full attention with no padding mask. The
|
||||
worker pads input to seq=128 with [PAD] tokens, so shorter inputs
|
||||
just produce spurious values at PAD positions; the post-NPU
|
||||
host-side mean-pool applies the real attention mask, zeroing out
|
||||
those PAD-position contributions. Same final embedding semantics."""
|
||||
|
||||
def __init__(self, model):
|
||||
super().__init__()
|
||||
self.encoder = model.encoder
|
||||
|
||||
def forward(self, hidden_states, attention_softmax_mask):
|
||||
def forward(self, hidden_states):
|
||||
out = self.encoder(
|
||||
hidden_states=hidden_states,
|
||||
attention_mask=attention_softmax_mask,
|
||||
attention_mask=None,
|
||||
return_dict=True,
|
||||
)
|
||||
return out.last_hidden_state
|
||||
|
|
@ -81,14 +80,13 @@ def main(out_dir: str) -> None:
|
|||
|
||||
print(f"==> dummy inputs (batch=1, seq={SEQ_LEN}, hidden={HIDDEN})", flush=True)
|
||||
hidden_states = torch.randn(1, SEQ_LEN, HIDDEN)
|
||||
attention_softmax_mask = torch.zeros(1, 1, 1, SEQ_LEN)
|
||||
|
||||
print(f"==> torch.onnx.export → {onnx_path}", flush=True)
|
||||
torch.onnx.export(
|
||||
encoder_only,
|
||||
(hidden_states, attention_softmax_mask),
|
||||
(hidden_states,),
|
||||
str(onnx_path),
|
||||
input_names=["hidden_states", "attention_softmax_mask"],
|
||||
input_names=["hidden_states"],
|
||||
output_names=["last_hidden_state"],
|
||||
opset_version=OPSET,
|
||||
do_constant_folding=True,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue