feat(sft): add Qwen3.5 MoE support + fused checkpoint loading

- arch.py: add Qwen3_5Moe arch match, read config from text_config,
  _get_layers_prefix returns model.language_model.layers for Qwen3.5,
  _get_model_container_and_layers searches language_model attr
- weights.py: load_experts_from_checkpoint_files detects fused format
  (gate_up_proj in weight_map) and splits into gate/up/down
- wrapper.py: hidden_size fallback to text_config

Verified: Qwen3.5-35B-A3B (256 experts, fused format) E2E pass.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
mrhaoxx 2026-04-20 17:19:15 +08:00
parent 58d7eabb9b
commit dd1da65d90
No known key found for this signature in database
3 changed files with 54 additions and 20 deletions

View file

@@ -98,16 +98,17 @@ def get_moe_arch_config(config) -> MOEArchConfig:
has_shared_experts=getattr(config, "n_shared_experts", 0) > 0,
router_type="deepseek_gate",
)
if "Qwen2Moe" in arch or "Qwen3Moe" in arch:
if "Qwen2Moe" in arch or "Qwen3Moe" in arch or "Qwen3_5Moe" in arch:
cfg = getattr(config, "text_config", config)
return MOEArchConfig(
moe_layer_attr="mlp",
router_attr="gate",
experts_attr="experts",
weight_names=("gate_proj", "up_proj", "down_proj"),
expert_num=config.num_experts,
intermediate_size=config.moe_intermediate_size,
num_experts_per_tok=config.num_experts_per_tok,
has_shared_experts=getattr(config, "shared_expert_intermediate_size", 0) > 0,
expert_num=cfg.num_experts,
intermediate_size=cfg.moe_intermediate_size,
num_experts_per_tok=cfg.num_experts_per_tok,
has_shared_experts=getattr(cfg, "shared_expert_intermediate_size", 0) > 0,
)
if "Mixtral" in arch:
return MOEArchConfig(
@@ -123,7 +124,7 @@ def get_moe_arch_config(config) -> MOEArchConfig:
raise KTAMXModelNotSupportedError(
f"Model architecture {arch} not supported for KT AMX. "
"Supported architectures: DeepseekV2, DeepseekV3, Qwen2Moe, Qwen3Moe, Mixtral"
"Supported architectures: DeepseekV2, DeepseekV3, Qwen2Moe, Qwen3Moe, Qwen3_5Moe, Mixtral"
)
@@ -154,8 +155,8 @@ def detect_fused_experts(experts: nn.Module) -> bool:
def _get_layers_prefix(config) -> str:
arch = config.architectures[0] if getattr(config, "architectures", None) else ""
if any(x in arch for x in ["Deepseek", "Qwen", "Mixtral", "Llama"]):
return "model.layers"
if "Qwen3_5Moe" in arch:
return "model.language_model.layers"
return "model.layers"
@@ -181,7 +182,7 @@ def _get_model_container_and_layers(model: nn.Module, *, purpose: str) -> tuple[
if layers is not None and isinstance(layers, (list, tuple, nn.ModuleList)):
return current, layers
for attr in ("model", "base_model", "pretrained_model", "module"):
for attr in ("model", "base_model", "pretrained_model", "module", "language_model"):
child = getattr(current, attr, None)
if isinstance(child, nn.Module) and child is not current:
to_visit.append(child)