Mirror of https://github.com/kvcache-ai/ktransformers.git (synced 2026-04-28 20:00:06 +00:00)
[feat](kt-kernel): adapt MXFP4 MoE backend for DeepSeek-V4-Flash (#1950)
V4-Flash routed experts ship as native MXFP4 (E2M1 nibble + ue8m0 group
scale). Expose AMXFP4_KGroup_MOE through NativeMoEWrapper, add a loader
that handles V4's `layers.{L}.ffn.experts.{i}.{w1,w3,w2}.{weight,scale}`
naming and converts ue8m0 → bf16 via a lossless bit-cast, register the
model entry, and ship an end-to-end numerical validation script.
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
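Note on the naming scheme: the expert tensor names quoted above are regular, so a loader can recognize them with a single expression. A minimal sketch of such a matcher; the regex and all names here are ours, not the commit's loader:

    import re

    # Hypothetical matcher for the V4 expert tensor names quoted in the commit
    # message; the commit's actual loader may parse these differently.
    V4_EXPERT_RE = re.compile(
        r"layers\.(?P<layer>\d+)\.ffn\.experts\.(?P<expert>\d+)"
        r"\.(?P<proj>w[132])\.(?P<part>weight|scale)$"
    )

    m = V4_EXPERT_RE.match("layers.3.ffn.experts.17.w1.scale")
    assert m is not None
    assert (m["layer"], m["expert"], m["proj"], m["part"]) == ("3", "17", "w1", "scale")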
This commit is contained in:
parent 5c5d7d48c0 · commit 8484ef8b16
5 changed files with 322 additions and 2 deletions
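On the ue8m0 → bf16 bit-cast: ue8m0 stores only an 8-bit biased exponent E, encoding the scale 2**(E - 127), and bf16 (1 sign + 8 exponent + 7 mantissa bits) represents every such power of two exactly, so the conversion needs no arithmetic and loses nothing. A minimal sketch, assuming the standard OCP MX ue8m0 encoding (this is not the commit's actual loader code):

    import torch

    def ue8m0_scale_to_bf16(scale_u8: torch.Tensor) -> torch.Tensor:
        # Shift the biased exponent E into the bf16 exponent field; sign and
        # mantissa stay zero, so the result is exactly 2**(E - 127).
        # (E = 255, the ue8m0 NaN code, maps to +inf here and needs special care.)
        bits = scale_u8.to(torch.int16) << 7
        return bits.view(torch.bfloat16)  # pure bit reinterpretation, no rounding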
@@ -81,6 +81,26 @@ BUILTIN_MODELS: list[ModelInfo] = [
         description="DeepSeek R1-0528 reasoning model (May 2025, improved reasoning depth)",
         description_zh="DeepSeek R1-0528 推理模型(2025年5月,改进的推理深度)",
     ),
+    ModelInfo(
+        name="DeepSeek-V4-Flash",
+        hf_repo="deepseek-ai/DeepSeek-V4-Flash",
+        aliases=["deepseek-v4-flash", "deepseek-v4", "dsv4", "v4-flash", "v4"],
+        type="moe",
+        default_params={
+            "kt-method": "MXFP4",
+            "kt-gpu-prefill-token-threshold": 4096,
+            "attention-backend": "flashinfer",
+            "max-total-tokens": 100000,
+            "max-running-requests": 16,
+            "chunked-prefill-size": 32768,
+            "mem-fraction-static": 0.80,
+            "watchdog-timeout": 3000,
+            "served-model-name": "DeepSeek-V4-Flash",
+            "disable-shared-experts-fusion": True,
+        },
+        description="DeepSeek V4-Flash MoE model (native MXFP4 experts, MQA + sparse index attention)",
+        description_zh="DeepSeek V4-Flash MoE 模型(原生 MXFP4 专家,MQA + 稀疏索引注意力)",
+    ),
     ModelInfo(
         name="Kimi-K2-Thinking",
         hf_repo="moonshotai/Kimi-K2-Thinking",
@@ -368,6 +388,19 @@ def compute_deepseek_v3_gpu_experts(tensor_parallel_size: int, vram_per_gpu_gb:
     return total_vram // 3
 
 
+def compute_deepseek_v4_gpu_experts(tensor_parallel_size: int, vram_per_gpu_gb: float) -> int:
+    """Compute kt-num-gpu-experts for DeepSeek-V4-Flash.
+
+    V4 uses MXFP4 experts (~0.5 bytes/param vs V3 FP8's 1 byte/param) so each GPU
+    can hold ~2x more experts per VRAM unit than V3 at the same fragmentation.
+    """
+    per_gpu_gb = 16
+    if vram_per_gpu_gb < per_gpu_gb:
+        return 0
+    total_vram = int(tensor_parallel_size * (vram_per_gpu_gb - per_gpu_gb))
+    return total_vram * 2 // 3
+
+
 def compute_kimi_k2_thinking_gpu_experts(tensor_parallel_size: int, vram_per_gpu_gb: float) -> int:
     """Compute kt-num-gpu-experts for Kimi K2 Thinking."""
     per_gpu_gb = 16
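Worked example for the added function, assuming 4 x 80 GB GPUs (tensor_parallel_size=4): the budget is 4 * (80 - 16) = 256 units, giving 256 * 2 // 3 = 170 GPU experts for V4 versus 256 // 3 = 85 from V3's formula, the ~2x the docstring claims.

    # Sanity check of the arithmetic above (the hardware figures are hypothetical):
    assert compute_deepseek_v4_gpu_experts(4, 80.0) == 170
    # Assuming V3 also reserves 16 GB per GPU before its `total_vram // 3`:
    assert compute_deepseek_v3_gpu_experts(4, 80.0) == 85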
@@ -393,6 +426,7 @@ MODEL_COMPUTE_FUNCTIONS: dict[str, Callable[[int, float], int]] = {
     "DeepSeek-V3-0324": compute_deepseek_v3_gpu_experts,
     "DeepSeek-V3.2": compute_deepseek_v3_gpu_experts, # Same as V3-0324
     "DeepSeek-R1-0528": compute_deepseek_v3_gpu_experts, # Same as V3-0324
+    "DeepSeek-V4-Flash": compute_deepseek_v4_gpu_experts,
     "Kimi-K2-Thinking": compute_kimi_k2_thinking_gpu_experts,
     "MiniMax-M2": compute_minimax_m2_gpu_experts,
     "MiniMax-M2.1": compute_minimax_m2_gpu_experts, # Same as M2
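Finally, a reference dequantizer for the format itself, the kind of ground truth an end-to-end numerical validation would compare kernel output against. This is a hedged sketch, not the shipped script: the group size of 32 and low-nibble-first packing are assumptions taken from the OCP MX convention, and all names are ours:

    import torch

    # FP4 E2M1 values by 4-bit code (high bit = sign), per the OCP MX spec:
    # exp == 0 is subnormal (0, 0.5); otherwise value = 2**(exp - 1) * (1 + m/2).
    E2M1_LUT = torch.tensor(
        [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0,
         -0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0]
    )

    def dequant_mxfp4(packed: torch.Tensor, scale_u8: torch.Tensor) -> torch.Tensor:
        """Dequantize MXFP4: two E2M1 nibbles per byte, one ue8m0 scale per 32 values.

        packed: uint8 of shape (rows, cols // 2); scale_u8: uint8 of shape (rows, cols // 32).
        """
        lo = (packed & 0x0F).long()  # even elements (assumed low nibble first)
        hi = (packed >> 4).long()    # odd elements
        codes = torch.stack([lo, hi], dim=-1).flatten(-2)  # interleave -> (rows, cols)
        vals = E2M1_LUT[codes]
        scales = torch.exp2(scale_u8.float() - 127.0)  # ue8m0 scale = 2**(E - 127)
        out = vals.view(*scale_u8.shape, 32) * scales.unsqueeze(-1)
        return out.flatten(-2)  # back to (rows, cols)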