[Feature] Add avx-based kimi-k2 support (#1656)

* Support Kimi-K2-Thinking original weights; fix AMX kernel bug

* Update K2 AVX kernel.

* feat: add CPUInfer write buffer task

* [feat]: add Kimi K2 CPU write buffer support

- Implement write_weights_to_buffer in k2-moe.hpp for exporting expert weights into GPU-side buffers
- Fix down (w2) weight column-wise slicing for different TP configurations
- Support three TP scenarios: cpu_tp == gpu_tp, cpu_tp > gpu_tp, cpu_tp < gpu_tp (see the sketch after this list)
- Add comprehensive test cases for weight extraction validation
- Ensure compatibility with Kimi model's MoE architecture
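
A minimal sketch of the down (w2) column-wise slicing described above (sizes hypothetical, mirroring the test logic in examples/test_k2_write_buffer.py):

    # Each w2 column stores `intermediate_size` int4 values packed 2 per byte;
    # GPU TP part `tp_idx` owns one contiguous slice of every column.
    intermediate_size, gpu_tp_count = 2048, 4

    def w2_column_slice(col_idx: int, tp_idx: int):
        col_start = col_idx * (intermediate_size // 2)          # bytes per column
        slice_bytes = (intermediate_size // gpu_tp_count) // 2  # bytes per TP slice
        start = col_start + tp_idx * slice_bytes
        return start, start + slice_bytes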

* [fix]: correct write_weight_scale_to_buffer expert offset calculation

Fixed a bug in write_weight_scale_to_buffer_task where expert offsets in the GPU buffers were calculated incorrectly: offsets now stride by the full gpu_tp sizes instead of the per_expert_gpu slice sizes, ensuring a correct memory layout in multi-expert scenarios.
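
A sketch of the corrected stride (sizes hypothetical; int4 packs 2 values per byte):

    hidden_size, intermediate_size, gpu_tp_count = 7168, 2048, 4
    # full per-expert size within one GPU TP buffer
    gpu_tp_weight_bytes = hidden_size * intermediate_size // gpu_tp_count // 2
    # wrong: expert_id * per_expert_gpu_bytes (the smaller slice one CPU TP contributes)
    # right: experts stride by the full gpu_tp size
    def w2_expert_offset(expert_id: int) -> int:
        return expert_id * gpu_tp_weight_bytes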

Also added benchmark scripts for the K2 MoE and write-buffer operations, and cleaned up debug output in the test files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

* [feat]: add write buffer wrapper

* [fix] fix comment

---------

Co-authored-by: ouqingliang <1692110604@qq.com>
Co-authored-by: Claude <noreply@anthropic.com>
Jiaqi Liao 2025-12-02 16:01:07 +08:00 committed by GitHub
parent c2b8c60c4e
commit fcf8882075
12 changed files with 2649 additions and 34 deletions


@@ -0,0 +1,363 @@
#!/usr/bin/env python
# coding=utf-8
"""
Benchmark AMX_K2_MOE_TP int4 path with packed weights and BF16 scales.
"""
import json
import math
import os
import platform
import subprocess
import sys
import time
from tqdm import tqdm
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
import kt_kernel_ext
import torch
# Benchmark parameters (single MoE, no layer loop)
expert_num = 384
hidden_size = 7168
intermediate_size = 2048
max_len = 25600
num_experts_per_tok = 8
qlen = 1
warm_up_iter = 1000
test_iter = 5000
k_group_size = 32
physical_to_logical_map = (
torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()
)
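# Two worker subpools, pinned one per NUMA node with 40 threads each (assumes a dual-socket host)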
worker_config = kt_kernel_ext.WorkerPoolConfig()
worker_config.subpool_count = 2
worker_config.subpool_numa_map = [0, 1]
worker_config.subpool_thread_count = [40, 40]
CPUInfer = kt_kernel_ext.CPUInfer(worker_config)
def get_git_commit():
result = {}
try:
commit = (
subprocess.check_output(["git", "rev-parse", "HEAD"])
.decode("utf-8")
.strip()
)
commit_msg = (
subprocess.check_output(["git", "log", "-1", "--pretty=%B"])
.decode("utf-8")
.strip()
)
result["commit"] = commit
result["commit_message"] = commit_msg
dirty_output = (
subprocess.check_output(["git", "status", "--porcelain"])
.decode("utf-8")
.strip()
)
if dirty_output:
result["dirty"] = True
result["dirty_files"] = dirty_output.splitlines()
else:
result["dirty"] = False
except Exception as e:
result["commit"] = None
result["commit_message"] = None
result["dirty"] = None
result["error"] = str(e)
return result
def get_system_info():
info = {}
uname = platform.uname()
info["system_name"] = uname.system
info["node_name"] = uname.node
cpu_model = None
if os.path.exists("/proc/cpuinfo"):
try:
with open("/proc/cpuinfo", "r") as f:
for line in f:
if "model name" in line:
cpu_model = line.split(":", 1)[1].strip()
break
except Exception as e:
cpu_model = f"Error: {e}"
info["cpu_model"] = cpu_model
mem_total_gb = None
if os.path.exists("/proc/meminfo"):
try:
with open("/proc/meminfo", "r") as f:
for line in f:
if "MemTotal" in line:
mem_kb = float(line.split(":", 1)[1].split()[0])
mem_total_gb = round(mem_kb / (1024 * 1024), 2)
break
except Exception as e:
mem_total_gb = f"Error: {e}"
info["memory_size_GB"] = mem_total_gb
info["cpu_core_count"] = os.cpu_count()
sockets = set()
if os.path.exists("/proc/cpuinfo"):
try:
with open("/proc/cpuinfo", "r") as f:
for line in f:
if "physical id" in line:
sockets.add(line.split(":", 1)[1].strip())
except Exception:
sockets = set()
info["cpu_socket_count"] = len(sockets) if len(sockets) > 0 else 1
return info
script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)
script_name = os.path.splitext(os.path.basename(script_path))[0]
json_path = os.path.join(script_dir, script_name + ".jsonl")
def record_results(result, filename=json_path):
with open(filename, "a") as f:
f.write(json.dumps(result) + "\n")
def pack_to_int32(
value: torch.Tensor, num_bits: int, packed_dim: int = 1
) -> torch.Tensor:
if value.dtype is not torch.int8:
raise ValueError("Tensor must be torch.int8 before packing")
if not (1 <= num_bits <= 8):
raise ValueError(f"num_bits must be in [1, 8], got {num_bits}")
offset = 1 << (num_bits - 1)
value = (value + offset).to(torch.uint8)
device = value.device
pack_factor = 32 // num_bits
if packed_dim == 0:
value = value.transpose(0, 1)
rows, cols = value.shape
padded_cols = math.ceil(cols / pack_factor) * pack_factor
pad_len = padded_cols - cols
if pad_len > 0:
value = torch.nn.functional.pad(value, (0, pad_len))
num_groups = padded_cols // pack_factor
reshaped = value.view(rows, num_groups, pack_factor).to(torch.int32)
bit_shifts = torch.arange(pack_factor, device=device, dtype=torch.int32) * num_bits
packed = (reshaped << bit_shifts).sum(dim=2, dtype=torch.int32)
if packed_dim == 0:
packed = packed.transpose(0, 1)
return packed
def pack_tensor_per_row(q: torch.Tensor, num_bits: int) -> torch.Tensor:
e, rows, cols = q.shape
flat = q.view(e * rows, cols)
packed = pack_to_int32(flat, num_bits)
return packed.view(e, rows, -1).contiguous()
def quantize_k2_tensor(weights: torch.Tensor, group_size: int):
"""
K2 int4 quantization producing int32-packed weights (8 int4s each) and BF16 scales.
"""
weights_f32 = weights.to(torch.float32)
e, rows, cols = weights_f32.shape
if cols % group_size != 0 or cols % 2 != 0:
raise ValueError(
f"cols ({cols}) must be divisible by group_size ({group_size}) and 2"
)
reshaped = weights_f32.view(e, rows, cols // group_size, group_size)
max_abs = reshaped.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8)
scales = (max_abs / 7.0).squeeze(-1)
q = torch.round(reshaped / scales.unsqueeze(-1)).clamp(-8, 7).to(torch.int8)
q = q.view(e, rows, cols)
packed = pack_tensor_per_row(q, num_bits=4).view(e, rows, cols // 8).contiguous()
scales = scales.to(torch.bfloat16).contiguous().view(
e, rows, cols // group_size
).contiguous()
return packed, scales
def build_quantized_layer_weights():
gate_proj = torch.randn(
(expert_num, intermediate_size, hidden_size),
dtype=torch.float32,
device="cpu",
).contiguous()
up_proj = torch.randn(
(expert_num, intermediate_size, hidden_size),
dtype=torch.float32,
device="cpu",
).contiguous()
down_proj = torch.randn(
(expert_num, hidden_size, intermediate_size),
dtype=torch.float32,
device="cpu",
).contiguous()
gate_q, gate_scales = quantize_k2_tensor(gate_proj, k_group_size)
up_q, up_scales = quantize_k2_tensor(up_proj, k_group_size)
down_q, down_scales = quantize_k2_tensor(down_proj, k_group_size)
return {
"gate_qweight": gate_q,
"up_qweight": up_q,
"down_qweight": down_q,
"gate_scales": gate_scales,
"up_scales": up_scales,
"down_scales": down_scales,
}
def bench_k2_moe():
with torch.inference_mode():
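# int4 weights cost 0.5 byte per element, plus one bf16 scale (2 bytes) per k_group_size elements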
bytes_per_elem = 0.5 + 2.0 / k_group_size
quant_data = build_quantized_layer_weights()
config = kt_kernel_ext.moe.MOEConfig(
expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0
)
config.max_len = max_len
config.quant_config.bits = 4
config.quant_config.group_size = k_group_size
config.quant_config.zero_point = False
config.gate_proj = quant_data["gate_qweight"].data_ptr()
config.up_proj = quant_data["up_qweight"].data_ptr()
config.down_proj = quant_data["down_qweight"].data_ptr()
config.gate_scale = quant_data["gate_scales"].data_ptr()
config.up_scale = quant_data["up_scales"].data_ptr()
config.down_scale = quant_data["down_scales"].data_ptr()
config.pool = CPUInfer.backend_
moe = kt_kernel_ext.moe.AMXInt4_KGroup_MOE(config)
CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
CPUInfer.sync()
gen_iter = 3000
expert_ids = (
torch.rand(gen_iter * qlen, expert_num, device="cpu")
.argsort(dim=-1)[:, :num_experts_per_tok]
.reshape(gen_iter, qlen * num_experts_per_tok)
.contiguous()
)
weights = torch.rand(
(gen_iter, qlen, num_experts_per_tok), dtype=torch.float32, device="cpu"
).contiguous()
input_tensor = torch.randn(
(qlen, hidden_size), dtype=torch.bfloat16, device="cpu"
).contiguous()
output_tensor = torch.empty_like(input_tensor)
bsz_tensor = torch.tensor([qlen], device="cpu")
for i in tqdm(range(warm_up_iter), desc="Warm-up"):
CPUInfer.submit(
moe.forward_task(
bsz_tensor.data_ptr(),
num_experts_per_tok,
expert_ids[i % gen_iter].data_ptr(),
weights[i % gen_iter].data_ptr(),
input_tensor.data_ptr(),
output_tensor.data_ptr(),
False,
)
)
CPUInfer.sync()
start = time.perf_counter()
for i in tqdm(range(test_iter), desc="Testing"):
CPUInfer.submit(
moe.forward_task(
bsz_tensor.data_ptr(),
num_experts_per_tok,
expert_ids[i % gen_iter].data_ptr(),
weights[i % gen_iter].data_ptr(),
input_tensor.data_ptr(),
output_tensor.data_ptr(),
False,
)
)
CPUInfer.sync()
end = time.perf_counter()
total_time = end - start
time_per_iter_us = total_time / test_iter * 1e6
bandwidth = (
hidden_size
* intermediate_size
* 3
* num_experts_per_tok
* (1 / 8 * 256 * (1 - (31 / 32) ** qlen))
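# the factor above appears to estimate distinct experts touched per iteration; the constants (256, 31/32) assume a 256-expert pool with 8 routed per token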
* bytes_per_elem
* test_iter
/ total_time
/ 1e9
)
flops = (
hidden_size
* intermediate_size
* qlen
* 3
* num_experts_per_tok
* 2
* test_iter
/ total_time
/ 1e12
)
print("Quant mode: int4_k2")
print("Time(s): ", total_time)
print("Iteration: ", test_iter)
print("Time(us) per iteration: ", time_per_iter_us)
print("Bandwidth: ", bandwidth, "GB/s")
print("Flops: ", flops, "TFLOPS")
print("")
result = {
"quant_mode": "int4_k2",
"total_time_seconds": total_time,
"iterations": test_iter,
"time_per_iteration_us": time_per_iter_us,
"bandwidth_GBs": bandwidth,
"flops_TFLOPS": flops,
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
"test_parameters": {
"expert_num": expert_num,
"hidden_size": hidden_size,
"intermediate_size": intermediate_size,
"max_len": max_len,
"num_experts_per_tok": num_experts_per_tok,
"qlen": qlen,
"warm_up_iter": warm_up_iter,
"test_iter": test_iter,
"k_group_size": k_group_size,
"bytes_per_elem": bytes_per_elem,
},
}
result.update(get_git_commit())
result.update(get_system_info())
record_results(result)
if __name__ == "__main__":
bench_k2_moe()


@@ -0,0 +1,309 @@
#!/usr/bin/env python
# coding=utf-8
"""
Benchmark write_weight_scale_to_buffer for AMX_K2_MOE_TP (int4 packed weights + bf16 scales).
"""
import json
import os
import platform
import subprocess
import sys
import time
from tqdm import tqdm
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "build"))
import kt_kernel_ext
import torch
# Benchmark parameters (single MoE, mirror examples/test_k2_write_buffer.py)
expert_num = 384
num_experts_per_tok = expert_num
gpu_tp_count = 4
warm_up_iter = 3
test_iter = 7
gpu_experts_num = expert_num
hidden_size = 7168
intermediate_size = 2048
group_size = 32
max_len = 1
physical_to_logical_map = torch.arange(expert_num, dtype=torch.int64, device="cpu").contiguous()
CPUInfer = kt_kernel_ext.CPUInfer(96)
def get_git_commit():
result = {}
try:
commit = (
subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
)
commit_msg = (
subprocess.check_output(["git", "log", "-1", "--pretty=%B"])
.decode("utf-8")
.strip()
)
result["commit"] = commit
result["commit_message"] = commit_msg
dirty_output = (
subprocess.check_output(["git", "status", "--porcelain"]).decode("utf-8").strip()
)
if dirty_output:
result["dirty"] = True
result["dirty_files"] = dirty_output.splitlines()
else:
result["dirty"] = False
except Exception as e:
result["commit"] = None
result["commit_message"] = None
result["dirty"] = None
result["error"] = str(e)
return result
def get_system_info():
info = {}
uname = platform.uname()
info["system_name"] = uname.system
info["node_name"] = uname.node
cpu_model = None
if os.path.exists("/proc/cpuinfo"):
try:
with open("/proc/cpuinfo", "r") as f:
for line in f:
if "model name" in line:
cpu_model = line.split(":", 1)[1].strip()
break
except Exception as e:
cpu_model = f"Error: {e}"
info["cpu_model"] = cpu_model
mem_total_gb = None
if os.path.exists("/proc/meminfo"):
try:
with open("/proc/meminfo", "r") as f:
for line in f:
if "MemTotal" in line:
mem_kb = float(line.split(":", 1)[1].split()[0])
mem_total_gb = round(mem_kb / (1024 * 1024), 2)
break
except Exception as e:
mem_total_gb = f"Error: {e}"
info["memory_size_GB"] = mem_total_gb
info["cpu_core_count"] = os.cpu_count()
sockets = set()
if os.path.exists("/proc/cpuinfo"):
try:
with open("/proc/cpuinfo", "r") as f:
for line in f:
if "physical id" in line:
sockets.add(line.split(":", 1)[1].strip())
except Exception:
sockets = set()
info["cpu_socket_count"] = len(sockets) if len(sockets) > 0 else 1
return info
script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)
script_name = os.path.splitext(os.path.basename(script_path))[0]
json_path = os.path.join(script_dir, script_name + ".jsonl")
def record_results(result, filename=json_path):
with open(filename, "a") as f:
f.write(json.dumps(result) + "\n")
def allocate_weights():
per_mat_weight_bytes = (hidden_size * intermediate_size) // 2
per_mat_scale_elems = (hidden_size * intermediate_size) // group_size
gate_q = torch.randint(0, 256, (expert_num * per_mat_weight_bytes,), dtype=torch.uint8)
up_q = torch.randint(0, 256, (expert_num * per_mat_weight_bytes,), dtype=torch.uint8)
down_q = torch.randint(0, 256, (expert_num * per_mat_weight_bytes,), dtype=torch.uint8)
gate_scale = torch.randn(expert_num * per_mat_scale_elems, dtype=torch.bfloat16)
up_scale = torch.randn(expert_num * per_mat_scale_elems, dtype=torch.bfloat16)
down_scale = torch.randn(expert_num * per_mat_scale_elems, dtype=torch.bfloat16)
return (
gate_q.contiguous(),
up_q.contiguous(),
down_q.contiguous(),
gate_scale.contiguous(),
up_scale.contiguous(),
down_scale.contiguous(),
per_mat_weight_bytes,
per_mat_scale_elems,
)
def build_moe():
(
gate_q,
up_q,
down_q,
gate_scale,
up_scale,
down_scale,
per_mat_weight_bytes,
per_mat_scale_elems,
) = allocate_weights()
config = kt_kernel_ext.moe.MOEConfig(
expert_num, num_experts_per_tok, hidden_size, intermediate_size
)
config.max_len = max_len
config.quant_config.bits = 4
config.quant_config.group_size = group_size
config.quant_config.zero_point = False
config.pool = CPUInfer.backend_
config.gate_proj = gate_q.data_ptr()
config.up_proj = up_q.data_ptr()
config.down_proj = down_q.data_ptr()
config.gate_scale = gate_scale.data_ptr()
config.up_scale = up_scale.data_ptr()
config.down_scale = down_scale.data_ptr()
moe = kt_kernel_ext.moe.AMXInt4_KGroup_MOE(config)
CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
CPUInfer.sync()
# Buffer sizing per TP
weight_bytes_per_expert_per_tp = per_mat_weight_bytes // gpu_tp_count
scale_elems_per_expert_per_tp = per_mat_scale_elems // gpu_tp_count
total_weight_bytes_per_tp = gpu_experts_num * weight_bytes_per_expert_per_tp
total_scale_elems_per_tp = gpu_experts_num * scale_elems_per_expert_per_tp
w13_weight_bufs = [
torch.empty(2 * total_weight_bytes_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)
]
w13_scale_bufs = [
torch.empty(2 * total_scale_elems_per_tp, dtype=torch.bfloat16) for _ in range(gpu_tp_count)
]
w2_weight_bufs = [
torch.empty(total_weight_bytes_per_tp, dtype=torch.uint8) for _ in range(gpu_tp_count)
]
w2_scale_bufs = [
torch.empty(total_scale_elems_per_tp, dtype=torch.bfloat16) for _ in range(gpu_tp_count)
]
buffer_ptrs = {
"w13_weight_ptrs": [buf.data_ptr() for buf in w13_weight_bufs],
"w13_scale_ptrs": [buf.data_ptr() for buf in w13_scale_bufs],
"w2_weight_ptrs": [buf.data_ptr() for buf in w2_weight_bufs],
"w2_scale_ptrs": [buf.data_ptr() for buf in w2_scale_bufs],
}
buffer_shapes = {
"per_mat_weight_bytes": per_mat_weight_bytes,
"per_mat_scale_elems": per_mat_scale_elems,
"weight_bytes_per_expert_per_tp": weight_bytes_per_expert_per_tp,
"scale_elems_per_expert_per_tp": scale_elems_per_expert_per_tp,
"total_weight_bytes_per_tp": total_weight_bytes_per_tp,
"total_scale_elems_per_tp": total_scale_elems_per_tp,
}
keep_tensors = {
"gate_q": gate_q,
"up_q": up_q,
"down_q": down_q,
"gate_scale": gate_scale,
"up_scale": up_scale,
"down_scale": down_scale,
"w13_weight_bufs": w13_weight_bufs,
"w13_scale_bufs": w13_scale_bufs,
"w2_weight_bufs": w2_weight_bufs,
"w2_scale_bufs": w2_scale_bufs,
}
return moe, buffer_ptrs, buffer_shapes, keep_tensors
def bench_write_buffer():
moe, buffer_ptrs, buffer_shapes, keep_tensors = build_moe()
total_weights = hidden_size * intermediate_size * expert_num * 3
# Throughput accounting consistent with examples/test_k2_write_buffer.py:
# bf16 scales cost 2 bytes each, int4 weights pack 2 values per byte
bytes_per_call = total_weights // group_size * 2 + total_weights // 2
# Warm-up
for _ in tqdm(range(warm_up_iter), desc="Warm-up"):
CPUInfer.submit(
moe.write_weight_scale_to_buffer_task(
gpu_tp_count=gpu_tp_count,
gpu_experts_num=gpu_experts_num,
**buffer_ptrs,
)
)
CPUInfer.sync()
total_time = 0
for _ in tqdm(range(test_iter), desc="Testing"):
start = time.perf_counter()
CPUInfer.submit(
moe.write_weight_scale_to_buffer_task(
gpu_tp_count=gpu_tp_count,
gpu_experts_num=gpu_experts_num,
**buffer_ptrs,
)
)
CPUInfer.sync()
end = time.perf_counter()
total_time += end - start
time.sleep(0.6)
print(end - start)
time_per_iter_us = total_time / test_iter * 1e6
bandwidth_gbs = bytes_per_call * test_iter / total_time / 1e9
print("write_weight_scale_to_buffer benchmark")
print("Time(s): ", total_time)
print("Iteration: ", test_iter)
print("Time(us) per iteration: ", time_per_iter_us)
print("Bandwidth: ", bandwidth_gbs, "GB/s")
print("")
result = {
"op": "write_weight_scale_to_buffer",
"total_time_seconds": total_time,
"iterations": test_iter,
"time_per_iteration_us": time_per_iter_us,
"bandwidth_GBs": bandwidth_gbs,
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
"test_parameters": {
"expert_num": expert_num,
"hidden_size": hidden_size,
"intermediate_size": intermediate_size,
"group_size": group_size,
"max_len": max_len,
"num_experts_per_tok": num_experts_per_tok,
"gpu_tp_count": gpu_tp_count,
"gpu_experts_num": gpu_experts_num,
"warm_up_iter": warm_up_iter,
"test_iter": test_iter,
"bytes_per_call": bytes_per_call,
},
"buffer_shapes": buffer_shapes,
"keep_tensors_alive": list(keep_tensors.keys()),
}
result.update(get_git_commit())
result.update(get_system_info())
record_results(result)
if __name__ == "__main__":
bench_write_buffer()


@@ -0,0 +1,319 @@
import math
import os
import sys
from typing import Dict, Literal
sys.path.insert(0, os.path.dirname(__file__) + "/../build")
import torch
import kt_kernel_ext
torch.manual_seed(42)
hidden_size = 7168
intermediate_size = 2048
max_len = 25600
expert_num = 16
num_experts_per_tok = 8
qlen = 1
layer_num = 1
CPUInfer = kt_kernel_ext.CPUInfer(40)
validation_iter = 10
k_group_size = 32
debug_print_count = 16
physical_to_logical_map = torch.tensor(data=range(expert_num), device="cpu", dtype=torch.int64).contiguous()
def _pattern_uniform(groups: int) -> torch.Tensor:
return torch.full((groups,), 0.02, dtype=torch.float32)
def _pattern_alternating(groups: int) -> torch.Tensor:
vals = torch.full((groups,), 0.015, dtype=torch.float32)
vals[1::2] = 0.03
return vals
def _pattern_ramp(groups: int) -> torch.Tensor:
return torch.linspace(0.005, 0.04, steps=groups, dtype=torch.float32)
WEIGHT_PATTERNS = {
"uniform_scale": ("All k-groups share the same abs max / scale", _pattern_uniform),
"alternating_scale": ("Alternate small / large abs max per k-group", _pattern_alternating),
"ramp_scale": ("Linearly increasing abs max per k-group", _pattern_ramp),
"random": ("Random bf16 weights (baseline)", None),
}
def act_fn(x):
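# SiLU: x * sigmoid(x)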
return x / (1.0 + torch.exp(-x))
def mlp_torch(input, gate_proj, up_proj, down_proj):
gate_buf = torch.mm(input, gate_proj.t())
up_buf = torch.mm(input, up_proj.t())
print(f"gate_buf: {gate_buf}")
print(f"up_buf: {up_buf}")
intermediate = act_fn(gate_buf) * up_buf
ret = torch.mm(intermediate, down_proj.t())
print(f"intermediate: {intermediate}")
print(f"mlp output: {ret}")
return ret
def moe_torch(input, expert_ids, weights, gate_proj, up_proj, down_proj):
cnts = expert_ids.new_zeros((expert_ids.shape[0], expert_num))
cnts.scatter_(1, expert_ids, 1)
tokens_per_expert = cnts.sum(dim=0)
idxs = expert_ids.view(-1).argsort()
sorted_tokens = input[idxs // expert_ids.shape[1]]
outputs = []
start_idx = 0
for i, num_tokens in enumerate(tokens_per_expert):
end_idx = start_idx + num_tokens
if num_tokens == 0:
continue
tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
expert_out = mlp_torch(tokens_for_this_expert, gate_proj[i], up_proj[i], down_proj[i])
outputs.append(expert_out)
start_idx = end_idx
outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
new_x = torch.empty_like(outs)
new_x[idxs] = outs
t_output = (
new_x.view(*expert_ids.shape, -1)
.type(weights.dtype)
.mul_(weights.unsqueeze(dim=-1))
.sum(dim=1)
.type(new_x.dtype)
)
return t_output
def pack_to_int32(value: torch.Tensor, num_bits: int, packed_dim: Literal[0, 1] = 1) -> torch.Tensor:
if value.dtype is not torch.int8:
raise ValueError("Tensor must be torch.int8 before packing")
if not (1 <= num_bits <= 8):
raise ValueError(f"num_bits must be in [1, 8], got {num_bits}")
offset = 1 << (num_bits - 1)
value = (value + offset).to(torch.uint8)
device = value.device
pack_factor = 32 // num_bits
if packed_dim == 0:
value = value.transpose(0, 1)
rows, cols = value.shape
padded_cols = math.ceil(cols / pack_factor) * pack_factor
pad_len = padded_cols - cols
if pad_len > 0:
value = torch.nn.functional.pad(value, (0, pad_len))
num_groups = padded_cols // pack_factor
# Use int32 here
reshaped = value.view(rows, num_groups, pack_factor).to(torch.int32)
bit_shifts = torch.arange(pack_factor, device=device, dtype=torch.int32) * num_bits
packed = (reshaped << bit_shifts).sum(dim=2, dtype=torch.int32)
if packed_dim == 0:
packed = packed.transpose(0, 1)
return packed
def pack_tensor_per_row(q: torch.Tensor, num_bits: int) -> torch.Tensor:
e, rows, cols = q.shape
flat = q.view(e * rows, cols)
packed = pack_to_int32(flat, num_bits)
return packed.view(e, rows, -1).contiguous()
def quantize_k2_tensor(weights: torch.Tensor, group_size: int):
"""
Symmetric max-abs/7 quantization per k-group following compressed_tensors packing.
Args:
weights: [expert_num, rows (N), cols (K)]
Returns:
packed_q: int32 tensor storing 8 int4s per element with shape [expert_num, rows * (cols // 8)]
scales: bfloat16 tensor with shape [expert_num, rows * (cols // group_size)]
"""
weights_f32 = weights.to(torch.float32)
e, rows, cols = weights_f32.shape
if cols % group_size != 0 or cols % 2 != 0:
raise ValueError(f"cols ({cols}) must be divisible by group_size ({group_size}) and 2")
reshaped = weights_f32.view(e, rows, cols // group_size, group_size)
max_abs = reshaped.abs().amax(dim=-1, keepdim=True)
max_abs = torch.clamp(max_abs, min=1e-8)
scales = (max_abs / 7.0).squeeze(-1)
q = torch.round(reshaped / scales.unsqueeze(-1)).clamp(-8, 7).to(torch.int8)
q = q.view(e, rows, cols)
packed = pack_tensor_per_row(q, num_bits=4).view(e, rows, cols // 8).contiguous()
scales = scales.to(torch.bfloat16).contiguous().view(e, rows, cols // group_size).contiguous()
print(f"Quantized weights: {packed.shape}, scales: {scales.shape}")
print(f"Quantized tensors: \n{packed},\n {scales}")
return packed, scales
def build_structured_tensor(shape: torch.Size, pattern: str) -> torch.Tensor:
if pattern == "random":
torch.manual_seed(42)
return (torch.randn(shape, dtype=torch.bfloat16, device="cpu") / 100.0).contiguous()
e, rows, cols = shape
groups = cols // k_group_size
group_builder = WEIGHT_PATTERNS[pattern][1]
group_vals = group_builder(groups).to(torch.float32)
block = group_vals.view(1, 1, groups, 1).expand(e, rows, groups, k_group_size).clone()
row_signs = torch.where(
(torch.arange(rows) % 2 == 0),
torch.ones(rows, dtype=torch.float32),
-torch.ones(rows, dtype=torch.float32),
).view(1, rows, 1, 1)
col_offsets = torch.linspace(-0.0005, 0.0005, steps=k_group_size, dtype=torch.float32).view(1, 1, 1, k_group_size)
block = block * row_signs + col_offsets
return block.reshape(shape).to(torch.bfloat16).contiguous()
def prepare_k2_quantized_weights(pattern: str) -> Dict[str, torch.Tensor]:
if pattern not in WEIGHT_PATTERNS:
raise ValueError(f"Unknown weight pattern: {pattern}")
gate_proj = build_structured_tensor((expert_num, intermediate_size, hidden_size), pattern)
up_proj = build_structured_tensor((expert_num, intermediate_size, hidden_size), pattern)
down_proj = build_structured_tensor((expert_num, hidden_size, intermediate_size), pattern)
gate_q, gate_scales = quantize_k2_tensor(gate_proj, k_group_size)
up_q, up_scales = quantize_k2_tensor(up_proj, k_group_size)
down_q, down_scales = quantize_k2_tensor(down_proj, k_group_size)
return {
"gate_qweight": gate_q.contiguous(),
"up_qweight": up_q.contiguous(),
"down_qweight": down_q.contiguous(),
"gate_scales": gate_scales.contiguous(),
"up_scales": up_scales.contiguous(),
"down_scales": down_scales.contiguous(),
"original_fp16": {
"gate_proj": gate_proj.to(torch.float16).contiguous(),
"up_proj": up_proj.to(torch.float16).contiguous(),
"down_proj": down_proj.to(torch.float16).contiguous(),
},
}
def build_moes_from_quantized_data(quant_data: Dict[str, torch.Tensor]):
moes = []
with torch.inference_mode(mode=True):
for _ in range(layer_num):
config = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size, 0)
config.max_len = max_len
config.quant_config.bits = 4
config.quant_config.group_size = k_group_size
config.quant_config.zero_point = False
config.gate_proj = quant_data["gate_qweight"].data_ptr()
config.up_proj = quant_data["up_qweight"].data_ptr()
config.down_proj = quant_data["down_qweight"].data_ptr()
config.gate_scale = quant_data["gate_scales"].data_ptr()
config.up_scale = quant_data["up_scales"].data_ptr()
config.down_scale = quant_data["down_scales"].data_ptr()
config.pool = CPUInfer.backend_
moe = kt_kernel_ext.moe.AMXInt4_KGroup_MOE(config)
CPUInfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
CPUInfer.sync()
# CPUInfer.submit(moe.warm_up_task())
# CPUInfer.sync()
moes.append(moe)
return moes
def run_case(pattern: str) -> Dict[str, float]:
print("\n" + "=" * 70)
desc = WEIGHT_PATTERNS[pattern][0]
print(f"Running case: {pattern} -> {desc}")
print("=" * 70)
quant_data = prepare_k2_quantized_weights(pattern)
moes = build_moes_from_quantized_data(quant_data)
original_weights = quant_data["original_fp16"]
gate_fp16 = original_weights["gate_proj"]
up_fp16 = original_weights["up_proj"]
down_fp16 = original_weights["down_proj"]
diffs = []
with torch.inference_mode(mode=True):
for i in range(validation_iter):
torch.manual_seed(100 + i)
bsz_tensor = torch.tensor([qlen], device="cpu")
expert_ids = torch.stack(
[torch.randperm(expert_num)[:num_experts_per_tok] for _ in range(qlen)]
).contiguous()
weights = torch.randn((qlen, num_experts_per_tok), dtype=torch.float32).contiguous()
input_tensor = torch.randn((qlen, hidden_size), dtype=torch.bfloat16).contiguous() / 100
output = torch.empty((qlen, hidden_size), dtype=torch.bfloat16).contiguous()
moe = moes[i % layer_num]
CPUInfer.submit(
moe.forward_task(
bsz_tensor.data_ptr(),
num_experts_per_tok,
expert_ids.data_ptr(),
weights.data_ptr(),
input_tensor.data_ptr(),
output.data_ptr(),
False,
)
)
CPUInfer.sync()
input_tensor_fp16 = input_tensor.to(torch.float16)
t_output = moe_torch(
input_tensor_fp16, expert_ids, weights, gate_fp16, up_fp16, down_fp16
).to(torch.bfloat16)
t_output = t_output.flatten()
output = output.flatten()
diff = torch.mean(torch.abs(output - t_output)) / (torch.mean(torch.abs(t_output)) + 1e-12)
diffs.append(diff.item())
print(f"[{pattern}] Iteration {i}: relative L1 diff = {diff:.4f}")
print(f" output {output}")
print(f" t_output {t_output}")
mean_diff = float(sum(diffs) / len(diffs))
max_diff = float(max(diffs))
min_diff = float(min(diffs))
return {"case": pattern, "description": desc, "mean": mean_diff, "max": max_diff, "min": min_diff}
def run_k2_moe_test():
summary_rows = []
for case_name in WEIGHT_PATTERNS.keys():
results = run_case(case_name)
summary_rows.append(results)
# break
print("\n=== Case vs. Relative Error Summary ===")
print(f"{'Case':<20} {'Mean':>10} {'Max':>10} {'Min':>10}")
for row in summary_rows:
print(f"{row['case']:<20} {row['mean']*100:9.2f}% {row['max']*100:9.2f}% {row['min']*100:9.2f}%")
if __name__ == "__main__":
run_k2_moe_test()


@@ -0,0 +1,267 @@
import os
import sys
import time
import torch
import numpy as np
# Ensure we can import the local extension
# REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
# if REPO_ROOT not in sys.path:
# sys.path.insert(0, REPO_ROOT)
import kt_kernel_ext
from kt_kernel_ext import CPUInfer
def make_cpu_infer(thread_num=80):
return CPUInfer(thread_num)
def build_config(cpuinfer, expert_num, num_experts_per_tok, hidden_size, intermediate_size, group_size):
cfg = kt_kernel_ext.moe.MOEConfig(expert_num, num_experts_per_tok, hidden_size, intermediate_size)
cfg.max_len = 1
cfg.quant_config.bits = 4
cfg.quant_config.group_size = group_size
cfg.quant_config.zero_point = False
cfg.pool = cpuinfer.backend_
return cfg
def allocate_weights(expert_num, hidden_size, intermediate_size, group_size):
# packed int4 weights: 2 values per byte
per_mat_weight_bytes = (hidden_size * intermediate_size) // 2
per_mat_scale_elems = (hidden_size * intermediate_size) // group_size
gate_q = torch.randint(0, 256, (expert_num * per_mat_weight_bytes,), dtype=torch.uint8)
up_q = torch.randint(0, 256, (expert_num * per_mat_weight_bytes,), dtype=torch.uint8)
down_q = torch.randint(0, 256, (expert_num * per_mat_weight_bytes,), dtype=torch.uint8)
gate_scale = torch.randn(expert_num * per_mat_scale_elems, dtype=torch.bfloat16)
up_scale = torch.randn(expert_num * per_mat_scale_elems, dtype=torch.bfloat16)
down_scale = torch.randn(expert_num * per_mat_scale_elems, dtype=torch.bfloat16)
return (
gate_q,
up_q,
down_q,
gate_scale,
up_scale,
down_scale,
per_mat_weight_bytes,
per_mat_scale_elems,
)
def main():
torch.manual_seed(123)
expert_num = 256 # Total experts
gpu_experts = expert_num # Number of experts on GPU
gpu_tp_count = 2 # Number of TP parts
num_experts_per_tok = 8
hidden_size = 7168
intermediate_size = 2048
group_size = 32
cpuinfer = make_cpu_infer()
cfg = build_config(cpuinfer, expert_num, num_experts_per_tok, hidden_size, intermediate_size, group_size)
(
gate_q,
up_q,
down_q,
gate_scale,
up_scale,
down_scale,
per_mat_weight_bytes,
per_mat_scale_elems,
) = allocate_weights(expert_num, hidden_size, intermediate_size, group_size)
cfg.gate_proj = gate_q.data_ptr()
cfg.up_proj = up_q.data_ptr()
cfg.down_proj = down_q.data_ptr()
cfg.gate_scale = gate_scale.data_ptr()
cfg.up_scale = up_scale.data_ptr()
cfg.down_scale = down_scale.data_ptr()
moe = kt_kernel_ext.moe.AMXInt4_KGroup_MOE(cfg)
physical_to_logical_map = (
torch.arange(expert_num, dtype=torch.int64, device="cpu").contiguous()
)
cpuinfer.submit(moe.load_weights_task(physical_to_logical_map.data_ptr()))
cpuinfer.sync()
# TP configuration
# Since weights are col-major, we can directly divide the total size by tp_count
# Each matrix is divided into gpu_tp_count parts in memory order
# Calculate sizes per TP part (direct division since col-major)
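# e.g. gpu_tp_count=2: TP 0 gets the first half of each expert's packed bytes, TP 1 the second half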
weight_bytes_per_expert_per_tp = per_mat_weight_bytes // gpu_tp_count
scale_elems_per_expert_per_tp = per_mat_scale_elems // gpu_tp_count
# Total sizes for all gpu_experts
total_weight_bytes_per_tp = gpu_experts * weight_bytes_per_expert_per_tp
total_scale_elems_per_tp = gpu_experts * scale_elems_per_expert_per_tp
# Create buffer lists for w13 (gate+up) and w2 (down)
w13_weight_bufs = []
w13_scale_bufs = []
w2_weight_bufs = []
w2_scale_bufs = []
for tp_idx in range(gpu_tp_count):
# w13 combines gate and up, so needs 2x the size
w13_weight_bufs.append(torch.empty(2 * total_weight_bytes_per_tp, dtype=torch.uint8))
w13_scale_bufs.append(torch.empty(2 * total_scale_elems_per_tp, dtype=torch.bfloat16))
w2_weight_bufs.append(torch.empty(total_weight_bytes_per_tp, dtype=torch.uint8))
w2_scale_bufs.append(torch.empty(total_scale_elems_per_tp, dtype=torch.bfloat16))
# Get data pointers for all buffers
w13_weight_ptrs = [buf.data_ptr() for buf in w13_weight_bufs]
w13_scale_ptrs = [buf.data_ptr() for buf in w13_scale_bufs]
w2_weight_ptrs = [buf.data_ptr() for buf in w2_weight_bufs]
w2_scale_ptrs = [buf.data_ptr() for buf in w2_scale_bufs]
print(f"Total experts: {expert_num}, GPU experts: {gpu_experts}")
print(f"GPU TP count: {gpu_tp_count}")
print(f"Original per matrix weight bytes: {per_mat_weight_bytes}")
print(f"Original per matrix scale elements: {per_mat_scale_elems}")
print(f"Weight bytes per expert per TP: {weight_bytes_per_expert_per_tp}")
print(f"Scale elements per expert per TP: {scale_elems_per_expert_per_tp}")
print(f"Total weight bytes per TP (w13): {2 * total_weight_bytes_per_tp}")
print(f"Total weight bytes per TP (w2): {total_weight_bytes_per_tp}")
print(f"Total scale elements per TP (w13): {2 * total_scale_elems_per_tp}")
print(f"Total scale elements per TP (w2): {total_scale_elems_per_tp}")
for i in range(5):
cpuinfer.submit(
moe.write_weight_scale_to_buffer_task(
gpu_tp_count=gpu_tp_count,
gpu_experts_num=gpu_experts,
w13_weight_ptrs=w13_weight_ptrs,
w13_scale_ptrs=w13_scale_ptrs,
w2_weight_ptrs=w2_weight_ptrs,
w2_scale_ptrs=w2_scale_ptrs,
)
)
cpuinfer.sync()
begin_time = time.perf_counter_ns()
cpuinfer.submit(
moe.write_weight_scale_to_buffer_task(
gpu_tp_count=gpu_tp_count,
gpu_experts_num=gpu_experts,
w13_weight_ptrs=w13_weight_ptrs,
w13_scale_ptrs=w13_scale_ptrs,
w2_weight_ptrs=w2_weight_ptrs,
w2_scale_ptrs=w2_scale_ptrs,
)
)
cpuinfer.sync()
end_time = time.perf_counter_ns()
elapsed_ms = (end_time - begin_time) / 1000000
total_weights = hidden_size * intermediate_size * expert_num * 3
total_bytes = total_weights // group_size * 2 + total_weights // 2  # bf16 scales (2 B each) + packed int4 weights
print(f"write_weight_scale_to_buffer time: {elapsed_ms:.2f} ms")
print(f"Throughput: {total_bytes / (elapsed_ms * 1e6):.2f} GB/s")
def split_expert_tensor(tensor, chunk):
"""Split tensor by experts"""
return [tensor[i * chunk : (i + 1) * chunk] for i in range(expert_num)]
# Split by experts first
gate_q_experts = split_expert_tensor(gate_q, per_mat_weight_bytes)
up_q_experts = split_expert_tensor(up_q, per_mat_weight_bytes)
down_q_experts = split_expert_tensor(down_q, per_mat_weight_bytes)
gate_scale_experts = split_expert_tensor(gate_scale, per_mat_scale_elems)
up_scale_experts = split_expert_tensor(up_scale, per_mat_scale_elems)
down_scale_experts = split_expert_tensor(down_scale, per_mat_scale_elems)
# CPU TP count is always 2 in this test setup (one TP per NUMA node)
cpu_tp_count = 2
# Verify buffers for each TP part
for tp_idx in range(gpu_tp_count):
expected_w13_weights = []
expected_w13_scales = []
expected_w2_weights = []
expected_w2_scales = []
weight13_per_tp = per_mat_weight_bytes // gpu_tp_count
scale13_per_tp = per_mat_scale_elems // gpu_tp_count
# Process each GPU expert
for expert_idx in range(gpu_experts):
# For w13 (gate and up), the slicing is straightforward
start_weight = tp_idx * weight13_per_tp
end_weight = (tp_idx + 1) * weight13_per_tp
start_scale = tp_idx * scale13_per_tp
end_scale = (tp_idx + 1) * scale13_per_tp
# Gate
gate_weight_tp = gate_q_experts[expert_idx][start_weight:end_weight]
gate_scale_tp = gate_scale_experts[expert_idx][start_scale:end_scale]
# Up
up_weight_tp = up_q_experts[expert_idx][start_weight:end_weight]
up_scale_tp = up_scale_experts[expert_idx][start_scale:end_scale]
# Down matrix needs special handling because it's sliced column-wise
# We need to reconstruct it from column slices
down_weight_tp_parts = []
down_scale_tp_parts = []
# Iterate through each column to extract the corresponding parts
for col_idx in range(hidden_size):
col_weight_start = col_idx * (intermediate_size // 2)
col_scale_start = col_idx * (intermediate_size // group_size)
# Direct mapping: each CPU TP corresponds to a GPU TP
tp_slice_weight_size = (intermediate_size // gpu_tp_count) // 2
tp_slice_scale_size = (intermediate_size // gpu_tp_count) // group_size
tp_weight_offset = col_weight_start + tp_idx * tp_slice_weight_size
tp_scale_offset = col_scale_start + tp_idx * tp_slice_scale_size
down_weight_tp_parts.append(
down_q_experts[expert_idx][tp_weight_offset:tp_weight_offset + tp_slice_weight_size]
)
down_scale_tp_parts.append(
down_scale_experts[expert_idx][tp_scale_offset:tp_scale_offset + tp_slice_scale_size]
)
# Concatenate all column slices for this TP
down_weight_tp = torch.cat(down_weight_tp_parts)
down_scale_tp = torch.cat(down_scale_tp_parts)
expected_w13_weights.append(gate_weight_tp)
expected_w13_weights.append(up_weight_tp)
expected_w13_scales.append(gate_scale_tp)
expected_w13_scales.append(up_scale_tp)
expected_w2_weights.append(down_weight_tp)
expected_w2_scales.append(down_scale_tp)
# Concatenate all experts for this TP part
expected_w13_weight = torch.cat(expected_w13_weights)
expected_w13_scale = torch.cat(expected_w13_scales)
expected_w2_weight = torch.cat(expected_w2_weights)
expected_w2_scale = torch.cat(expected_w2_scales)
print(f"=== Checking TP part {tp_idx} ===")
# Assert all checks pass
assert torch.equal(w13_weight_bufs[tp_idx], expected_w13_weight), f"w13 weight bytes mismatch for TP {tp_idx}"
assert torch.allclose(w13_scale_bufs[tp_idx], expected_w13_scale), f"w13 scale values mismatch for TP {tp_idx}"
assert torch.equal(w2_weight_bufs[tp_idx], expected_w2_weight), f"w2 weight bytes mismatch for TP {tp_idx}"
assert torch.allclose(w2_scale_bufs[tp_idx], expected_w2_scale), f"w2 scale values mismatch for TP {tp_idx}"
print(f"\n✓ write_weight_scale_to_buffer passed: extracted {gpu_experts} GPU experts across {gpu_tp_count} TP parts from total {expert_num} experts")
if __name__ == "__main__":
main()


@@ -36,6 +36,7 @@ static const bool _is_plain_ = false;
#if defined(__x86_64__) && defined(USE_AMX_AVX_KERNEL)
#include "operators/amx/awq-moe.hpp"
#include "operators/amx/k2-moe.hpp"
#include "operators/amx/la/amx_kernels.hpp"
#include "operators/amx/moe.hpp"
#endif
@@ -43,6 +44,7 @@ static const bool _is_plain_ = false;
#include <cstdint>
#include <memory>
#include <type_traits>
#include "operators/kvcache/kvcache.h"
#include "operators/llamafile/linear.h"
@@ -225,7 +227,9 @@ void bind_moe_module(py::module_& moe_module, const char* name) {
using MoeClass = TP_MOE<MoeTP>;
using MoeBindings = MOEBindings<MoeTP>;
py::class_<MoeClass, MoE_Interface, std::shared_ptr<MoeClass>>(moe_module, name)
auto moe_cls = py::class_<MoeClass, MoE_Interface, std::shared_ptr<MoeClass>>(moe_module, name);
moe_cls
.def(py::init<GeneralMOEConfig>())
.def("warm_up_task", &MoeBindings::WarmUpBindings::cpuinfer_interface)
.def("load_weights_task",
@@ -244,6 +248,53 @@ void bind_moe_module(py::module_& moe_module, const char* name) {
.def("warm_up", &MoeClass::warm_up)
.def("load_weights", &MoeClass::load_weights)
.def("forward", &MoeClass::forward_binding);
#if defined(__x86_64__) && defined(USE_AMX_AVX_KERNEL)
if constexpr (std::is_same_v<MoeTP, AMX_K2_MOE_TP<amx::GemmKernel224Int4SmallKGroup>>) {
struct WriteWeightScaleToBufferBindings {
struct Args {
CPUInfer* cpuinfer;
MoeClass* moe;
int gpu_tp_count;
int gpu_experts_num;
std::vector<uintptr_t> w13_weight_ptrs;
std::vector<uintptr_t> w13_scale_ptrs;
std::vector<uintptr_t> w2_weight_ptrs;
std::vector<uintptr_t> w2_scale_ptrs;
};
static void inner(void* args) {
Args* args_ = (Args*)args;
args_->cpuinfer->enqueue(&MoeClass::write_weight_scale_to_buffer, args_->moe,
args_->gpu_tp_count, args_->gpu_experts_num,
args_->w13_weight_ptrs, args_->w13_scale_ptrs,
args_->w2_weight_ptrs, args_->w2_scale_ptrs);
}
static std::pair<intptr_t, intptr_t> cpuinfer_interface(std::shared_ptr<MoeClass> moe,
int gpu_tp_count, int gpu_experts_num,
py::list w13_weight_ptrs, py::list w13_scale_ptrs,
py::list w2_weight_ptrs, py::list w2_scale_ptrs) {
// Convert Python lists to std::vector<uintptr_t>
std::vector<uintptr_t> w13_weight_vec, w13_scale_vec, w2_weight_vec, w2_scale_vec;
for (auto item : w13_weight_ptrs) w13_weight_vec.push_back(py::cast<uintptr_t>(item));
for (auto item : w13_scale_ptrs) w13_scale_vec.push_back(py::cast<uintptr_t>(item));
for (auto item : w2_weight_ptrs) w2_weight_vec.push_back(py::cast<uintptr_t>(item));
for (auto item : w2_scale_ptrs) w2_scale_vec.push_back(py::cast<uintptr_t>(item));
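// cpuinfer is left null here on purpose: the submitting CPUInfer is expected to write itself into this leading field of Args before inner runs.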
Args* args = new Args{nullptr, moe.get(), gpu_tp_count, gpu_experts_num,
w13_weight_vec, w13_scale_vec, w2_weight_vec, w2_scale_vec};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
moe_cls.def("write_weight_scale_to_buffer_task", &WriteWeightScaleToBufferBindings::cpuinfer_interface,
py::arg("gpu_tp_count"), py::arg("gpu_experts_num"),
py::arg("w13_weight_ptrs"), py::arg("w13_scale_ptrs"),
py::arg("w2_weight_ptrs"), py::arg("w2_scale_ptrs"));
}
#endif
}
PYBIND11_MODULE(kt_kernel_ext, m) {
@@ -513,6 +564,7 @@ PYBIND11_MODULE(kt_kernel_ext, m) {
bind_moe_module<AMX_MOE_TP<amx::GemmKernel224Int4>>(moe_module, "AMXInt4_MOE");
bind_moe_module<AMX_MOE_TP<amx::GemmKernel224Int4_1>>(moe_module, "AMXInt4_1_MOE");
bind_moe_module<AMX_AWQ_MOE_TP<amx::GemmKernel224Int4_1_LowKGroup>>(moe_module, "AMXInt4_1KGroup_MOE");
bind_moe_module<AMX_K2_MOE_TP<amx::GemmKernel224Int4SmallKGroup>>(moe_module, "AMXInt4_KGroup_MOE");
#endif
#if defined(USE_MOE_KERNEL)
bind_moe_module<MOE_KERNEL_TP<moe_kernel::GemmKernelInt8, _is_plain_>>(moe_module, "Int8_KERNEL_MOE");


@@ -0,0 +1,929 @@
/**
* @Description : Skeleton for K2 AMX MoE operator.
* @Author : Codex
* @Date : 2024-07-22
* @Version : 0.1.0
* @LastEditors : Codex
* @LastEditTime : 2024-07-22
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_OPERATOR_AMX_K2_MOE_H
#define CPUINFER_OPERATOR_AMX_K2_MOE_H
// #define DEBUG_K2_MOE
#include <cstddef>
#include <cstdint>
#include <cstring>
// #define FORWARD_TIME_PROFILE
// #define FORWARD_TIME_REPORT
#include <immintrin.h>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <filesystem>
#include <fstream>
#include <string>
#include <vector>
#include "../../cpu_backend/shared_mem_buffer.h"
#include "../../cpu_backend/worker_pool.h"
#include "../common.hpp"
#include "../moe-tp.hpp"
#include "la/amx.hpp"
#include "llama.cpp/ggml.h"
template <class T>
class AMX_K2_MOE_TP {
private:
int tp_part_idx = 0;
void* gate_proj_ = nullptr; // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
void* up_proj_ = nullptr; // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
void* down_proj_ = nullptr; // [expert_num * hidden_size * intermediate_size ( /32 if quantized)]
ggml_bf16_t* m_local_input_ = nullptr; // [num_experts_per_tok * max_len * hidden_size]
ggml_bf16_t* m_local_gate_output_ = nullptr; // [num_experts_per_tok * max_len * intermediate_size]
ggml_bf16_t* m_local_up_output_ = nullptr; // [num_experts_per_tok * max_len * intermediate_size]
ggml_bf16_t* m_local_down_output_ = nullptr; // [num_experts_per_tok * max_len * hidden_size]
std::vector<std::vector<int>> m_local_pos_; // [max_len, num_experts_per_tok]
std::vector<int> m_local_num_; // [expert_num]
std::vector<int> m_expert_id_map_; // [expert_num]
std::vector<ggml_bf16_t*> m_local_input_ptr_; // [expert_num]
std::vector<ggml_bf16_t*> m_local_gate_output_ptr_; // [expert_num]
std::vector<ggml_bf16_t*> m_local_up_output_ptr_; // [expert_num]
std::vector<ggml_bf16_t*> m_local_down_output_ptr_; // [expert_num]
std::vector<std::shared_ptr<typename T::BufferA>> gate_up_ba_;
std::vector<std::shared_ptr<typename T::BufferB>> gate_bb_;
std::vector<std::shared_ptr<typename T::BufferC>> gate_bc_;
std::vector<std::shared_ptr<typename T::BufferB>> up_bb_;
std::vector<std::shared_ptr<typename T::BufferC>> up_bc_;
std::vector<std::shared_ptr<typename T::BufferA>> down_ba_;
std::vector<std::shared_ptr<typename T::BufferB>> down_bb_;
std::vector<std::shared_ptr<typename T::BufferC>> down_bc_;
#ifdef CHECK
char verify_bb[100000000];
char check_bb[100000000];
uint8_t compare_expers = 3;
#endif
#ifdef CHECK
inline void load_check() {
// TODO: implement load_check for verification.
}
void verify_load_right() {
// TODO: implement verification helpers.
}
#endif
inline void dump_buffer_b(const std::string &quantization_type, int expert_idx, const std::string &matrix_type,
typename T::BufferB *buffer) {
auto &quant_config = config_.quant_config;
int &group_size = quant_config.group_size;
printf("[DUMP_BUFFER_B] TP%d %s Expert%d %s:\n", tp_part_idx, quantization_type.c_str(), expert_idx,
matrix_type.c_str());
// Calculate dimensions based on matrix type
int rows, cols, num_groups;
size_t scale_elem_count;
if (matrix_type == "gate" || matrix_type == "up") {
rows = config_.intermediate_size;
cols = config_.hidden_size;
num_groups = cols / group_size;
scale_elem_count = num_groups * rows;
} else { // down
rows = config_.hidden_size;
cols = config_.intermediate_size;
num_groups = cols / group_size;
scale_elem_count = num_groups * rows;
}
// Dump scales (as float)
printf(" Scales[first 16]: ");
for (int i = 0; i < std::min(16, (int)scale_elem_count); i++) {
printf("%.6f ", buffer->d[i]);
}
printf("\n");
if (scale_elem_count > 16) {
printf(" Scales[last 16]: ");
int start_idx = std::max(0, (int)scale_elem_count - 16);
for (int i = start_idx; i < (int)scale_elem_count; i++) {
printf("%.6f ", buffer->d[i]);
}
printf("\n");
}
// Dump quantized weights (as hex uint8)
size_t weight_size = (rows * cols) / 2; // INT4 packed
uint8_t *weight_ptr = (uint8_t *)buffer->b;
printf(" Weights[first 32 bytes]: ");
for (int i = 0; i < std::min(32, (int)weight_size); i++) {
printf("%02x ", weight_ptr[i]);
}
printf("\n");
if (weight_size > 32) {
printf(" Weights[last 32 bytes]: ");
int start_idx = std::max(32, (int)weight_size - 32);
for (int i = start_idx; i < (int)weight_size; i++) {
printf("%02x ", weight_ptr[i]);
}
printf("\n");
}
printf(" Matrix dimensions: %dx%d, Groups: %d, Group size: %d, Scale elements: %zu\n", rows, cols, num_groups,
group_size, scale_elem_count);
printf("\n");
fflush(stdout);
}
#ifdef FORWARD_TIME_REPORT
std::chrono::time_point<std::chrono::high_resolution_clock> last_now;
#endif
public:
using input_t = ggml_bf16_t;
using output_t = float;
GeneralMOEConfig config_;
static constexpr double ELEMENT_SIZE = T::ELEMENT_SIZE;
AMX_K2_MOE_TP(GeneralMOEConfig config, int tp_part_idx_) {
auto& quant_config = config.quant_config;
int& group_size = quant_config.group_size;
if (quant_config.group_size == 0 || quant_config.zero_point) {
throw std::runtime_error("Kimi-K2 MoE only support KGroup Int4");
}
printf("Creating AMX_K2_MOE_TP %d at numa %d\n", tp_part_idx_, numa_node_of_cpu(sched_getcpu()));
auto& load = config.load;
auto& save = config.save;
if (load && config.path == "") {
load = false;
}
this->tp_part_idx = tp_part_idx_;
config_ = config;
gate_proj_ = config_.gate_proj;
up_proj_ = config_.up_proj;
down_proj_ = config_.down_proj;
MemoryRequest mem_requests;
mem_requests.append_pointer(
&m_local_input_, sizeof(ggml_bf16_t) * config_.num_experts_per_tok * config_.max_len * config_.hidden_size);
mem_requests.append_pointer(&m_local_gate_output_, sizeof(ggml_bf16_t) * config_.num_experts_per_tok *
config_.max_len * config_.intermediate_size);
mem_requests.append_pointer(&m_local_up_output_, sizeof(ggml_bf16_t) * config_.num_experts_per_tok *
config_.max_len * config_.intermediate_size);
mem_requests.append_pointer(&m_local_down_output_, sizeof(ggml_bf16_t) * config_.num_experts_per_tok *
config_.max_len * config_.hidden_size);
m_local_pos_.resize(config_.max_len);
for (int i = 0; i < config_.max_len; i++) {
m_local_pos_[i].resize(config_.num_experts_per_tok);
}
m_expert_id_map_.resize(config_.expert_num);
m_local_num_.resize(config_.expert_num);
m_local_input_ptr_.resize(config_.expert_num);
m_local_gate_output_ptr_.resize(config_.expert_num);
m_local_up_output_ptr_.resize(config_.expert_num);
m_local_down_output_ptr_.resize(config_.expert_num);
for (size_t i = 0; i < config_.expert_num; i++) {
gate_up_ba_.push_back(
std::make_shared<typename T::BufferA>(config_.max_len, config_.hidden_size, group_size, nullptr));
gate_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.intermediate_size, nullptr));
up_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.intermediate_size, nullptr));
down_ba_.push_back(
std::make_shared<typename T::BufferA>(config_.max_len, config_.intermediate_size, group_size, nullptr));
down_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.hidden_size, nullptr));
void* gate_bb_ptr =
std::aligned_alloc(64, T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, group_size));
gate_bb_.push_back(std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size,
group_size, gate_bb_ptr));
void* up_bb_ptr =
std::aligned_alloc(64, T::BufferB::required_size(config_.intermediate_size, config_.hidden_size, group_size));
up_bb_.push_back(
std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, group_size, up_bb_ptr));
void* down_bb_ptr =
std::aligned_alloc(64, T::BufferB::required_size(config_.hidden_size, config_.intermediate_size, group_size));
down_bb_.push_back(std::make_shared<typename T::BufferB>(config_.hidden_size, config_.intermediate_size,
group_size, down_bb_ptr));
}
for (int i = 0; i < config_.expert_num; i++) {
mem_requests.append_function([this, i](void* new_ptr) { gate_up_ba_[i]->set_data(new_ptr); },
T::BufferA::required_size(config_.max_len, config_.hidden_size, group_size));
mem_requests.append_function([this, i](void* new_ptr) { gate_bc_[i]->set_data(new_ptr); },
T::BufferC::required_size(config_.max_len, config_.intermediate_size));
mem_requests.append_function([this, i](void* new_ptr) { up_bc_[i]->set_data(new_ptr); },
T::BufferC::required_size(config_.max_len, config_.intermediate_size));
mem_requests.append_function([this, i](void* new_ptr) { down_ba_[i]->set_data(new_ptr); },
T::BufferA::required_size(config_.max_len, config_.intermediate_size, group_size));
mem_requests.append_function([this, i](void* new_ptr) { down_bc_[i]->set_data(new_ptr); },
T::BufferC::required_size(config_.max_len, config_.hidden_size));
}
shared_mem_buffer_numa.alloc(tp_part_idx, this, mem_requests);
}
~AMX_K2_MOE_TP() = default;
void load_weights() {
auto& quant_config = config_.quant_config;
int& group_size = quant_config.group_size;
const uint64_t* physical_to_logical_map = (const uint64_t*)config_.physical_to_logical_map;
auto pool = config_.pool->get_subpool(tp_part_idx);
if (quant_config.group_size == 0 || quant_config.zero_point) {
throw std::runtime_error("Kimi AVX MOE only support KGroup Int4.");
}
if (config_.gate_scale == nullptr) {
throw std::runtime_error("Kimi AVX MOE only support load native weight.");
}
// load weight
int nth = T::recommended_nth(config_.intermediate_size);
pool->do_work_stealing_job(
nth * config_.expert_num, nullptr,
[this, nth, physical_to_logical_map](int task_id) {
uint64_t expert_idx = task_id / nth;
uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
int ith = task_id % nth;
// gate part
gate_bb_[expert_idx]->from_raw_mat(
(uint8_t*)config_.gate_proj +
((logical_expert_id * config_.intermediate_size * config_.hidden_size) >> 1),
ith, nth);
// up part
up_bb_[expert_idx]->from_raw_mat(
(uint8_t*)config_.up_proj +
((logical_expert_id * config_.intermediate_size * config_.hidden_size) >> 1),
ith, nth);
},
nullptr);
nth = T::recommended_nth(config_.hidden_size);
pool->do_work_stealing_job(
nth * config_.expert_num, nullptr,
[this, nth, physical_to_logical_map](int task_id) {
uint64_t expert_idx = task_id / nth;
uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
int ith = task_id % nth;
// down part
down_bb_[expert_idx]->from_raw_mat(
(uint8_t*)config_.down_proj +
((logical_expert_id * config_.hidden_size * config_.intermediate_size) >> 1),
ith, nth);
},
nullptr);
pool->do_work_stealing_job(
config_.expert_num, nullptr,
[this, physical_to_logical_map](int task_id) {
uint64_t expert_idx = task_id;
uint64_t logical_expert_id = expert_map(physical_to_logical_map, expert_idx);
size_t scale_elem_count =
(config_.hidden_size * config_.intermediate_size) / config_.quant_config.group_size;
// convert scales from BF16 to FP32
convert_or_copy(gate_bb_[expert_idx]->d,
(ggml_bf16_t*)config_.gate_scale + (logical_expert_id * scale_elem_count),
scale_elem_count);
convert_or_copy(up_bb_[expert_idx]->d,
(ggml_bf16_t*)config_.up_scale + (logical_expert_id * scale_elem_count),
scale_elem_count);
convert_or_copy(down_bb_[expert_idx]->d,
(ggml_bf16_t*)config_.down_scale + (logical_expert_id * scale_elem_count),
scale_elem_count);
},
nullptr);
// dump_buffer_b("native", 0, "down", down_bb_[0].get());
}
// Reconstruct weights for all experts to the output buffers
// This function handles the TP-specific portion of the reconstruction for all experts
void write_weights_to_buffer(int gpu_tp_count, int cpu_tp_count, int num_experts, const GeneralMOEConfig& full_config,
const std::vector<uintptr_t>& w13_weight_ptrs,
const std::vector<uintptr_t>& w13_scale_ptrs,
const std::vector<uintptr_t>& w2_weight_ptrs,
const std::vector<uintptr_t>& w2_scale_ptrs) const {
const int group_size = config_.quant_config.group_size;
auto pool = config_.pool->get_subpool(tp_part_idx);
// Calculate sizes for CPU TP part (this instance)
size_t cpu_tp_weight_elem_count = (size_t)config_.intermediate_size * config_.hidden_size;
size_t cpu_tp_weight_bytes = cpu_tp_weight_elem_count / 2; // int4 packing
size_t cpu_tp_scale_elem_count = cpu_tp_weight_elem_count / group_size;
// Calculate sizes for GPU TP part
size_t gpu_tp_weight_elem_count = (size_t)full_config.intermediate_size * full_config.hidden_size / gpu_tp_count;
size_t gpu_tp_weight_bytes = gpu_tp_weight_elem_count / 2; // int4 packing
size_t gpu_tp_scale_elem_count = gpu_tp_weight_elem_count / group_size;
// Determine mapping: which GPU TP parts should this CPU TP part write to?
// Since weights are col-major and we slice directly by memory order:
// - If cpu_tp_count >= gpu_tp_count: multiple (or one) CPU TPs write to one GPU TP
// - If cpu_tp_count < gpu_tp_count: one CPU TP writes to multiple GPU TPs
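// Example: cpu_tp=4, gpu_tp=2 -> CPU TPs {0,1} fill GPU TP 0 and {2,3} fill GPU TP 1;
// cpu_tp=2, gpu_tp=4 -> CPU TP 0 fills GPU TPs {0,1} and CPU TP 1 fills GPU TPs {2,3}.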
if (cpu_tp_count >= gpu_tp_count) {
// Multiple CPU TPs map to one GPU TP
int target_gpu_tp = tp_part_idx / (cpu_tp_count / gpu_tp_count);
int local_idx = tp_part_idx % (cpu_tp_count / gpu_tp_count);
// Get pointers for this GPU TP part
uint8_t* w13_weight_dst = (uint8_t*)w13_weight_ptrs[target_gpu_tp];
ggml_bf16_t* w13_scale_dst = (ggml_bf16_t*)w13_scale_ptrs[target_gpu_tp];
uint8_t* w2_weight_dst = (uint8_t*)w2_weight_ptrs[target_gpu_tp];
ggml_bf16_t* w2_scale_dst = (ggml_bf16_t*)w2_scale_ptrs[target_gpu_tp];
// Calculate offset within the GPU TP buffer
size_t offset_in_gpu_weight = local_idx * cpu_tp_weight_bytes;
size_t offset_in_gpu_scale = local_idx * cpu_tp_scale_elem_count;
// Process only the first num_experts experts (GPU experts)
int nth = T::recommended_nth(config_.intermediate_size);
nth = 1;
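// nth forced to 1: the per-expert ith/nth column split below is currently disabled (see the commented-out lines).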
pool->do_work_stealing_job(
nth * num_experts, nullptr,
[&, this](int task_id) {
int expert_id = task_id / nth;
// int ith = task_id % nth;
// auto [n_start, n_end] = T::split_range_n(config_.intermediate_size, ith, nth);
// Calculate base offsets for this expert in the GPU buffers
// For w13: each expert has gate+up, so the offset needs to account for 2x size
size_t w13_expert_base_weight = expert_id * 2 * gpu_tp_weight_bytes;
size_t w13_expert_base_scale = expert_id * 2 * gpu_tp_scale_elem_count;
size_t w2_expert_base_weight = expert_id * gpu_tp_weight_bytes;
size_t w2_expert_base_scale = expert_id * gpu_tp_scale_elem_count;
// Gate (first part of w13 for this expert)
uint8_t* gate_weight_src = (uint8_t*)gate_bb_[expert_id]->b;
float* gate_scale_src = gate_bb_[expert_id]->d;
std::memcpy(w13_weight_dst + w13_expert_base_weight + offset_in_gpu_weight,
gate_weight_src, cpu_tp_weight_bytes);
convert_or_copy((ggml_bf16_t*)(w13_scale_dst + w13_expert_base_scale + offset_in_gpu_scale),
gate_scale_src, cpu_tp_scale_elem_count);
// Up (second part of w13 for this expert, immediately after gate)
uint8_t* up_weight_src = (uint8_t*)up_bb_[expert_id]->b;
float* up_scale_src = up_bb_[expert_id]->d;
std::memcpy(w13_weight_dst + w13_expert_base_weight + offset_in_gpu_weight + gpu_tp_weight_bytes,
up_weight_src, cpu_tp_weight_bytes);
convert_or_copy((ggml_bf16_t*)(w13_scale_dst + w13_expert_base_scale + offset_in_gpu_scale + gpu_tp_scale_elem_count),
up_scale_src, cpu_tp_scale_elem_count);
// Down (w2) - need to handle column-wise slicing
// The down matrix is transposed compared to gate/up, so we need to extract by columns
// When multiple CPU TPs map to one GPU TP, each CPU TP has a slice of intermediate dimension
// CPU TP internal layout: each column has config_.intermediate_size elements
// GPU expects: each column has full_config.intermediate_size elements
for (size_t col = 0; col < config_.hidden_size; col++) {
// GPU buffer column width is full_config.intermediate_size / gpu_tp_count
size_t gpu_col_offset = col * ((full_config.intermediate_size / gpu_tp_count) >> 1);
size_t cpu_col_offset = col * (config_.intermediate_size >> 1);
size_t gpu_col_slice_offset = local_idx * (config_.intermediate_size >> 1);
std::memcpy(w2_weight_dst + w2_expert_base_weight + gpu_col_offset + gpu_col_slice_offset,
(uint8_t*)down_bb_[expert_id]->b + cpu_col_offset,
config_.intermediate_size / 2);
// Same for scales
size_t gpu_scale_col_offset = col * ((full_config.intermediate_size / gpu_tp_count) / group_size);
size_t cpu_scale_col_offset = col * (config_.intermediate_size / group_size);
size_t gpu_scale_slice_offset = local_idx * (config_.intermediate_size / group_size);
convert_or_copy((ggml_bf16_t*)(w2_scale_dst + w2_expert_base_scale + gpu_scale_col_offset + gpu_scale_slice_offset),
down_bb_[expert_id]->d + cpu_scale_col_offset,
config_.intermediate_size / group_size);
}
},
nullptr);
} else {
// cpu_tp_count < gpu_tp_count: one CPU TP writes to multiple GPU TPs
// Each CPU TP part contains data for multiple GPU TP parts
int gpu_tps_per_cpu_tp = gpu_tp_count / cpu_tp_count;
// This CPU TP part writes to GPU TP indices: [start_gpu_tp, start_gpu_tp + gpu_tps_per_cpu_tp)
int start_gpu_tp = tp_part_idx * gpu_tps_per_cpu_tp;
// Size of data per GPU TP within this CPU TP
size_t data_per_gpu_tp_weight = cpu_tp_weight_bytes / gpu_tps_per_cpu_tp;
size_t data_per_gpu_tp_scale = cpu_tp_scale_elem_count / gpu_tps_per_cpu_tp;
// Process all experts for each target GPU TP
pool->do_work_stealing_job(
gpu_tps_per_cpu_tp * num_experts, nullptr,
[&, this](int task_id) {
int expert_id = task_id % num_experts;
int local_gpu_idx = task_id / num_experts;
int gpu_tp_idx = start_gpu_tp + local_gpu_idx;
// Get pointers for this GPU TP part
uint8_t* w13_weight_dst = (uint8_t*)w13_weight_ptrs[gpu_tp_idx];
ggml_bf16_t* w13_scale_dst = (ggml_bf16_t*)w13_scale_ptrs[gpu_tp_idx];
uint8_t* w2_weight_dst = (uint8_t*)w2_weight_ptrs[gpu_tp_idx];
ggml_bf16_t* w2_scale_dst = (ggml_bf16_t*)w2_scale_ptrs[gpu_tp_idx];
// Calculate offsets within CPU TP buffers
size_t cpu_offset_weight = local_gpu_idx * data_per_gpu_tp_weight;
size_t cpu_offset_scale = local_gpu_idx * data_per_gpu_tp_scale;
// Calculate offsets for this expert in GPU buffers
// For w13: each expert has gate+up, so the offset needs to account for 2x size
size_t w13_gpu_expert_offset_weight = expert_id * 2 * gpu_tp_weight_bytes;
size_t w13_gpu_expert_offset_scale = expert_id * 2 * gpu_tp_scale_elem_count;
size_t w2_gpu_expert_offset_weight = expert_id * gpu_tp_weight_bytes;
size_t w2_gpu_expert_offset_scale = expert_id * gpu_tp_scale_elem_count;
// Gate (first part of w13 for this expert)
uint8_t* gate_weight_src = (uint8_t*)gate_bb_[expert_id]->b + cpu_offset_weight;
float* gate_scale_src = gate_bb_[expert_id]->d + cpu_offset_scale;
std::memcpy(w13_weight_dst + w13_gpu_expert_offset_weight,
gate_weight_src, data_per_gpu_tp_weight);
convert_or_copy((ggml_bf16_t*)(w13_scale_dst + w13_gpu_expert_offset_scale),
gate_scale_src, data_per_gpu_tp_scale);
// Up (second part of w13 for this expert, immediately after gate)
uint8_t* up_weight_src = (uint8_t*)up_bb_[expert_id]->b + cpu_offset_weight;
float* up_scale_src = up_bb_[expert_id]->d + cpu_offset_scale;
std::memcpy(w13_weight_dst + w13_gpu_expert_offset_weight + gpu_tp_weight_bytes,
up_weight_src, data_per_gpu_tp_weight);
convert_or_copy((ggml_bf16_t*)(w13_scale_dst + w13_gpu_expert_offset_scale + gpu_tp_scale_elem_count),
up_scale_src, data_per_gpu_tp_scale);
// Down (w2) - need to handle column-wise slicing
// The down matrix is transposed compared to gate/up, so we need to extract by columns
for (size_t col = 0; col < config_.hidden_size; col++) {
// Calculate the offset within the column for this GPU TP part
size_t col_offset_weight = (col * config_.intermediate_size / 2) + (local_gpu_idx * data_per_gpu_tp_weight / config_.hidden_size);
size_t col_offset_scale = (col * (config_.intermediate_size / group_size)) + (local_gpu_idx * data_per_gpu_tp_scale / config_.hidden_size);
// Copy weights column by column
std::memcpy(w2_weight_dst + w2_gpu_expert_offset_weight + (col * (config_.intermediate_size / gpu_tps_per_cpu_tp) / 2),
(uint8_t*)down_bb_[expert_id]->b + col_offset_weight,
(config_.intermediate_size / gpu_tps_per_cpu_tp) / 2);
// Copy scales column by column
convert_or_copy((ggml_bf16_t*)(w2_scale_dst + w2_gpu_expert_offset_scale + col * ((config_.intermediate_size / gpu_tps_per_cpu_tp) / group_size)),
down_bb_[expert_id]->d + col_offset_scale,
(config_.intermediate_size / gpu_tps_per_cpu_tp) / group_size);
}
},
nullptr);
}
}
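The index arithmetic in write_weights_to_buffer is the part most likely to regress, so here is a small, self-contained Python sketch that mirrors the two mapping regimes (an editorial illustration, not part of the codebase; the name map_cpu_part and the returned tuples are hypothetical):

def map_cpu_part(tp_part_idx, cpu_tp, gpu_tp, inter_full, hidden, group_size=32):
    inter_cpu = inter_full // cpu_tp                 # rows held by this CPU part
    cpu_w_bytes = inter_cpu * hidden // 2            # int4: two values per byte
    cpu_s_elems = inter_cpu * hidden // group_size
    if cpu_tp >= gpu_tp:
        # one or more CPU parts fill a single GPU part back to back;
        # the offsets are destinations inside the GPU buffer
        ratio = cpu_tp // gpu_tp
        target, local = tp_part_idx // ratio, tp_part_idx % ratio
        return [(target, local * cpu_w_bytes, local * cpu_s_elems)]
    # one CPU part feeds several consecutive GPU parts;
    # the offsets are sources inside this CPU part's buffer
    ratio = gpu_tp // cpu_tp
    w_per_gpu, s_per_gpu = cpu_w_bytes // ratio, cpu_s_elems // ratio
    return [(tp_part_idx * ratio + g, g * w_per_gpu, g * s_per_gpu)
            for g in range(ratio)]

# 2 CPU parts, 4 GPU parts: CPU part 1 serves GPU parts 2 and 3
print(map_cpu_part(1, 2, 4, inter_full=2048, hidden=7168))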
void warm_up() {
int qlen = config_.max_len;
std::vector<uint8_t> input(sizeof(ggml_bf16_t) * qlen * config_.hidden_size);
std::vector<uint8_t> output(sizeof(float) * qlen * config_.hidden_size);  // forward_decode writes fp32 output
std::vector<int64_t> expert_ids(qlen * config_.num_experts_per_tok);
std::vector<float> weights(qlen * config_.num_experts_per_tok);
for (int i = 0; i < qlen * config_.num_experts_per_tok; i++) {
expert_ids[i] = i % config_.expert_num;
weights[i] = 0.01f;
}
forward(qlen, config_.num_experts_per_tok, expert_ids.data(), weights.data(), input.data(), output.data());
}
void forward(int qlen, int k, const int64_t* expert_ids, const float* weights, const void* input, void* output) {
if (qlen > 1) {
forward_prefill(qlen, k, expert_ids, weights, input, output);
} else {
forward_decode(k, expert_ids, weights, input, output);
}
}
#ifndef DIRECT_OR_POOL_BY_QLEN
// Run fn inline for short sequences; dispatch to the thread pool otherwise.
#define DIRECT_OR_POOL_BY_QLEN(var, fn) \
do { \
if (qlen < 10) { \
for (int i = 0; i < (var); i++) { \
(fn)(i); \
} \
} else { \
pool->do_work_stealing_job((var), nullptr, (fn), nullptr); \
} \
} while (0)
#endif
void forward_prefill(int qlen, int k, const int64_t* expert_ids, const float* weights, const void* input,
void* output) {
for (int i = 0; i < qlen; i++)
forward_decode(k, expert_ids + i * k, weights + i * k, (ggml_bf16_t*)input + i * config_.hidden_size, (float*)output + i * config_.hidden_size);
}
void forward_decode(int k, const int64_t* expert_ids, const float* weights, const void* input, void* output) {
int qlen = 1;
auto pool = config_.pool->get_subpool(tp_part_idx);
auto& quant_config = config_.quant_config;
int& group_size = quant_config.group_size;
#ifdef FORWARD_TIME_PROFILE
auto start_time = std::chrono::high_resolution_clock::now();
auto last = start_time;
// Per-stage timings (in microseconds)
long prepare_time = 0, cpy_input_time = 0, q_input_time = 0, up_gate_time = 0;
long act_time = 0, q_down_time = 0, down_time = 0, weight_time = 0;
int max_local_num = 0;  // tracks the largest per-expert local token count
#endif
int activated_expert = 0;
for (int i = 0; i < k; i++) {
// Skip experts resident on the GPU (id < num_gpu_experts) and padded/invalid ids.
if (expert_ids[i] < config_.num_gpu_experts || expert_ids[i] >= config_.expert_num) {
continue;
}
m_expert_id_map_[activated_expert] = expert_ids[i];
activated_expert++;
}
size_t offset = 0;
for (int i = 0; i < activated_expert; i++) {
auto expert_idx = m_expert_id_map_[i];
m_local_gate_output_ptr_[expert_idx] = m_local_gate_output_ + offset * config_.intermediate_size;
m_local_up_output_ptr_[expert_idx] = m_local_up_output_ + offset * config_.intermediate_size;
m_local_down_output_ptr_[expert_idx] = m_local_down_output_ + offset * config_.hidden_size;
offset += qlen;
}
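As a reference for the routing filter and compaction above, a minimal Python sketch (names are hypothetical):

# CPU handles only ids in [num_gpu_experts, expert_num); active experts
# get contiguous scratch-buffer slots in activation order.
def compact_experts(expert_ids, num_gpu_experts, expert_num):
    id_map, slot = [], {}
    for e in expert_ids:
        if e < num_gpu_experts or e >= expert_num:
            continue                      # GPU-resident or padded id
        slot[e] = len(id_map)             # scratch slot for this expert
        id_map.append(e)
    return id_map, slot

print(compact_experts([0, 5, 130, 383, 384], num_gpu_experts=2, expert_num=384))
# -> ([5, 130, 383], {5: 0, 130: 1, 383: 2})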
gate_up_ba_[0]->from_mat(qlen, (ggml_bf16_t*)input, 0, 1);
#ifdef FORWARD_TIME_PROFILE
{
auto now_time = std::chrono::high_resolution_clock::now();
q_input_time = std::chrono::duration_cast<std::chrono::microseconds>(now_time - last).count();
last = now_time;
}
#endif
// calc gate & up
int nth = T::recommended_nth(config_.intermediate_size);
pool->do_work_stealing_job(
nth * activated_expert * 2, [](int _) { T::config(); },
[this, nth, qlen](int task_id2) {
int& group_size = config_.quant_config.group_size;
int task_id = task_id2 / 2;
bool do_up = task_id2 % 2;
int expert_idx = m_expert_id_map_[task_id / nth];
int ith = task_id % nth;
if (do_up) {
amx::vec_mul_kgroup(qlen, config_.intermediate_size, config_.hidden_size, group_size, gate_up_ba_[0],
up_bb_[expert_idx], up_bc_[expert_idx], ith, nth);
up_bc_[expert_idx]->to_mat(qlen, m_local_up_output_ptr_[expert_idx], ith, nth);
} else {
amx::vec_mul_kgroup(qlen, config_.intermediate_size, config_.hidden_size, group_size, gate_up_ba_[0],
gate_bb_[expert_idx], gate_bc_[expert_idx], ith, nth);
gate_bc_[expert_idx]->to_mat(qlen, m_local_gate_output_ptr_[expert_idx], ith, nth);
}
},
nullptr);
#ifdef DEBUG_K2_MOE
if (activated_expert > 0) {
int print_elems = std::min(config_.intermediate_size, 16);
for (int dbg = 0; dbg < activated_expert; ++dbg) {
int sample_expert = m_expert_id_map_[dbg];
ggml_bf16_t* gate_ptr = m_local_gate_output_ptr_[sample_expert];
if (gate_ptr == nullptr) {
continue;
}
printf("[K2][TP %d] gate_out (expert %d, first %d elems): ", tp_part_idx, sample_expert, print_elems);
for (int idx = 0; idx < print_elems; idx++) {
float val = ggml_bf16_to_fp32(gate_ptr[idx]);
printf("%.6f ", val);
}
printf("\n");
int tail_start = config_.intermediate_size > print_elems ? config_.intermediate_size - print_elems : 0;
printf("[K2][TP %d] gate_out (expert %d, last %d elems): ", tp_part_idx, sample_expert, print_elems);
for (int idx = 0; idx < print_elems; idx++) {
float val = ggml_bf16_to_fp32(gate_ptr[tail_start + idx]);
printf("%.6f ", val);
}
printf("\n");
}
}
#endif
#ifdef FORWARD_TIME_PROFILE
{
auto now_time = std::chrono::high_resolution_clock::now();
up_gate_time = std::chrono::duration_cast<std::chrono::microseconds>(now_time - last).count();
last = now_time;
}
#endif
// activation: fuse gate and up outputs in place on the calling thread
for (int task_id = 0; task_id < nth * activated_expert; task_id++) {
int expert_idx = m_expert_id_map_[task_id / nth];
int ith = task_id % nth;
auto [n_start, n_end] = T::split_range_n(config_.intermediate_size, ith, nth);
for (int i = 0; i < qlen; i++) {
ggml_bf16_t* gate_output_ptr = &m_local_gate_output_ptr_[expert_idx][i * config_.intermediate_size];
ggml_bf16_t* up_output_ptr = &m_local_up_output_ptr_[expert_idx][i * config_.intermediate_size];
for (int j = n_start; j < n_end; j += 32) {
__m512 gate_val0, gate_val1, up_val0, up_val1;
avx512_32xbf16_to_32xfp32((__m512i*)(gate_output_ptr + j), &gate_val0, &gate_val1);
avx512_32xbf16_to_32xfp32((__m512i*)(up_output_ptr + j), &up_val0, &up_val1);
__m512 result0 = amx::act_fn(gate_val0, up_val0);
__m512 result1 = amx::act_fn(gate_val1, up_val1);
avx512_32xfp32_to_32xbf16(&result0, &result1, (__m512i*)(gate_output_ptr + j));
}
}
}
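A scalar reference for the fused activation loop, assuming amx::act_fn is SiLU (the usual choice for SwiGLU-style MoE FFNs; its definition is not shown in this excerpt):

import math

# out[j] = silu(gate[j]) * up[j], written back over the gate buffer
def act_reference(gate, up):
    return [g / (1.0 + math.exp(-g)) * u for g, u in zip(gate, up)]

print(act_reference([0.5, -1.0], [2.0, 3.0]))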
#ifdef FORWARD_TIME_PROFILE
{
auto now_time = std::chrono::high_resolution_clock::now();
act_time = std::chrono::duration_cast<std::chrono::microseconds>(now_time - last).count();
last = now_time;
}
#endif
// quantize activations into the down-projection input buffer (BufferA)
pool->do_work_stealing_job(
activated_expert, nullptr,
[this, qlen](int task_id) {
int expert_idx = m_expert_id_map_[task_id];
down_ba_[expert_idx]->from_mat(qlen, m_local_gate_output_ptr_[expert_idx], 0, 1);
},
nullptr);
#ifdef FORWARD_TIME_PROFILE
{
auto now_time = std::chrono::high_resolution_clock::now();
q_down_time = std::chrono::duration_cast<std::chrono::microseconds>(now_time - last).count();
last = now_time;
}
#endif
// down projection
nth = T::recommended_nth(config_.hidden_size);
pool->do_work_stealing_job(
nth * activated_expert, [](int _) { T::config(); },
[this, nth, qlen](int task_id) {
int& group_size = config_.quant_config.group_size;
int expert_idx = m_expert_id_map_[task_id / nth];
int ith = task_id % nth;
amx::vec_mul_kgroup(qlen, config_.hidden_size, config_.intermediate_size, group_size, down_ba_[expert_idx],
down_bb_[expert_idx], down_bc_[expert_idx], ith, nth);
down_bc_[expert_idx]->to_mat(qlen, m_local_down_output_ptr_[expert_idx], ith, nth);
},
nullptr);
#ifdef DEBUG_K2_MOE
if (activated_expert > 0) {
int print_elems = std::min(config_.hidden_size, 16);
for (int dbg = 0; dbg < activated_expert; ++dbg) {
int sample_expert = m_expert_id_map_[dbg];
ggml_bf16_t* down_ptr = m_local_down_output_ptr_[sample_expert];
if (down_ptr == nullptr) {
continue;
}
printf("[K2][TP %d] down_out (expert %d, first %d elems): ", tp_part_idx, sample_expert, print_elems);
for (int idx = 0; idx < print_elems; idx++) {
float val = ggml_bf16_to_fp32(down_ptr[idx]);
printf("%.6f ", val);
}
printf("\n");
int tail_start = config_.hidden_size > print_elems ? config_.hidden_size - print_elems : 0;
printf("[K2][TP %d] down_out (expert %d, last %d elems): ", tp_part_idx, sample_expert, print_elems);
for (int idx = 0; idx < print_elems; idx++) {
float val = ggml_bf16_to_fp32(down_ptr[tail_start + idx]);
printf("%.6f ", val);
}
printf("\n");
}
}
#endif
#ifdef FORWARD_TIME_PROFILE
{
auto now_time = std::chrono::high_resolution_clock::now();
down_time = std::chrono::duration_cast<std::chrono::microseconds>(now_time - last).count();
last = now_time;
}
#endif
// weighted combination of the activated experts' down outputs
for (int e = 0; e < config_.hidden_size; e += 32) {
__m512 x0 = _mm512_setzero_ps();
__m512 x1 = _mm512_setzero_ps();
for (int j = 0; j < k; j++) {
if (expert_ids[j] < config_.num_gpu_experts || expert_ids[j] >= config_.expert_num) {
continue;
}
__m512 weight = _mm512_set1_ps(weights[j]);
__m512 down_output0, down_output1;
avx512_32xbf16_to_32xfp32((__m512i*)(m_local_down_output_ptr_[expert_ids[j]] +
m_local_pos_[0][j] * config_.hidden_size + e),
&down_output0, &down_output1);
x0 = _mm512_fmadd_ps(down_output0, weight, x0);
x1 = _mm512_fmadd_ps(down_output1, weight, x1);
}
auto f32out = (__m512*)((float*)output + e);
f32out[0] = x0;
f32out[1] = x1;
}
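The accumulation loop above, in scalar form (an illustrative sketch; names are hypothetical):

# Each CPU-assigned expert's down output is scaled by its routing weight
# and summed into the token's output vector.
def combine(down_out, expert_ids, weights, num_gpu_experts, expert_num, hidden):
    out = [0.0] * hidden
    for e, w in zip(expert_ids, weights):
        if e < num_gpu_experts or e >= expert_num:
            continue                     # routed to the GPU (or padding)
        for j in range(hidden):
            out[j] += w * down_out[e][j]
    return out

print(combine({2: [1.0, 2.0]}, [0, 2], [0.5, 0.25], 1, 4, hidden=2))
# -> [0.25, 0.5]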
#ifdef FORWARD_TIME_PROFILE
{
auto now_time = std::chrono::high_resolution_clock::now();
weight_time = std::chrono::duration_cast<std::chrono::microseconds>(now_time - last).count();
last = now_time;
}
auto end_time = std::chrono::high_resolution_clock::now();
auto forward_total_time = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
// Print all stage timings once at the end of the function
printf(
"Profiling Results (numa[%d]): activated_expert: %d, q_input: %ld us, "
"up_gate: %ld us, act: %ld us, q_down: %ld us, down: %ld us, weight: %ld us, total: %ld us\n",
tp_part_idx, activated_expert, q_input_time, up_gate_time, act_time, q_down_time, down_time, weight_time,
forward_total_time);
#endif
}
};
template <typename K>
class TP_MOE<AMX_K2_MOE_TP<K>> : public TP_MOE_Common<AMX_K2_MOE_TP<K>> {
public:
using TP_MOE_Common<AMX_K2_MOE_TP<K>>::TP_MOE_Common;
void load_weights() {
auto& config = this->config;
auto& tps = this->tps;
auto& tp_count = this->tp_count;
auto pool = config.pool;
const uint64_t* physical_to_logical_map = (const uint64_t*)config.physical_to_logical_map;
if (config.gate_scale == nullptr) {
throw std::runtime_error("K2 MoE only supports Packed Int4 with KGroup Scale");
}
printf("From Packed Int4 with KGroup Scale\n");
int& group_size = config.quant_config.group_size;
for (auto i = 0; i < tp_count; i++) {
auto& tpc = tps[i]->config_;
size_t weight_elem_count = (size_t)tpc.intermediate_size * tpc.hidden_size;
tpc.gate_proj = new uint8_t[(tpc.expert_num * weight_elem_count) / 2];
tpc.up_proj = new uint8_t[(tpc.expert_num * weight_elem_count) / 2];
tpc.down_proj = new uint8_t[(tpc.expert_num * weight_elem_count) / 2];
size_t scales_elem_count = (tpc.hidden_size / group_size) * tpc.intermediate_size;
tpc.gate_scale = new ggml_bf16_t[(tpc.expert_num * scales_elem_count)];
tpc.up_scale = new ggml_bf16_t[(tpc.expert_num * scales_elem_count)];
tpc.down_scale = new ggml_bf16_t[(tpc.expert_num * scales_elem_count)];
if (tps[i]->config_.load == false) {
pool->get_subpool(i)->do_work_stealing_job(
tpc.expert_num, nullptr,
[&](int expert_id_) { // weights and scales are all column-major
size_t expert_id = expert_map(physical_to_logical_map, expert_id_);
// weight and scale TP-slicing for gate and up
memcpy((uint8_t*)tpc.gate_proj + ((expert_id * weight_elem_count) >> 1),
(uint8_t*)config.gate_proj +
((expert_id * config.intermediate_size * config.hidden_size + i * weight_elem_count) >> 1),
((sizeof(uint8_t) * weight_elem_count) >> 1));
memcpy((uint8_t*)tpc.up_proj + ((expert_id * weight_elem_count) >> 1),
(uint8_t*)config.up_proj +
((expert_id * config.intermediate_size * config.hidden_size + i * weight_elem_count) >> 1),
((sizeof(uint8_t) * weight_elem_count) >> 1));
memcpy((ggml_bf16_t*)tpc.gate_scale + (expert_id * scales_elem_count),
(ggml_bf16_t*)config.gate_scale +
(expert_id * (config.hidden_size / group_size) * config.intermediate_size +
i * scales_elem_count),
sizeof(ggml_bf16_t) * scales_elem_count);
memcpy((ggml_bf16_t*)tpc.up_scale + (expert_id * scales_elem_count),
(ggml_bf16_t*)config.up_scale +
(expert_id * (config.hidden_size / group_size) * config.intermediate_size +
i * scales_elem_count),
sizeof(ggml_bf16_t) * scales_elem_count);
// weight and scale TP-slicing for down (by column)
for (size_t col = 0; col < config.hidden_size; col++) {
memcpy((uint8_t*)tpc.down_proj + ((expert_id * weight_elem_count + col * tpc.intermediate_size) >> 1),
(uint8_t*)config.down_proj + ((expert_id * config.intermediate_size * config.hidden_size +
col * config.intermediate_size + i * tpc.intermediate_size) >>
1),
(sizeof(uint8_t) * tpc.intermediate_size) >> 1);
memcpy((ggml_bf16_t*)tpc.down_scale + (expert_id * scales_elem_count + col * (tpc.intermediate_size / group_size)),
(ggml_bf16_t*)config.down_scale + ((expert_id * (config.intermediate_size / group_size) * config.hidden_size) +
col * (config.intermediate_size / group_size) + i * (tpc.intermediate_size / group_size)),
sizeof(ggml_bf16_t) * (tpc.intermediate_size / group_size));
}
},
nullptr);
}
printf("TP %d load weight done.\n", i);
}
DO_TPS_LOAD_WEIGHTS(pool);
for (auto i = 0; i < tp_count; i++) {
auto& tpc = tps[i]->config_;
delete[] (uint8_t*)(tpc.gate_proj);
delete[] (uint8_t*)(tpc.up_proj);
delete[] (uint8_t*)(tpc.down_proj);
delete[] (ggml_bf16_t*)(tpc.gate_scale);
delete[] (ggml_bf16_t*)(tpc.up_scale);
delete[] (ggml_bf16_t*)(tpc.down_scale);
}
this->weights_loaded = true;
}
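For reference, the column-wise down (w2) slicing performed in the load loop above can be written in a few lines of Python (a sketch under the column-major, int4-packed layout described in the comments; names are hypothetical):

# TP part i takes rows [i*inter_tp, (i+1)*inter_tp) out of every column of
# the full [hidden x inter_full] packed matrix.
def slice_down_int4(full, hidden, inter_full, tp, i):
    inter_tp = inter_full // tp
    col_bytes_full, col_bytes_tp = inter_full // 2, inter_tp // 2
    out = bytearray(hidden * col_bytes_tp)
    for col in range(hidden):
        src = col * col_bytes_full + i * col_bytes_tp
        out[col * col_bytes_tp:(col + 1) * col_bytes_tp] = \
            full[src:src + col_bytes_tp]
    return bytes(out)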
void write_weight_scale_to_buffer(int gpu_tp_count, int gpu_experts_num,
const std::vector<uintptr_t>& w13_weight_ptrs,
const std::vector<uintptr_t>& w13_scale_ptrs,
const std::vector<uintptr_t>& w2_weight_ptrs,
const std::vector<uintptr_t>& w2_scale_ptrs) {
if (this->weights_loaded == false) {
throw std::runtime_error("Not Loaded");
}
if (this->tps.empty()) {
throw std::runtime_error("No TP parts initialized");
}
// Validate input vector sizes
if (w13_weight_ptrs.size() != (size_t)gpu_tp_count || w13_scale_ptrs.size() != (size_t)gpu_tp_count ||
w2_weight_ptrs.size() != (size_t)gpu_tp_count || w2_scale_ptrs.size() != (size_t)gpu_tp_count) {
throw std::runtime_error("Pointer array sizes must match gpu_tp_count");
}
// Each TP part writes to its corresponding buffer
for (int tp_idx = 0; tp_idx < this->tp_count; tp_idx++) {
// w13 stores gate and up back to back per expert; down uses w2.
this->tps[tp_idx]->write_weights_to_buffer(
gpu_tp_count, this->tp_count,
gpu_experts_num, this->config,
w13_weight_ptrs, w13_scale_ptrs, //gate + up use w13
w2_weight_ptrs, w2_scale_ptrs); // down uses w2
}
}
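Callers must size the destination buffers consistently with the offsets used in write_weights_to_buffer. A small Python sketch of the implied per-rank sizes (hypothetical helper, assuming group_size 32 and bf16 scales):

# int4 weights: two values per byte; one scale per group_size values.
def gpu_buffer_sizes(num_experts, hidden, inter_full, gpu_tp, group_size=32):
    inter_gpu = inter_full // gpu_tp
    w_bytes = inter_gpu * hidden // 2           # per expert, per matrix
    s_elems = inter_gpu * hidden // group_size
    return {
        "w13_weight_bytes": num_experts * 2 * w_bytes,   # gate + up
        "w13_scale_elems":  num_experts * 2 * s_elems,
        "w2_weight_bytes":  num_experts * w_bytes,
        "w2_scale_elems":   num_experts * s_elems,
    }

print(gpu_buffer_sizes(num_experts=8, hidden=7168, inter_full=2048, gpu_tp=1))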
void merge_results(int qlen, void* output, bool incremental) {
auto pool = this->config.pool;
auto merge_fn = [this, output, incremental](int token_nth) {
auto& local_output_numa = this->local_output_numa;
auto& tp_configs = this->tp_configs;
auto& tp_count = this->tp_count;
auto& config = this->config;
float* merge_to = local_output_numa[0] + token_nth * tp_configs[0].hidden_size;
if (incremental) {
for (int e = 0; e < config.hidden_size; e += 32) {
__m512 x0, x1;
avx512_32xbf16_to_32xfp32((__m512i*)((ggml_bf16_t*)output + token_nth * config.hidden_size + e), &x0, &x1);
*((__m512*)(merge_to + e)) = _mm512_add_ps(*((__m512*)(merge_to + e)), x0);
*((__m512*)(merge_to + e + 16)) = _mm512_add_ps(*((__m512*)(merge_to + e + 16)), x1);
}
}
for (int i = 1; i < tp_count; i++) {
float* merge_from = local_output_numa[i] + token_nth * tp_configs[i].hidden_size;
for (int e = 0; e < tp_configs[i].hidden_size; e += 16) {
*((__m512*)(merge_to + e)) = _mm512_add_ps(*((__m512*)(merge_to + e)), *((__m512*)(merge_from + e)));
}
}
for (int e = 0; e < config.hidden_size; e += 32) {
__m512 x0 = *(__m512*)(merge_to + e);
__m512 x1 = *(__m512*)(merge_to + e + 16);
avx512_32xfp32_to_32xbf16(&x0, &x1, (__m512i*)((ggml_bf16_t*)output + token_nth * config.hidden_size + e));
}
};
for (int i = 0; i < qlen; i++) {
merge_fn(i);
}
}
void merge_results(int qlen, void* output) { merge_results(qlen, output, false); }
};
#endif // CPUINFER_OPERATOR_AMX_K2_MOE_H
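To make the merge semantics of merge_results concrete, a scalar Python model (illustrative only; the real code accumulates in fp32 registers and rounds back to bf16 when writing the output):

# NUMA part 0 is the accumulator; the other parts are added in.
# In incremental mode the existing output seeds the sum first.
def merge_reference(parts, prev_output=None):
    acc = list(parts[0])
    if prev_output is not None:           # incremental mode
        acc = [a + p for a, p in zip(acc, prev_output)]
    for part in parts[1:]:
        acc = [a + p for a, p in zip(acc, part)]
    return acc                            # stored as bf16 in the real code

print(merge_reference([[1.0, 2.0], [3.0, 4.0]], prev_output=[10.0, 10.0]))
# -> [14.0, 16.0]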

View file

@@ -4,6 +4,7 @@
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <limits>
#include <vector>
@@ -344,9 +345,6 @@ struct BufferAKGroupImpl {
static constexpr int K_STEP = K::K_STEP;
static constexpr int K_BLOCK = K::K_BLOCK;
using index_t = Packed2DLayout::index_t;
Packed2DLayout pack;
static size_t required_size(int max_m, int k, int k_group_size) {
ASSERT_RELEASE(k % k_group_size == 0, "k must be multiple of k_group_size");
return sizeof(int8_t) * max_m * k + sizeof(float) * max_m * (k / k_group_size);
@@ -355,18 +353,12 @@
BufferAKGroupImpl(int max_m, int k, int k_group_size, void* ptr)
: max_m(max_m),
k(k),
k_group_size(k_group_size),
pack({{static_cast<index_t>(K_STEP), 'c'},
{static_cast<index_t>(M_STEP), 'r'},
{static_cast<index_t>(k_group_size / K_STEP), 'c'},
{static_cast<index_t>(K_BLOCK / k_group_size), 'c'},
{static_cast<index_t>(max_m / M_STEP), 'r'},
{static_cast<index_t>(k / K_BLOCK), 'c'}}) {
k_group_size(k_group_size) {
ASSERT_RELEASE(k % k_group_size == 0, "k must be multiple of k_group_size");
ASSERT_RELEASE(max_m % M_STEP == 0, "max_m must be multiple of M_STEP");
ASSERT_RELEASE(k % K_STEP == 0, "k must be multiple of K_STEP");
ASSERT_RELEASE(K_BLOCK % k_group_size == 0, "K_BLOCK must be multiple of k_group_size");
ASSERT_RELEASE(k % K_BLOCK == 0, "k must be multiple of K_BLOCK");
// ASSERT_RELEASE(k % K_BLOCK == 0, "k must be multiple of K_BLOCK");
k_group_count = k / k_group_size;
set_data(ptr);
@@ -922,6 +914,77 @@ struct BufferBInt4WithZeroImpl {
float* get_min(int n, int n_begin) { return mins + n_begin; }
};
// BufferB for Signed Int4 with KGroup Scale (no zero point)
// Used for K2 MoE - signed int4 range: [-8, 7]
template <typename K>
struct BufferBInt4KGroupImpl {
using dt = typename K::dt;
dt* b; // packed signed int4 weights, col majored
float* d; // scales only (no mins/zero-points), row majored
int n, k, k_group_size, k_group_count;
static constexpr int N_STEP = K::N_STEP;
static constexpr int K_STEP = K::K_STEP;
static constexpr bool SCALE = true;
// Size calculation: packed int4 weights + scales (NO mins)
static size_t required_size(int n, int k, int k_group_size) {
return sizeof(int8_t) * n * k / 2 + sizeof(float) * n * (k / k_group_size);
}
BufferBInt4KGroupImpl(int n, int k, int k_group_size, void* ptr) : n(n), k(k), k_group_size(k_group_size) {
assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
assert(n % N_STEP == 0);
assert(k % K_STEP == 0);
if (n % N_STEP || k % K_STEP || k % k_group_size) {
printf("BufferBInt4KGroupImpl: n: %d, k: %d, N_STEP: %d, K_STEP: %d, k_group_size: %d\n", n, k, N_STEP,
K_STEP, k_group_size);
throw std::runtime_error("n or k is not aligned to N_STEP or K_STEP");
}
k_group_count = k / k_group_size;
b = reinterpret_cast<dt*>(ptr);
d = reinterpret_cast<float*>(offset_pointer(b, n * k / 2));
}
// Load from packed signed int4 format
// Input: proj is packed int4 weights (2 int4 values per byte)
// Each int4 value is in range [-8, 7] (signed)
void from_raw_mat(uint8_t* proj, int ith, int nth) {
auto [n_start, n_end] = K::split_range_n(n, ith, nth);
if (n_start >= n_end) {
return;
}
const size_t row_bytes = static_cast<size_t>(k) / 2;
const size_t rows = static_cast<size_t>(n_end - n_start);
uint8_t* dst_weights = reinterpret_cast<uint8_t*>(b) + n_start * row_bytes;
const uint8_t* src_weights = proj + n_start * row_bytes;
std::memcpy(dst_weights, src_weights, rows * row_bytes);
}
// Get pointer to submatrix for computation
dt* get_submat(int n, int k, int n_begin, int k_begin) {
const size_t row_bytes = static_cast<size_t>(k) / 2;
const size_t row_offset = static_cast<size_t>(n_begin) * row_bytes;
const size_t col_offset = static_cast<size_t>(k_begin) / 2;
return reinterpret_cast<dt*>(reinterpret_cast<uint8_t*>(b) + row_offset + col_offset);
}
// Get scale pointer for a specific row and k_group
float* get_scale(int n, int n_begin, int k, int k_begin) {
int k_group_idx = k_begin / k_group_size;
return d + n_begin * (k / k_group_size) + k_group_idx;
}
// Split range for parallel processing
static std::pair<int, int> split_range_n(int n, int ith, int nth) {
int n_per_thread = (n + nth - 1) / nth;
n_per_thread = (n_per_thread + N_STEP - 1) / N_STEP * N_STEP;
int n_start = std::min(ith * n_per_thread, n);
int n_end = std::min(n_start + n_per_thread, n);
return {n_start, n_end};
}
};
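The layout of BufferBInt4KGroupImpl is simply the packed weights followed by per-(row, k-group) fp32 scales. A Python sketch of required_size and the scale indexing (hypothetical helper):

# get_scale(n_begin, k_begin) resolves to d[n_begin * (k // gs) + k_begin // gs]
def bufferb_layout(n, k, gs):
    weight_bytes = n * k // 2               # two int4 values per byte
    scale_floats = n * (k // gs)
    required = weight_bytes + 4 * scale_floats
    scale_index = lambda row, k_begin: row * (k // gs) + k_begin // gs
    return required, scale_index

req, idx = bufferb_layout(n=2048, k=7168, gs=32)
print(req, idx(3, 64))   # scale of row 3, k-group starting at column 64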
template <typename K>
struct BufferBInt4WithZeroKGroupImpl {
using dt = typename K::dt;

View file

@@ -1015,8 +1015,9 @@ struct GemmKernel224Int8 {
static void avx_kernel(int m, int n, int k, int m_begin, int n_begin, int k_block_begin, float* c, BufferA* ba,
BufferB* bb) {
__m512i* c512 = (__m512i*)c;
int m_block_end = std::min(m - m_begin, M_STEP);
if (k_block_begin == 0) {
for (int m_i = 0; m_i < m; m_i++) {
for (int m_i = 0; m_i < m_block_end; m_i++) {
c512[m_i * 2] = _mm512_setzero_si512();
c512[m_i * 2 + 1] = _mm512_setzero_si512();
}
@@ -1028,7 +1029,7 @@
int32_t* a32 = (int32_t*)ba->get_submat(m, k, m_begin, k_block_begin + k_begin);
__m512i* b512 = (__m512i*)bb->get_submat(n, k, n_begin, k_block_begin + k_begin);
for (int m_i = 0; m_i < m && m_i < M_STEP; m_i++) {
for (int m_i = 0; m_i < m_block_end; m_i++) {
for (int k_i = 0; k_i < 16; k_i++) {
__m512i ma = _mm512_set1_epi32(a32[m_i * 16 + k_i]);
for (int n_i = 0; n_i < 2; n_i++) {
@@ -1239,8 +1240,9 @@ struct GemmKernel224Int4 {
BufferB* bb) {
using K = GemmKernel224Int4;
__m512i* c512 = (__m512i*)c;
int m_block_end = std::min(m - m_begin, M_STEP);
if (k_block_begin == 0) {
for (int m_i = 0; m_i < m; m_i++) {
for (int m_i = 0; m_i < m_block_end; m_i++) {
c512[m_i * 2] = _mm512_setzero_si512();
c512[m_i * 2 + 1] = _mm512_setzero_si512();
}
@@ -1250,7 +1252,7 @@
int32_t* a32_lo = (int32_t*)ba->get_submat(m, k, m_begin, k_block_begin + k_begin);
int32_t* a32_hi = (int32_t*)ba->get_submat(m, k, m_begin, k_block_begin + k_begin + K::K_STEP);
__m512i* b512 = (__m512i*)bb->get_submat(n, k, n_begin, k_block_begin + k_begin);
for (int m_i = 0; m_i < m && m_i < M_STEP; m_i++) {
for (int m_i = 0; m_i < m_block_end; m_i++) {
for (int k_i = 0; k_i < 16; k_i++) {
__m512i ma_lo = _mm512_set1_epi32(a32_lo[m_i * 16 + k_i]);
__m512i ma_hi = _mm512_set1_epi32(a32_hi[m_i * 16 + k_i]);
@@ -1533,8 +1535,9 @@ struct GemmKernel224Int4_1 {
BufferB* bb) {
using K = GemmKernel224Int4_1;
__m512i* c512 = (__m512i*)c;
int m_block_end = std::min(m - m_begin, M_STEP);
if (k_block_begin == 0) {
for (int m_i = 0; m_i < m; m_i++) {
for (int m_i = 0; m_i < m_block_end; m_i++) {
c512[m_i * 2] = _mm512_setzero_si512();
c512[m_i * 2 + 1] = _mm512_setzero_si512();
}
@@ -1543,7 +1546,7 @@
int32_t* a32_lo = (int32_t*)ba->get_submat(m, k, m_begin, k_block_begin + k_begin);
int32_t* a32_hi = (int32_t*)ba->get_submat(m, k, m_begin, k_block_begin + k_begin + K::K_STEP);
__m512i* b512 = (__m512i*)bb->get_submat(n, k, n_begin, k_block_begin + k_begin);
for (int m_i = 0; m_i < m && m_i < M_STEP; m_i++) {
for (int m_i = 0; m_i < m_block_end; m_i++) {
for (int k_i = 0; k_i < 16; k_i++) {
__m512i ma_lo = _mm512_set1_epi32(a32_lo[m_i * 16 + k_i]);
__m512i ma_hi = _mm512_set1_epi32(a32_hi[m_i * 16 + k_i]);
@@ -2193,10 +2196,11 @@ struct GemmKernel224Int4KGroup {
BufferB* bb, int k_group_size) {
using K = GemmKernel224Int4KGroup;
__m512i* c512 = (__m512i*)int_c;
int m_block_end = std::min(m - m_begin, M_STEP);
// Initialize int_c to zero at the start of k_group
if (k_block_begin % k_group_size == 0) {
for (int m_i = 0; m_i < m && m_i < M_STEP; m_i++) {
for (int m_i = 0; m_i < m_block_end; m_i++) {
c512[m_i * 2] = _mm512_setzero_si512();
c512[m_i * 2 + 1] = _mm512_setzero_si512();
}
@@ -2205,7 +2209,7 @@
if (k_offset == 0) {
int32_t* a32_lo = (int32_t*)ba->get_submat(m, k, m_begin, k_block_begin);
__m512i* b512 = (__m512i*)bb->get_submat(n, k, n_begin, k_block_begin);
for (int m_i = 0; m_i < m && m_i < M_STEP; m_i++) {
for (int m_i = 0; m_i < m_block_end; m_i++) {
for (int k_i = 0; k_i < 16; k_i++) {
__m512i ma_lo = _mm512_set1_epi32(a32_lo[m_i * 16 + k_i]);
for (int n_i = 0; n_i < 2; n_i++) {
@@ -2217,7 +2221,7 @@
} else {
int32_t* a32_hi = (int32_t*)ba->get_submat(m, k, m_begin, k_block_begin);
__m512i* b512 = (__m512i*)bb->get_submat(n, k, n_begin, k_block_begin - K::K_STEP);
for (int m_i = 0; m_i < m && m_i < M_STEP; m_i++) {
for (int m_i = 0; m_i < m_block_end; m_i++) {
for (int k_i = 0; k_i < 16; k_i++) {
__m512i ma_hi = _mm512_set1_epi32(a32_hi[m_i * 16 + k_i]);
for (int n_i = 0; n_i < 2; n_i++) {
@@ -2471,8 +2475,9 @@ struct GemmKernel224Int4_1KGroup {
BufferB* bb, int k_group_size) {
using K = GemmKernel224Int4_1KGroup;
__m512i* c512 = (__m512i*)int_c;
int m_block_end = std::min(m - m_begin, M_STEP);
if (k_block_begin % k_group_size == 0) {
for (int m_i = 0; m_i < m; m_i++) {
for (int m_i = 0; m_i < m_block_end; m_i++) {
c512[m_i * 2] = _mm512_setzero_si512();
c512[m_i * 2 + 1] = _mm512_setzero_si512();
}
@@ -2481,7 +2486,7 @@
if (k_offset == 0) {
int32_t* a32_lo = (int32_t*)ba->get_submat(m, k, m_begin, k_block_begin);
__m512i* b512 = (__m512i*)bb->get_submat(n, k, n_begin, k_block_begin);
for (int m_i = 0; m_i < m && m_i < M_STEP; m_i++) {
for (int m_i = 0; m_i < m_block_end; m_i++) {
for (int k_i = 0; k_i < 16; k_i++) {
__m512i ma_lo = _mm512_set1_epi32(a32_lo[m_i * 16 + k_i]);
for (int n_i = 0; n_i < 2; n_i++) {
@@ -2493,7 +2498,7 @@
} else {
int32_t* a32_hi = (int32_t*)ba->get_submat(m, k, m_begin, k_block_begin);
__m512i* b512 = (__m512i*)bb->get_submat(n, k, n_begin, k_block_begin - K::K_STEP);
for (int m_i = 0; m_i < m && m_i < M_STEP; m_i++) {
for (int m_i = 0; m_i < m_block_end; m_i++) {
for (int k_i = 0; k_i < 16; k_i++) {
__m512i ma_hi = _mm512_set1_epi32(a32_hi[m_i * 16 + k_i]);
for (int n_i = 0; n_i < 2; n_i++) {
@@ -2746,8 +2751,9 @@ struct GemmKernel224Int4_1_LowKGroup {
BufferB* bb, int k_group_size) {
using K = GemmKernel224Int4_1_LowKGroup;
__m512i* c512 = (__m512i*)int_c;
int m_block_end = std::min(m - m_begin, M_STEP);
if (k_block_begin % k_group_size == 0) {
for (int m_i = 0; m_i < m; m_i++) {
for (int m_i = 0; m_i < m_block_end; m_i++) {
c512[m_i * 2] = _mm512_setzero_si512();
c512[m_i * 2 + 1] = _mm512_setzero_si512();
}
@@ -2756,7 +2762,7 @@
if (k_offset == 0) {
int32_t* a32_lo = (int32_t*)ba->get_submat(m, k, m_begin, k_block_begin);
__m512i* b512 = (__m512i*)bb->get_submat(n, k, n_begin, k_block_begin);
for (int m_i = 0; m_i < m && m_i < M_STEP; m_i++) {
for (int m_i = 0; m_i < m_block_end; m_i++) {
for (int k_i = 0; k_i < 16; k_i++) {
__m512i ma_lo = _mm512_set1_epi32(a32_lo[m_i * 16 + k_i]);
for (int n_i = 0; n_i < 2; n_i++) {
@@ -2768,7 +2774,7 @@
} else {
int32_t* a32_hi = (int32_t*)ba->get_submat(m, k, m_begin, k_block_begin);
__m512i* b512 = (__m512i*)bb->get_submat(n, k, n_begin, k_block_begin - K::K_STEP);
for (int m_i = 0; m_i < m && m_i < M_STEP; m_i++) {
for (int m_i = 0; m_i < m_block_end; m_i++) {
for (int k_i = 0; k_i < 16; k_i++) {
__m512i ma_hi = _mm512_set1_epi32(a32_hi[m_i * 16 + k_i]);
for (int n_i = 0; n_i < 2; n_i++) {
@@ -2837,6 +2843,110 @@ struct GemmKernel224Int4_1_LowKGroup {
}
};
// K2 Signed Int4 K-group quantization kernel (AVX only, no AMX)
// For K2 MoE - signed int4 range: [-8, 7]
struct GemmKernel224Int4SmallKGroup {
using dt = uint8_t; // packed int4 type
using output_t = int32_t;
static constexpr double ELEMENT_SIZE = 0.5;
static const int VNNI_BLK = 4;
static const int M_STEP = 1;
static const int N_STEP = 32;
static const int K_STEP = 32;
static inline const int N_BLOCK = 256;
// K_BLOCK here is only a placeholder; the effective grouping comes from the runtime k_group_size
static inline const int K_BLOCK = 7168;
static std::string name() { return "K2_INT4_KGROUP"; }
static int recommended_nth(int n) { return (n + N_BLOCK - 1) / N_BLOCK; }
static std::pair<int, int> split_range_n(int n, int ith, int nth) {
int n_start = N_BLOCK * ith;
int n_end = std::min(n, N_BLOCK * (ith + 1));
return {n_start, n_end};
}
static void config() {}
alignas(64) static constexpr uint8_t hi_mask_arr[32] = {
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0
};
alignas(64) static constexpr uint8_t lo_mask_arr[32] = {
0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F
};
alignas(64) static constexpr uint8_t sign_xor_arr[32] = {
0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88,
0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88, 0x88
};
static __m256i hi_mask() { return *((__m256i*)(&hi_mask_arr[0])); }
static __m256i lo_mask() { return *((__m256i*)(&lo_mask_arr[0])); }
static __m256i sign_xor_mask() { return *((__m256i*)(&sign_xor_arr[0])); }
using BufferA = BufferAKGroupImpl<GemmKernel224Int4SmallKGroup>;
using BufferB = BufferBInt4KGroupImpl<GemmKernel224Int4SmallKGroup>; // Use new signed int4 buffer
using BufferC = BufferCReduceImpl<GemmKernel224Int4SmallKGroup>;
// K-group aware AVX kernel for signed int4
static inline __m512i compressed_int4_to_int8_avx512(__m256i b256) {
b256 = _mm256_xor_si256(b256, sign_xor_mask());
__m256i b_hi = _mm256_and_si256(b256, hi_mask());
__m256i b_lo = _mm256_slli_epi16(_mm256_andnot_si256(hi_mask(), b256), 4);
__m256i unpack_lo = _mm256_unpacklo_epi8(b_lo, b_hi);
__m256i unpack_hi = _mm256_unpackhi_epi8(b_lo, b_hi);
__m512i result = _mm512_inserti64x4(_mm512_castsi256_si512(unpack_lo), unpack_hi, 1);
const __m512i lane_shuffle = _mm512_set_epi64(7, 6, 3, 2, 5, 4, 1, 0);
return _mm512_permutexvar_epi64(lane_shuffle, result);
}
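A scalar model of the nibble expansion above. It assumes the packed nibbles are stored offset-binary (value + 8), which is the reading consistent with the XOR against 0x88 and with the final division by 16 in the kernel below; treat this as an editorial sketch rather than a format specification:

# XOR with 8 turns an offset-binary nibble into a two's-complement nibble;
# placing it in the high nibble of an int8 lane multiplies it by 16, which
# the kernel divides back out after the reduction.
def decode_byte(b):
    out = []
    for nib in (b & 0x0F, b >> 4):        # low nibble first, then high
        t = (nib ^ 0x8) << 4              # two's-complement nibble, x16
        out.append(t - 256 if t >= 128 else t)
    return out                            # each entry equals 16 * int4_value

print(decode_byte(0x08))   # low nibble 8 -> 0 (value 0); high nibble 0 -> -128 (value -8)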
static inline void integer_mat_vec_kgroup(int m, int n, int k, int k_group_size, BufferA* ba, BufferB *bb, BufferC* bc, int ith, int nth) {
auto [n_start, n_end] = split_range_n(n, ith, nth);
for (int m_begin = 0; m_begin < m; m_begin++) {
float* c = bc->get_submat(m, n, m_begin, 0);
__m512i* a512 = (__m512i*)ba->get_submat(m, k, m_begin, 0);
for (int n_block_begin = n_start; n_block_begin < n_end; n_block_begin++) {
__m256i* b256 = (__m256i*)bb->get_submat(n, k, n_block_begin, 0);
float* as = (float*)ba->get_scale(m, m_begin, k, 0);
float* bs = (float*)bb->get_scale(n, n_block_begin, k, 0);
__m512 sum = _mm512_setzero_ps();
// Each 64-byte A block spans two 32-element k-groups, hence the paired
// scales below; this indexing assumes k_group_size == 32.
#define WORK_K_BLOCK(k_block) \
{ \
__m256 abscale0 = _mm256_set1_ps(as[(k_block)*2] * bs[(k_block)*2]); \
__m256 abscale1 = _mm256_set1_ps(as[(k_block)*2+1] * bs[(k_block)*2+1]); \
__m512 abscale = _mm512_insertf32x8(_mm512_castps256_ps512(abscale0), abscale1, 1); \
__m512i mul = _mm512_setzero_si512(); \
mul = _mm512_dpbssd_epi32(mul, a512[k_block], compressed_int4_to_int8_avx512(b256[k_block])); \
sum = _mm512_add_ps(sum, _mm512_mul_ps(abscale, _mm512_cvtepi32_ps(mul))); \
}
for (int k_block = 0; k_block < k / 64; k_block += 2) {
WORK_K_BLOCK(k_block);
WORK_K_BLOCK(k_block + 1);
}
c[n_block_begin] = _mm512_reduce_add_ps(sum) / 16;  // undo the <<4 (x16) from int4 -> int8 lane expansion
}
}
}
};
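And a pure-Python reference for the whole k-group GEMV, operating on already-decoded int4 values so no final /16 is needed (a sketch assuming k_group_size = 32, matching the scale indexing above):

# y = sum over k-groups of a_scale[g] * b_scale[g] * (int32 dot of the group)
def gemv_kgroup(a_int8, a_scale, b_int4, b_scale, k, gs=32):
    acc = 0.0
    for g in range(k // gs):
        dot = sum(a_int8[i] * b_int4[i] for i in range(g * gs, (g + 1) * gs))
        acc += a_scale[g] * b_scale[g] * dot
    return acc

print(gemv_kgroup([1] * 64, [0.5, 0.25], [2] * 64, [1.0, 1.0], k=64))  # -> 48.0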
inline void vec_mul_kgroup(int m, int n, int k, int k_group_size, std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferA> ba,
std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferB> bb,
std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferC> bc, int ith, int nth) {
GemmKernel224Int4SmallKGroup::integer_mat_vec_kgroup(m, n, k, k_group_size, ba.get(), bb.get(), bc.get(), ith, nth);
}
// The matrix path currently reuses the row-wise vector kernel.
inline void mat_mul_kgroup(int m, int n, int k, int k_group_size, std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferA> ba,
std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferB> bb,
std::shared_ptr<GemmKernel224Int4SmallKGroup::BufferC> bc, int ith, int nth) {
GemmKernel224Int4SmallKGroup::integer_mat_vec_kgroup(m, n, k, k_group_size, ba.get(), bb.get(), bc.get(), ith, nth);
}
// New k-group aware matrix multiplication function
template <typename K, bool amx_or_avx = true>
void integer_mat_mul_kgroup(int m, int n, int k, int k_group_size, typename K::BufferA* ba, typename K::BufferB* bb,

View file

@@ -17,7 +17,7 @@ from typing import List, Optional
from .experts_base import BaseMoEWrapper, KExpertsCPUBuffer
# Import backend implementations
from .utils.amx import AMXMoEWrapper
from .utils.amx import AMXMoEWrapper, RAWAMXMoEWrapper
from .utils.llamafile import LlamafileMoEWrapper
from .utils.moe_kernel import GeneralMoEWrapper
@@ -77,7 +77,7 @@ class KTMoEWrapper:
chunked_prefill_size: Maximum prefill chunk size
cpu_save: Whether to save weights to CPU memory
max_deferred_experts_per_token: Number of experts per token to defer. Defaults to 0.
method: Backend method ("AMXINT4", "AMXINT8", "LLAMAFILE", "MOE_INT4", "MOE_INT8")
method: Backend method ("AMXINT4", "AMXINT8", "RAWINT4", "LLAMAFILE", "MOE_INT4", "MOE_INT8")
Returns:
An instance of the appropriate backend implementation (e.g., AMXMoEWrapper)
@@ -85,6 +85,8 @@ class KTMoEWrapper:
# Select backend based on method
if method in ["AMXINT4", "AMXINT8"]:
backend_cls = AMXMoEWrapper
elif method == "RAWINT4":
backend_cls = RAWAMXMoEWrapper
elif method == "LLAMAFILE":
backend_cls = LlamafileMoEWrapper
elif method in ["MOE_INT4", "MOE_INT8"]:

View file

@@ -4,13 +4,15 @@
Utilities for kt_kernel package.
"""
from .amx import AMXMoEWrapper
from .amx import AMXMoEWrapper, RAWAMXMoEWrapper
from .llamafile import LlamafileMoEWrapper
from .loader import SafeTensorLoader, GGUFLoader
from .loader import SafeTensorLoader, GGUFLoader, CompressedSafeTensorLoader
__all__ = [
"AMXMoEWrapper",
"RAWAMXMoEWrapper",
"LlamafileMoEWrapper",
"SafeTensorLoader",
"CompressedSafeTensorLoader",
"GGUFLoader",
]

View file

@ -4,16 +4,16 @@ import ctypes
# Use relative imports for package structure
from ..experts_base import BaseMoEWrapper
from .loader import SafeTensorLoader
from .loader import SafeTensorLoader, CompressedSafeTensorLoader
from kt_kernel_ext.moe import MOEConfig
try:
from kt_kernel_ext.moe import AMXInt4_MOE, AMXInt8_MOE
from kt_kernel_ext.moe import AMXInt4_MOE, AMXInt8_MOE, AMXInt4_KGroup_MOE
_HAS_AMX_SUPPORT = True
except (ImportError, AttributeError):
_HAS_AMX_SUPPORT = False
AMXInt4_MOE, AMXInt8_MOE = None, None
AMXInt4_MOE, AMXInt8_MOE, AMXInt4_KGroup_MOE = None, None, None
from typing import Optional
@@ -301,3 +301,152 @@ class AMXMoEWrapper(BaseMoEWrapper):
del self.gate_scales
del self.up_scales
del self.down_scales
class RAWAMXMoEWrapper(BaseMoEWrapper):
"""Wrapper for RAWINT4 experts stored in compressed SafeTensor format."""
_compressed_loader_instance = None
def __init__(
self,
layer_idx: int,
num_experts: int,
num_experts_per_tok: int,
hidden_size: int,
moe_intermediate_size: int,
num_gpu_experts: int,
cpuinfer_threads: int,
threadpool_count: int,
weight_path: str,
chunked_prefill_size: int,
cpu_save: bool = False,
max_deferred_experts_per_token: Optional[int] = None,
method: str = "RAWINT4",
):
if not _HAS_AMX_SUPPORT or AMXInt4_KGroup_MOE is None:
raise RuntimeError("AMX backend with RAWINT4 support is not available.")
super().__init__(
layer_idx=layer_idx,
num_experts=num_experts,
num_experts_per_tok=num_experts_per_tok,
hidden_size=hidden_size,
moe_intermediate_size=moe_intermediate_size,
num_gpu_experts=num_gpu_experts,
cpuinfer_threads=cpuinfer_threads,
threadpool_count=threadpool_count,
weight_path=weight_path,
chunked_prefill_size=chunked_prefill_size,
cpu_save=cpu_save,
max_deferred_experts_per_token=max_deferred_experts_per_token,
method=method,
)
if RAWAMXMoEWrapper._compressed_loader_instance is None:
RAWAMXMoEWrapper._compressed_loader_instance = CompressedSafeTensorLoader(weight_path)
self.loader = RAWAMXMoEWrapper._compressed_loader_instance
self.gate_weights = None
self.up_weights = None
self.down_weights = None
self.gate_scales = None
self.up_scales = None
self.down_scales = None
def load_weights_from_tensors(
self,
gate_proj: torch.Tensor,
up_proj: torch.Tensor,
down_proj: torch.Tensor,
physical_to_logical_map_cpu: torch.Tensor,
):
raise NotImplementedError("RAWINT4 wrapper expects pre-quantized safetensor weights.")
def load_weights(self, physical_to_logical_map_cpu: torch.Tensor):
base_key = f"model.layers.{self.layer_idx}"
weights = self.loader.load_experts(base_key)
self.gate_weights = torch.stack(weights["gate"], dim=0).contiguous()
self.up_weights = torch.stack(weights["up"], dim=0).contiguous()
self.down_weights = torch.stack(weights["down"], dim=0).contiguous()
self.gate_scales = torch.stack(weights["gate_scale"], dim=0).to(torch.bfloat16).contiguous()
self.up_scales = torch.stack(weights["up_scale"], dim=0).to(torch.bfloat16).contiguous()
self.down_scales = torch.stack(weights["down_scale"], dim=0).to(torch.bfloat16).contiguous()
moe_config = MOEConfig(
self.num_experts,
self.num_experts_per_tok,
self.hidden_size,
self.moe_intermediate_size,
self.num_gpu_experts,
)
moe_config.layer_idx = self.layer_idx
moe_config.pool = self.cpu_infer.backend_
moe_config.max_len = self.chunked_prefill_size
moe_config.quant_config.bits = 4
moe_config.quant_config.group_size = 32
moe_config.quant_config.zero_point = False
moe_config.gate_proj = self.gate_weights.data_ptr()
moe_config.up_proj = self.up_weights.data_ptr()
moe_config.down_proj = self.down_weights.data_ptr()
moe_config.gate_scale = self.gate_scales.data_ptr()
moe_config.up_scale = self.up_scales.data_ptr()
moe_config.down_scale = self.down_scales.data_ptr()
self.moe = AMXInt4_KGroup_MOE(moe_config)
self.cpu_infer.submit(self.moe.load_weights_task(physical_to_logical_map_cpu.data_ptr()))
self.cpu_infer.sync()
del self.gate_weights
del self.up_weights
del self.down_weights
del self.gate_scales
del self.up_scales
del self.down_scales
def submit_write_weight_scale_to_buffer(
self,
gpu_tp_count: int,
gpu_experts_num: int,
w13_weight_ptrs,
w13_scale_ptrs,
w2_weight_ptrs,
w2_scale_ptrs,
):
"""
Submit the write_weight_scale_to_buffer task for RAWINT4 KGroup AMX implementation.
This method submits the C++-exposed task `write_weight_scale_to_buffer_task` to the
shared CPUInfer queue. The pointer lists should be plain integer lists (e.g. from
tensor.data_ptr()).
"""
if self.moe is None:
raise RuntimeError("MoE instance not initialized; cannot submit write_weight_scale_to_buffer task.")
if not hasattr(self.moe, "write_weight_scale_to_buffer_task"):
raise NotImplementedError(
"write_weight_scale_to_buffer_task is not available for this backend implementation."
)
self.cpu_infer.submit(
self.moe.write_weight_scale_to_buffer_task(
gpu_tp_count,
gpu_experts_num,
w13_weight_ptrs,
w13_scale_ptrs,
w2_weight_ptrs,
w2_scale_ptrs,
)
)
def sync_write_weight_scale_to_buffer(self):
"""
Block until previously submitted write_weight_scale_to_buffer tasks finish.
"""
# The CPUInfer.sync() call blocks until pending tasks complete.
self.cpu_infer.sync()
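A hypothetical end-to-end use of this wrapper (paths and sizes are placeholders; w13_w, w13_s, w2_w, w2_s, and physical_to_logical_map_cpu are assumed preallocated torch tensors and are not from the source):

wrapper = RAWAMXMoEWrapper(
    layer_idx=0,
    num_experts=384,
    num_experts_per_tok=8,
    hidden_size=7168,
    moe_intermediate_size=2048,
    num_gpu_experts=0,
    cpuinfer_threads=80,
    threadpool_count=2,
    weight_path="/path/to/compressed/safetensors",
    chunked_prefill_size=25600,
)
wrapper.load_weights(physical_to_logical_map_cpu)
wrapper.submit_write_weight_scale_to_buffer(
    gpu_tp_count=1, gpu_experts_num=0,
    w13_weight_ptrs=[w13_w.data_ptr()], w13_scale_ptrs=[w13_s.data_ptr()],
    w2_weight_ptrs=[w2_w.data_ptr()], w2_scale_ptrs=[w2_s.data_ptr()],
)
wrapper.sync_write_weight_scale_to_buffer()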

View file

@@ -237,6 +237,56 @@ class SafeTensorLoader:
return name in self.tensor_file_map
class CompressedSafeTensorLoader(SafeTensorLoader):
"""Loader for compressed SafeTensor layouts (RAWINT4 weights)."""
def load_experts(self, base_key: str, device: str = "cpu"):
"""Load raw expert weights stored in compressed safetensor format."""
experts_prefix = f"{base_key}.mlp.experts"
num_experts = 0
while self.has_tensor(f"{experts_prefix}.{num_experts}.up_proj.weight_packed"):
num_experts += 1
if num_experts == 0:
raise ValueError(f"No experts found for key {experts_prefix}")
def load_projection(proj_name: str):
weight_entries = []
scale_entries = []
for exp_id in range(num_experts):
weight_key = f"{experts_prefix}.{exp_id}.{proj_name}_proj.weight_packed"
scale_key = f"{experts_prefix}.{exp_id}.{proj_name}_proj.weight_scale"
if not self.has_tensor(weight_key):
raise KeyError(f"Missing tensor: {weight_key}")
if not self.has_tensor(scale_key):
raise KeyError(f"Missing tensor: {scale_key}")
weight_tensor = self.load_tensor(weight_key, device).contiguous()
scale_tensor = self.load_tensor(scale_key, device).contiguous()
weight_entries.append(weight_tensor)
scale_entries.append(scale_tensor)
return weight_entries, scale_entries
gate_weights, gate_scales = load_projection("gate")
up_weights, up_scales = load_projection("up")
down_weights, down_scales = load_projection("down")
return {
"gate": gate_weights,
"up": up_weights,
"down": down_weights,
"gate_scale": gate_scales,
"up_scale": up_scales,
"down_scale": down_scales,
}
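A minimal usage sketch for the loader (the path and layer index are placeholders; the returned entries are torch tensors, as the RAWINT4 wrapper's torch.stack calls assume):

loader = CompressedSafeTensorLoader("/path/to/compressed/safetensors")
experts = loader.load_experts("model.layers.0")
print(len(experts["gate"]), experts["gate"][0].shape, experts["gate_scale"][0].dtype)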
class GGUFLoader:
"""
GGUF format loader using the official gguf library (gguf.gguf_reader.GGUFReader)