mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2026-04-28 11:49:51 +00:00
(kt-kernel): add numa_nodes parameter for explicit NUMA node mapping (#1891)
Add numa_nodes parameter to BaseMoEWrapper and all subclasses, allowing users to explicitly specify which NUMA node IDs to use for subpool mapping instead of always defaulting to sequential [0, 1, ..., N-1]. This enables running multiple KTransformers instances on different NUMA nodes of the same machine, e.g. --kt-threadpool-count 1 --kt-numa-nodes 1 to bind to NUMA node 1. Previously this required external numactl workarounds since subpool_numa_map was hardcoded to start from 0.
This commit is contained in:
parent
bdf4bb76c5
commit
3903c9afcc
5 changed files with 34 additions and 6 deletions
|
|
@ -65,6 +65,7 @@ class KTMoEWrapper:
|
|||
cpu_save: bool = False,
|
||||
max_deferred_experts_per_token: Optional[int] = None,
|
||||
method: str = "AMXINT4",
|
||||
numa_nodes: Optional[List[int]] = None,
|
||||
):
|
||||
"""
|
||||
Factory method to create the appropriate backend implementation.
|
||||
|
|
@ -85,6 +86,7 @@ class KTMoEWrapper:
|
|||
chunked_prefill_size: Maximum prefill chunk size
|
||||
cpu_save: Whether to save weights to CPU memory
|
||||
max_deferred_experts_per_token: Number of experts per token to defer. Defaults to 0.
|
||||
method: Backend method ("AMXINT4", "AMXINT8", "RAWINT4", "FP8", "BF16", "LLAMAFILE", "MOE_INT4", "MOE_INT8")
|
||||
numa_nodes: Explicit list of NUMA node IDs for subpool mapping. If None, defaults to sequential [0, 1, ..., N-1].
|
||||
|
||||
Returns:
|
||||
|
|
@ -117,6 +119,7 @@ class KTMoEWrapper:
|
|||
cpu_save=cpu_save,
|
||||
max_deferred_experts_per_token=max_deferred_experts_per_token,
|
||||
method=method,
|
||||
numa_nodes=numa_nodes,
|
||||
)
|
||||
|
||||
# Forward static methods to the base class
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue