Mirror of https://github.com/kvcache-ai/ktransformers.git

V4-Flash routed experts ship as native MXFP4 (E2M1 nibble + ue8m0 group
scale). Expose AMXFP4_KGroup_MOE through NativeMoEWrapper, add a loader
that handles V4's `layers.{L}.ffn.experts.{i}.{w1,w3,w2}.{weight,scale}`
naming and converts ue8m0 → bf16 via a lossless bit-cast, register the
model entry, and ship an end-to-end numerical validation script.
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
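
A minimal sketch of the ue8m0 → bf16 decode mentioned above (the helper name `ue8m0_to_bf16` and tensor shapes are illustrative, not the shipped loader API): ue8m0 stores only a biased 8-bit exponent, value = 2**(E - 127), and bf16's 1-8-7 sign/exponent/mantissa layout represents any such power of two exactly, which is why the cast is lossless.

import torch

def ue8m0_to_bf16(scales: torch.Tensor) -> torch.Tensor:
    # scales: uint8 tensor of biased exponents, value = 2**(E - 127).
    # bf16 is 1 sign + 8 exponent + 7 mantissa bits, so the power of two
    # 2**(E - 127) is exactly the 16-bit pattern E << 7 (zero mantissa).
    # E = 0 and E = 255 (NaN per the OCP MX spec) would need special-casing.
    bits = scales.to(torch.int16) << 7
    return bits.view(torch.bfloat16)
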
"""
|
||
Model registry for kt-cli.
|
||
|
||
Provides a registry of supported models with fuzzy matching capabilities.
|
||
"""
|
||
|
||
import re
|
||
from dataclasses import dataclass, field
|
||
from pathlib import Path
|
||
from typing import Callable, Optional
|
||
|
||
import yaml
|
||
|
||
from kt_kernel.cli.config.settings import get_settings
|
||
|
||
|
||
@dataclass
|
||
class ModelInfo:
|
||
"""Information about a supported model."""
|
||
|
||
name: str
|
||
hf_repo: str
|
||
aliases: list[str] = field(default_factory=list)
|
||
type: str = "moe" # moe, dense
|
||
gpu_vram_gb: float = 0
|
||
cpu_ram_gb: float = 0
|
||
default_params: dict = field(default_factory=dict)
|
||
description: str = ""
|
||
description_zh: str = ""
|
||
max_tensor_parallel_size: Optional[int] = None # Maximum tensor parallel size for this model
|
||
|
||
|
||
# Built-in model registry
BUILTIN_MODELS: list[ModelInfo] = [
    ModelInfo(
        name="DeepSeek-V3-0324",
        hf_repo="deepseek-ai/DeepSeek-V3-0324",
        aliases=["deepseek-v3-0324", "deepseek-v3", "dsv3", "deepseek3", "v3-0324"],
        type="moe",
        default_params={
            "kt-num-gpu-experts": 1,
            "attention-backend": "triton",
            "disable-shared-experts-fusion": True,
            "kt-method": "AMXINT4",
        },
        description="DeepSeek V3-0324 685B MoE model (March 2025, improved benchmarks)",
        description_zh="DeepSeek V3-0324 685B MoE 模型(2025年3月,改进的基准测试)",
    ),
    ModelInfo(
        name="DeepSeek-V3.2",
        hf_repo="deepseek-ai/DeepSeek-V3.2",
        aliases=["deepseek-v3.2", "dsv3.2", "deepseek3.2", "v3.2"],
        type="moe",
        default_params={
            "kt-method": "FP8",
            "kt-gpu-prefill-token-threshold": 4096,
            "attention-backend": "flashinfer",
            "fp8-gemm-backend": "triton",
            "max-total-tokens": 100000,
            "max-running-requests": 16,
            "chunked-prefill-size": 32768,
            "mem-fraction-static": 0.80,
            "watchdog-timeout": 3000,
            "served-model-name": "DeepSeek-V3.2",
            "disable-shared-experts-fusion": True,
        },
        description="DeepSeek V3.2 671B MoE model (latest)",
        description_zh="DeepSeek V3.2 671B MoE 模型(最新)",
    ),
    ModelInfo(
        name="DeepSeek-R1-0528",
        hf_repo="deepseek-ai/DeepSeek-R1-0528",
        aliases=["deepseek-r1-0528", "deepseek-r1", "dsr1", "r1", "r1-0528"],
        type="moe",
        default_params={
            "kt-num-gpu-experts": 1,
            "attention-backend": "triton",
            "disable-shared-experts-fusion": True,
            "kt-method": "AMXINT4",
        },
        description="DeepSeek R1-0528 reasoning model (May 2025, improved reasoning depth)",
        description_zh="DeepSeek R1-0528 推理模型(2025年5月,改进的推理深度)",
    ),
    ModelInfo(
        name="DeepSeek-V4-Flash",
        hf_repo="deepseek-ai/DeepSeek-V4-Flash",
        aliases=["deepseek-v4-flash", "deepseek-v4", "dsv4", "v4-flash", "v4"],
        type="moe",
        default_params={
            "kt-method": "MXFP4",
            "kt-gpu-prefill-token-threshold": 4096,
            "attention-backend": "flashinfer",
            "max-total-tokens": 100000,
            "max-running-requests": 16,
            "chunked-prefill-size": 32768,
            "mem-fraction-static": 0.80,
            "watchdog-timeout": 3000,
            "served-model-name": "DeepSeek-V4-Flash",
            "disable-shared-experts-fusion": True,
        },
        description="DeepSeek V4-Flash MoE model (native MXFP4 experts, MQA + sparse index attention)",
        description_zh="DeepSeek V4-Flash MoE 模型(原生 MXFP4 专家,MQA + 稀疏索引注意力)",
    ),
    ModelInfo(
        name="Kimi-K2-Thinking",
        hf_repo="moonshotai/Kimi-K2-Thinking",
        aliases=["kimi-k2-thinking", "kimi-thinking", "k2-thinking", "kimi", "k2"],
        type="moe",
        default_params={
            "kt-method": "RAWINT4",
            "kt-gpu-prefill-token-threshold": 400,
            "attention-backend": "flashinfer",
            "max-total-tokens": 100000,
            "max-running-requests": 16,
            "chunked-prefill-size": 32768,
            "mem-fraction-static": 0.80,
            "watchdog-timeout": 3000,
            "served-model-name": "Kimi-K2-Thinking",
            "disable-shared-experts-fusion": True,
        },
        description="Moonshot Kimi K2 Thinking MoE model",
        description_zh="月之暗面 Kimi K2 Thinking MoE 模型",
    ),
    ModelInfo(
        name="MiniMax-M2",
        hf_repo="MiniMaxAI/MiniMax-M2",
        aliases=["minimax-m2", "m2"],
        type="moe",
        default_params={
            "kt-method": "FP8",
            "kt-gpu-prefill-token-threshold": 4096,
            "attention-backend": "flashinfer",
            "fp8-gemm-backend": "triton",
            "max-total-tokens": 100000,
            "max-running-requests": 16,
            "chunked-prefill-size": 32768,
            "mem-fraction-static": 0.80,
            "watchdog-timeout": 3000,
            "served-model-name": "MiniMax-M2",
            "disable-shared-experts-fusion": True,
            "tool-call-parser": "minimax-m2",
            "reasoning-parser": "minimax-append-think",
        },
        description="MiniMax M2 MoE model",
        description_zh="MiniMax M2 MoE 模型",
        max_tensor_parallel_size=4,  # M2 only supports up to 4-way tensor parallelism
    ),
    ModelInfo(
        name="MiniMax-M2.1",
        hf_repo="MiniMaxAI/MiniMax-M2.1",
        aliases=["minimax-m2.1", "m2.1"],
        type="moe",
        default_params={
            "kt-method": "FP8",
            "kt-gpu-prefill-token-threshold": 4096,
            "attention-backend": "flashinfer",
            "fp8-gemm-backend": "triton",
            "max-total-tokens": 100000,
            "max-running-requests": 16,
            "chunked-prefill-size": 32768,
            "mem-fraction-static": 0.80,
            "watchdog-timeout": 3000,
            "served-model-name": "MiniMax-M2.1",
            "disable-shared-experts-fusion": True,
            "tool-call-parser": "minimax-m2",
            "reasoning-parser": "minimax-append-think",
        },
        description="MiniMax M2.1 MoE model (enhanced multi-language programming)",
        description_zh="MiniMax M2.1 MoE 模型(增强多语言编程能力)",
        max_tensor_parallel_size=4,  # M2.1 only supports up to 4-way tensor parallelism
    ),
]


class ModelRegistry:
    """Registry of supported models with fuzzy matching."""

    def __init__(self):
        """Initialize the model registry."""
        self._models: dict[str, ModelInfo] = {}
        self._aliases: dict[str, str] = {}
        self._load_builtin_models()
        self._load_user_models()

    def _load_builtin_models(self) -> None:
        """Load built-in models."""
        for model in BUILTIN_MODELS:
            self._register(model)

    def _load_user_models(self) -> None:
        """Load user-defined models from config."""
        settings = get_settings()
        registry_file = settings.config_dir / "registry.yaml"

        if registry_file.exists():
            try:
                with open(registry_file, "r", encoding="utf-8") as f:
                    data = yaml.safe_load(f) or {}

                for name, info in data.get("models", {}).items():
                    model = ModelInfo(
                        name=name,
                        hf_repo=info.get("hf_repo", ""),
                        aliases=info.get("aliases", []),
                        type=info.get("type", "moe"),
                        gpu_vram_gb=info.get("gpu_vram_gb", 0),
                        cpu_ram_gb=info.get("cpu_ram_gb", 0),
                        default_params=info.get("default_params", {}),
                        description=info.get("description", ""),
                        description_zh=info.get("description_zh", ""),
                        max_tensor_parallel_size=info.get("max_tensor_parallel_size"),
                    )
                    self._register(model)
            except (yaml.YAMLError, OSError):
                pass

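    # A minimal registry.yaml sketch (hypothetical; the layout is inferred from
    # the fields read in _load_user_models above, not an official schema):
    #
    #   models:
    #     My-Custom-Model:
    #       hf_repo: "my-org/My-Custom-Model"
    #       aliases: ["custom", "my-model"]
    #       type: "moe"
    #       default_params:
    #         kt-method: "FP8"
    #       max_tensor_parallel_size: 8
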
    def _register(self, model: ModelInfo) -> None:
        """Register a model."""
        self._models[model.name.lower()] = model

        # Register aliases
        for alias in model.aliases:
            self._aliases[alias.lower()] = model.name.lower()

    def get(self, name: str) -> Optional[ModelInfo]:
        """Get a model by exact name or alias."""
        name_lower = name.lower()

        # Check direct match
        if name_lower in self._models:
            return self._models[name_lower]

        # Check aliases
        if name_lower in self._aliases:
            return self._models[self._aliases[name_lower]]

        return None

    def search(self, query: str, limit: int = 10) -> list[ModelInfo]:
        """Search for models using fuzzy matching.

        Args:
            query: Search query
            limit: Maximum number of results

        Returns:
            List of matching models, sorted by relevance
        """
        query_lower = query.lower()
        results: list[tuple[float, ModelInfo]] = []

        for model in self._models.values():
            score = self._match_score(query_lower, model)
            if score > 0:
                results.append((score, model))

        # Sort by score descending
        results.sort(key=lambda x: x[0], reverse=True)

        return [model for _, model in results[:limit]]

    def _match_score(self, query: str, model: ModelInfo) -> float:
        """Calculate match score for a model.

        Returns a score between 0 and 1, where 1 is an exact match.
        """
        # Check exact match
        if query == model.name.lower():
            return 1.0

        # Check alias exact match
        for alias in model.aliases:
            if query == alias.lower():
                return 0.95

        # Check if query is contained in name
        if query in model.name.lower():
            return 0.8

        # Check if query is contained in aliases
        for alias in model.aliases:
            if query in alias.lower():
                return 0.7

        # Check if query is contained in hf_repo
        if query in model.hf_repo.lower():
            return 0.6

        # Fuzzy matching - check if all query parts are present
        query_parts = re.split(r"[-_.\s]", query)
        name_lower = model.name.lower()

        matches = sum(1 for part in query_parts if part and part in name_lower)
        if matches > 0:
            return 0.5 * (matches / len(query_parts))

        return 0.0

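    # Worked scoring examples against the built-in registry (illustrative,
    # following the tiers above):
    #   "dsv4"     -> exact alias of DeepSeek-V4-Flash           -> 0.95
    #   "deepseek" -> substring of the name "deepseek-v4-flash"  -> 0.8
    #   "v4 flash" -> split into ["v4", "flash"], both in name   -> 0.5 * (2/2) = 0.5
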
    def list_all(self) -> list[ModelInfo]:
        """List all registered models."""
        return list(self._models.values())

    def find_local_models(self, max_depth: int = 3) -> list[tuple[ModelInfo, Path]]:
        """Find models that are downloaded locally in any configured model path.

        Args:
            max_depth: Maximum depth to search within each model path (default: 3)

        Returns:
            List of (ModelInfo, path) tuples for local models
        """
        settings = get_settings()
        model_paths = settings.get_model_paths()
        results = []

        for model in self._models.values():
            found = False
            # Search in all configured model directories
            for models_dir in model_paths:
                if not models_dir.exists():
                    continue

                # Generate possible names to search for
                possible_names = [
                    model.name,
                    model.name.lower(),
                    model.hf_repo.split("/")[-1],
                    model.hf_repo.replace("/", "--"),
                ]

                # Search level by level up to max_depth.
                # depth=0: direct children (models_dir / name),
                # depth=1: one level deeper (models_dir / */name), etc.
                for depth in range(max_depth):
                    for name in possible_names:
                        if depth == 0:
                            # Direct children: models_dir / name
                            search_paths = [models_dir / name]
                        else:
                            # Nested: glob for directories matching the name
                            # at exactly this depth
                            search_paths = list(models_dir.glob("*/" * depth + name))

                        for path in search_paths:
                            if path.exists() and (path / "config.json").exists():
                                results.append((model, path))
                                found = True
                                break

                        if found:
                            break

                    if found:
                        break

                if found:
                    break

        return results


# Global registry instance
_registry: Optional[ModelRegistry] = None


def get_registry() -> ModelRegistry:
    """Get the global model registry instance."""
    global _registry
    if _registry is None:
        _registry = ModelRegistry()
    return _registry


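# Example lookup session (illustrative; results depend on the registry
# contents at runtime):
#
#   registry = get_registry()
#   registry.get("dsv4").hf_repo                  # "deepseek-ai/DeepSeek-V4-Flash"
#   [m.name for m in registry.search("minimax")]  # ["MiniMax-M2", "MiniMax-M2.1"]

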
# ============================================================================
# Model-specific parameter computation functions
# ============================================================================


def compute_deepseek_v3_gpu_experts(tensor_parallel_size: int, vram_per_gpu_gb: float) -> int:
    """Compute kt-num-gpu-experts for DeepSeek V3-0324 / V3.2 / R1-0528."""
    per_gpu_gb = 16
    if vram_per_gpu_gb < per_gpu_gb:
        return 0
    total_vram = int(tensor_parallel_size * (vram_per_gpu_gb - per_gpu_gb))

    return total_vram // 3


def compute_deepseek_v4_gpu_experts(tensor_parallel_size: int, vram_per_gpu_gb: float) -> int:
    """Compute kt-num-gpu-experts for DeepSeek-V4-Flash.

    V4 uses MXFP4 experts (~0.5 bytes/param vs V3 FP8's 1 byte/param) so each GPU
    can hold ~2x more experts per VRAM unit than V3 at the same fragmentation.
    """
    per_gpu_gb = 16
    if vram_per_gpu_gb < per_gpu_gb:
        return 0
    total_vram = int(tensor_parallel_size * (vram_per_gpu_gb - per_gpu_gb))
    return total_vram * 2 // 3


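# Illustrative comparison of the V3 vs V4 sizing above (hypothetical hardware,
# 8 GPUs x 48 GB): usable VRAM = 8 * (48 - 16) = 256 GB, so V3 gets
# 256 // 3 = 85 experts while V4's MXFP4 packing gets 256 * 2 // 3 = 170.

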
def compute_kimi_k2_thinking_gpu_experts(tensor_parallel_size: int, vram_per_gpu_gb: float) -> int:
    """Compute kt-num-gpu-experts for Kimi K2 Thinking."""
    per_gpu_gb = 16
    if vram_per_gpu_gb < per_gpu_gb:
        return 0
    total_vram = int(tensor_parallel_size * (vram_per_gpu_gb - per_gpu_gb))

    return total_vram * 2 // 3


def compute_minimax_m2_gpu_experts(tensor_parallel_size: int, vram_per_gpu_gb: float) -> int:
    """Compute kt-num-gpu-experts for MiniMax M2/M2.1."""
    per_gpu_gb = 16
    if vram_per_gpu_gb < per_gpu_gb:
        return 0
    total_vram = int(tensor_parallel_size * (vram_per_gpu_gb - per_gpu_gb))

    return total_vram


# Model name to computation function mapping
MODEL_COMPUTE_FUNCTIONS: dict[str, Callable[[int, float], int]] = {
    "DeepSeek-V3-0324": compute_deepseek_v3_gpu_experts,
    "DeepSeek-V3.2": compute_deepseek_v3_gpu_experts,  # Same as V3-0324
    "DeepSeek-R1-0528": compute_deepseek_v3_gpu_experts,  # Same as V3-0324
    "DeepSeek-V4-Flash": compute_deepseek_v4_gpu_experts,
    "Kimi-K2-Thinking": compute_kimi_k2_thinking_gpu_experts,
    "MiniMax-M2": compute_minimax_m2_gpu_experts,
    "MiniMax-M2.1": compute_minimax_m2_gpu_experts,  # Same as M2
}
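
# Example dispatch (hypothetical hardware: 2 GPUs x 96 GB):
#
#   fn = MODEL_COMPUTE_FUNCTIONS["DeepSeek-V4-Flash"]
#   fn(2, 96.0)  # 2 * (96 - 16) = 160 GB usable -> 160 * 2 // 3 = 106 experts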