Kt minimax (#1742)

[feat]: fp8 kernel and kt-cli support
ErvinXie 2025-12-24 15:39:44 +08:00 committed by GitHub
parent e7d277d163
commit d8046e1bb4
65 changed files with 12111 additions and 2502 deletions

@@ -0,0 +1,374 @@
"""
Model registry for kt-cli.
Provides a registry of supported models with fuzzy matching capabilities.
"""
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Callable, Optional
import yaml
from kt_kernel.cli.config.settings import get_settings
@dataclass
class ModelInfo:
"""Information about a supported model."""
name: str
hf_repo: str
aliases: list[str] = field(default_factory=list)
    type: str = "moe"  # "moe" or "dense"
gpu_vram_gb: float = 0
cpu_ram_gb: float = 0
default_params: dict = field(default_factory=dict)
description: str = ""
description_zh: str = ""
max_tensor_parallel_size: Optional[int] = None # Maximum tensor parallel size for this model
# Built-in model registry
BUILTIN_MODELS: list[ModelInfo] = [
ModelInfo(
name="DeepSeek-V3-0324",
hf_repo="deepseek-ai/DeepSeek-V3-0324",
aliases=["deepseek-v3-0324", "deepseek-v3", "dsv3", "deepseek3", "v3-0324"],
type="moe",
default_params={
"kt-num-gpu-experts": 1,
"attention-backend": "triton",
"disable-shared-experts-fusion": True,
"kt-method": "AMXINT4",
},
description="DeepSeek V3-0324 685B MoE model (March 2025, improved benchmarks)",
description_zh="DeepSeek V3-0324 685B MoE 模型2025年3月改进的基准测试",
),
ModelInfo(
name="DeepSeek-V3.2",
hf_repo="deepseek-ai/DeepSeek-V3.2",
aliases=["deepseek-v3.2", "dsv3.2", "deepseek3.2", "v3.2"],
type="moe",
default_params={
"kt-method": "FP8",
"kt-gpu-prefill-token-threshold": 4096,
"attention-backend": "flashinfer",
"fp8-gemm-backend": "triton",
"max-total-tokens": 100000,
"max-running-requests": 16,
"chunked-prefill-size": 32768,
"mem-fraction-static": 0.80,
"watchdog-timeout": 3000,
"served-model-name": "DeepSeek-V3.2",
"disable-shared-experts-fusion": True,
},
description="DeepSeek V3.2 671B MoE model (latest)",
description_zh="DeepSeek V3.2 671B MoE 模型(最新)",
),
ModelInfo(
name="DeepSeek-R1-0528",
hf_repo="deepseek-ai/DeepSeek-R1-0528",
aliases=["deepseek-r1-0528", "deepseek-r1", "dsr1", "r1", "r1-0528"],
type="moe",
default_params={
"kt-num-gpu-experts": 1,
"attention-backend": "triton",
"disable-shared-experts-fusion": True,
"kt-method": "AMXINT4",
},
description="DeepSeek R1-0528 reasoning model (May 2025, improved reasoning depth)",
description_zh="DeepSeek R1-0528 推理模型2025年5月改进的推理深度",
),
ModelInfo(
name="Kimi-K2-Thinking",
hf_repo="moonshotai/Kimi-K2-Thinking",
aliases=["kimi-k2-thinking", "kimi-thinking", "k2-thinking", "kimi", "k2"],
type="moe",
default_params={
"kt-method": "RAWINT4",
"kt-gpu-prefill-token-threshold": 400,
"attention-backend": "flashinfer",
"max-total-tokens": 100000,
"max-running-requests": 16,
"chunked-prefill-size": 32768,
"mem-fraction-static": 0.80,
"watchdog-timeout": 3000,
"served-model-name": "Kimi-K2-Thinking",
"disable-shared-experts-fusion": True,
},
description="Moonshot Kimi K2 Thinking MoE model",
description_zh="月之暗面 Kimi K2 Thinking MoE 模型",
),
ModelInfo(
name="MiniMax-M2",
hf_repo="MiniMaxAI/MiniMax-M2",
aliases=["minimax-m2", "m2"],
type="moe",
default_params={
"kt-method": "FP8",
"kt-gpu-prefill-token-threshold": 4096,
"attention-backend": "flashinfer",
"fp8-gemm-backend": "triton",
"max-total-tokens": 100000,
"max-running-requests": 16,
"chunked-prefill-size": 32768,
"mem-fraction-static": 0.80,
"watchdog-timeout": 3000,
"served-model-name": "MiniMax-M2",
"disable-shared-experts-fusion": True,
"tool-call-parser": "minimax-m2",
"reasoning-parser": "minimax-append-think",
},
description="MiniMax M2 MoE model",
description_zh="MiniMax M2 MoE 模型",
max_tensor_parallel_size=4, # M2 only supports up to 4-way tensor parallelism
),
ModelInfo(
name="MiniMax-M2.1",
hf_repo="MiniMaxAI/MiniMax-M2.1",
aliases=["minimax-m2.1", "m2.1"],
type="moe",
default_params={
"kt-method": "FP8",
"kt-gpu-prefill-token-threshold": 4096,
"attention-backend": "flashinfer",
"fp8-gemm-backend": "triton",
"max-total-tokens": 100000,
"max-running-requests": 16,
"chunked-prefill-size": 32768,
"mem-fraction-static": 0.80,
"watchdog-timeout": 3000,
"served-model-name": "MiniMax-M2.1",
"disable-shared-experts-fusion": True,
"tool-call-parser": "minimax-m2",
"reasoning-parser": "minimax-append-think",
},
description="MiniMax M2.1 MoE model (enhanced multi-language programming)",
description_zh="MiniMax M2.1 MoE 模型(增强多语言编程能力)",
max_tensor_parallel_size=4, # M2.1 only supports up to 4-way tensor parallelism
),
]
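# User-defined models can be registered alongside the built-ins via a
# registry.yaml in the kt-cli config directory (loaded by
# ModelRegistry._load_user_models below). A minimal sketch of the expected
# schema, inferred from the loader; the model name and values here are
# hypothetical:
#
#   models:
#     My-Custom-MoE:
#       hf_repo: "my-org/My-Custom-MoE"
#       aliases: ["custom-moe"]
#       type: "moe"
#       default_params:
#         kt-method: "FP8"
#       max_tensor_parallel_size: 8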
class ModelRegistry:
"""Registry of supported models with fuzzy matching."""
def __init__(self):
"""Initialize the model registry."""
self._models: dict[str, ModelInfo] = {}
self._aliases: dict[str, str] = {}
self._load_builtin_models()
self._load_user_models()
def _load_builtin_models(self) -> None:
"""Load built-in models."""
for model in BUILTIN_MODELS:
self._register(model)
def _load_user_models(self) -> None:
"""Load user-defined models from config."""
settings = get_settings()
registry_file = settings.config_dir / "registry.yaml"
if registry_file.exists():
try:
with open(registry_file, "r", encoding="utf-8") as f:
data = yaml.safe_load(f) or {}
for name, info in data.get("models", {}).items():
model = ModelInfo(
name=name,
hf_repo=info.get("hf_repo", ""),
aliases=info.get("aliases", []),
type=info.get("type", "moe"),
gpu_vram_gb=info.get("gpu_vram_gb", 0),
cpu_ram_gb=info.get("cpu_ram_gb", 0),
default_params=info.get("default_params", {}),
description=info.get("description", ""),
description_zh=info.get("description_zh", ""),
max_tensor_parallel_size=info.get("max_tensor_parallel_size"),
)
self._register(model)
            except (yaml.YAMLError, OSError):
                # A malformed or unreadable registry file is ignored;
                # the built-in models remain available.
                pass
def _register(self, model: ModelInfo) -> None:
"""Register a model."""
self._models[model.name.lower()] = model
# Register aliases
for alias in model.aliases:
self._aliases[alias.lower()] = model.name.lower()
def get(self, name: str) -> Optional[ModelInfo]:
"""Get a model by exact name or alias."""
name_lower = name.lower()
# Check direct match
if name_lower in self._models:
return self._models[name_lower]
# Check aliases
if name_lower in self._aliases:
return self._models[self._aliases[name_lower]]
return None
    def search(self, query: str, limit: int = 10) -> list[ModelInfo]:
        """Search for models using fuzzy matching.

        Args:
            query: Search query.
            limit: Maximum number of results.

        Returns:
            List of matching models, sorted by relevance.
        """
query_lower = query.lower()
results: list[tuple[float, ModelInfo]] = []
for model in self._models.values():
score = self._match_score(query_lower, model)
if score > 0:
results.append((score, model))
# Sort by score descending
results.sort(key=lambda x: x[0], reverse=True)
return [model for _, model in results[:limit]]
    def _match_score(self, query: str, model: ModelInfo) -> float:
        """Calculate the match score for a model.

        Returns a score between 0 and 1, where 1 is an exact match.
        """
# Check exact match
if query == model.name.lower():
return 1.0
# Check alias exact match
for alias in model.aliases:
if query == alias.lower():
return 0.95
# Check if query is contained in name
if query in model.name.lower():
return 0.8
# Check if query is contained in aliases
for alias in model.aliases:
if query in alias.lower():
return 0.7
# Check if query is contained in hf_repo
if query in model.hf_repo.lower():
return 0.6
# Fuzzy matching - check if all query parts are present
query_parts = re.split(r"[-_.\s]", query)
name_lower = model.name.lower()
matches = sum(1 for part in query_parts if part and part in name_lower)
if matches > 0:
return 0.5 * (matches / len(query_parts))
return 0.0
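    # Illustrative scores against the built-in registry (derived from the
    # tiers above, not measured output):
    #   "deepseek-v3-0324" -> 1.0  (exact name match)
    #   "dsv3"             -> 0.95 (exact alias match)
    #   "minimax"          -> 0.8  (substring of "MiniMax-M2" / "MiniMax-M2.1")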
def list_all(self) -> list[ModelInfo]:
"""List all registered models."""
return list(self._models.values())
    def find_local_models(self) -> list[tuple[ModelInfo, Path]]:
        """Find models that are downloaded locally in any configured model path.

        Returns:
            List of (ModelInfo, path) tuples for local models.
        """
settings = get_settings()
model_paths = settings.get_model_paths()
results = []
for model in self._models.values():
found = False
# Search in all configured model directories
for models_dir in model_paths:
if not models_dir.exists():
continue
# Check common path patterns
possible_paths = [
models_dir / model.name,
models_dir / model.name.lower(),
models_dir / model.hf_repo.split("/")[-1],
models_dir / model.hf_repo.replace("/", "--"),
]
for path in possible_paths:
if path.exists() and (path / "config.json").exists():
results.append((model, path))
found = True
break
if found:
break
return results
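    # For example, DeepSeek-V3-0324 would be detected at any of the following
    # (illustrative; <models_dir> stands for a configured model path):
    #   <models_dir>/DeepSeek-V3-0324
    #   <models_dir>/deepseek-v3-0324
    #   <models_dir>/deepseek-ai--DeepSeek-V3-0324
    # provided the directory contains a config.json.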
# Global registry instance
_registry: Optional[ModelRegistry] = None
def get_registry() -> ModelRegistry:
"""Get the global model registry instance."""
global _registry
if _registry is None:
_registry = ModelRegistry()
return _registry
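# A minimal usage sketch (illustrative; assumes this module is imported via
# its kt-cli package path):
#
#   registry = get_registry()
#   model = registry.get("dsv3")          # alias lookup -> DeepSeek-V3-0324
#   hits = registry.search("minimax")     # fuzzy search -> MiniMax-M2, MiniMax-M2.1
#   local = registry.find_local_models()  # [(ModelInfo, Path), ...]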
# ============================================================================
# Model-specific parameter computation functions
# ============================================================================
def compute_deepseek_v3_gpu_experts(tensor_parallel_size: int, vram_per_gpu_gb: float) -> int:
    """Compute kt-num-gpu-experts for the DeepSeek V3 / V3.2 / R1 models."""
    per_gpu_gb = 16  # VRAM (GB) reserved per GPU before placing experts
    if vram_per_gpu_gb < per_gpu_gb:
        return 0
    total_vram = int(tensor_parallel_size * (vram_per_gpu_gb - per_gpu_gb))
    return total_vram // 3  # one GPU expert per 3 GB of remaining VRAM
def compute_kimi_k2_thinking_gpu_experts(tensor_parallel_size: int, vram_per_gpu_gb: float) -> int:
"""Compute kt-num-gpu-experts for Kimi K2 Thinking."""
    per_gpu_gb = 16  # VRAM (GB) reserved per GPU before placing experts
    if vram_per_gpu_gb < per_gpu_gb:
        return 0
    total_vram = int(tensor_parallel_size * (vram_per_gpu_gb - per_gpu_gb))
    return total_vram * 2 // 3  # two GPU experts per 3 GB of remaining VRAM
def compute_minimax_m2_gpu_experts(tensor_parallel_size: int, vram_per_gpu_gb: float) -> int:
"""Compute kt-num-gpu-experts for MiniMax M2/M2.1."""
    per_gpu_gb = 16  # VRAM (GB) reserved per GPU before placing experts
    if vram_per_gpu_gb < per_gpu_gb:
        return 0
    total_vram = int(tensor_parallel_size * (vram_per_gpu_gb - per_gpu_gb))
    return total_vram  # one GPU expert per GB of remaining VRAM
# Model name to computation function mapping
MODEL_COMPUTE_FUNCTIONS: dict[str, Callable[[int, float], int]] = {
"DeepSeek-V3-0324": compute_deepseek_v3_gpu_experts,
"DeepSeek-V3.2": compute_deepseek_v3_gpu_experts, # Same as V3-0324
"DeepSeek-R1-0528": compute_deepseek_v3_gpu_experts, # Same as V3-0324
"Kimi-K2-Thinking": compute_kimi_k2_thinking_gpu_experts,
"MiniMax-M2": compute_minimax_m2_gpu_experts,
"MiniMax-M2.1": compute_minimax_m2_gpu_experts, # Same as M2
}
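# Worked example (illustrative arithmetic, not measured output): with
# tensor_parallel_size=4 and vram_per_gpu_gb=80.0, each function reserves
# 16 GB per GPU, leaving 4 * (80 - 16) = 256 GB of VRAM for experts:
#   compute_deepseek_v3_gpu_experts(4, 80.0)      -> 256 // 3     == 85
#   compute_kimi_k2_thinking_gpu_experts(4, 80.0) -> 256 * 2 // 3 == 170
#   compute_minimax_m2_gpu_experts(4, 80.0)       -> 256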