kvcache-ai-ktransformers/kt-kernel/python/cli/utils/model_registry.py
"""
Model registry for kt-cli.
Provides a registry of supported models with fuzzy matching capabilities.
"""
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Callable, Optional

import yaml

from kt_kernel.cli.config.settings import get_settings


@dataclass
class ModelInfo:
    """Information about a supported model."""

    name: str
    hf_repo: str
    aliases: list[str] = field(default_factory=list)
    type: str = "moe"  # moe, dense
    gpu_vram_gb: float = 0
    cpu_ram_gb: float = 0
    default_params: dict = field(default_factory=dict)
    description: str = ""
    description_zh: str = ""
    max_tensor_parallel_size: Optional[int] = None  # Maximum tensor parallel size for this model


# Built-in model registry
BUILTIN_MODELS: list[ModelInfo] = [
    ModelInfo(
        name="DeepSeek-V3-0324",
        hf_repo="deepseek-ai/DeepSeek-V3-0324",
        aliases=["deepseek-v3-0324", "deepseek-v3", "dsv3", "deepseek3", "v3-0324"],
        type="moe",
        default_params={
            "kt-num-gpu-experts": 1,
            "attention-backend": "triton",
            "disable-shared-experts-fusion": True,
            "kt-method": "AMXINT4",
        },
        description="DeepSeek V3-0324 685B MoE model (March 2025, improved benchmarks)",
description_zh="DeepSeek V3-0324 685B MoE 模型2025年3月改进的基准测试",
    ),
    ModelInfo(
        name="DeepSeek-V3.2",
        hf_repo="deepseek-ai/DeepSeek-V3.2",
        aliases=["deepseek-v3.2", "dsv3.2", "deepseek3.2", "v3.2"],
        type="moe",
        default_params={
            "kt-method": "FP8",
            "kt-gpu-prefill-token-threshold": 4096,
            "attention-backend": "flashinfer",
            "fp8-gemm-backend": "triton",
            "max-total-tokens": 100000,
            "max-running-requests": 16,
            "chunked-prefill-size": 32768,
            "mem-fraction-static": 0.80,
            "watchdog-timeout": 3000,
            "served-model-name": "DeepSeek-V3.2",
            "disable-shared-experts-fusion": True,
        },
        description="DeepSeek V3.2 671B MoE model (latest)",
        description_zh="DeepSeek V3.2 671B MoE 模型（最新）",
    ),
    ModelInfo(
        name="DeepSeek-R1-0528",
        hf_repo="deepseek-ai/DeepSeek-R1-0528",
        aliases=["deepseek-r1-0528", "deepseek-r1", "dsr1", "r1", "r1-0528"],
        type="moe",
        default_params={
            "kt-num-gpu-experts": 1,
            "attention-backend": "triton",
            "disable-shared-experts-fusion": True,
            "kt-method": "AMXINT4",
        },
        description="DeepSeek R1-0528 reasoning model (May 2025, improved reasoning depth)",
description_zh="DeepSeek R1-0528 推理模型2025年5月改进的推理深度",
    ),
    ModelInfo(
        name="Kimi-K2-Thinking",
        hf_repo="moonshotai/Kimi-K2-Thinking",
        aliases=["kimi-k2-thinking", "kimi-thinking", "k2-thinking", "kimi", "k2"],
        type="moe",
        default_params={
            "kt-method": "RAWINT4",
            "kt-gpu-prefill-token-threshold": 400,
            "attention-backend": "flashinfer",
            "max-total-tokens": 100000,
            "max-running-requests": 16,
            "chunked-prefill-size": 32768,
            "mem-fraction-static": 0.80,
            "watchdog-timeout": 3000,
            "served-model-name": "Kimi-K2-Thinking",
            "disable-shared-experts-fusion": True,
        },
        description="Moonshot Kimi K2 Thinking MoE model",
        description_zh="月之暗面 Kimi K2 Thinking MoE 模型",
    ),
    ModelInfo(
        name="MiniMax-M2",
        hf_repo="MiniMaxAI/MiniMax-M2",
        aliases=["minimax-m2", "m2"],
        type="moe",
        default_params={
            "kt-method": "FP8",
            "kt-gpu-prefill-token-threshold": 4096,
            "attention-backend": "flashinfer",
            "fp8-gemm-backend": "triton",
            "max-total-tokens": 100000,
            "max-running-requests": 16,
            "chunked-prefill-size": 32768,
            "mem-fraction-static": 0.80,
            "watchdog-timeout": 3000,
            "served-model-name": "MiniMax-M2",
            "disable-shared-experts-fusion": True,
            "tool-call-parser": "minimax-m2",
            "reasoning-parser": "minimax-append-think",
        },
        description="MiniMax M2 MoE model",
        description_zh="MiniMax M2 MoE 模型",
        max_tensor_parallel_size=4,  # M2 only supports up to 4-way tensor parallelism
    ),
    ModelInfo(
        name="MiniMax-M2.1",
        hf_repo="MiniMaxAI/MiniMax-M2.1",
        aliases=["minimax-m2.1", "m2.1"],
        type="moe",
        default_params={
            "kt-method": "FP8",
            "kt-gpu-prefill-token-threshold": 4096,
            "attention-backend": "flashinfer",
            "fp8-gemm-backend": "triton",
            "max-total-tokens": 100000,
            "max-running-requests": 16,
            "chunked-prefill-size": 32768,
            "mem-fraction-static": 0.80,
            "watchdog-timeout": 3000,
            "served-model-name": "MiniMax-M2.1",
            "disable-shared-experts-fusion": True,
            "tool-call-parser": "minimax-m2",
            "reasoning-parser": "minimax-append-think",
        },
        description="MiniMax M2.1 MoE model (enhanced multi-language programming)",
        description_zh="MiniMax M2.1 MoE 模型（增强多语言编程能力）",
        max_tensor_parallel_size=4,  # M2.1 only supports up to 4-way tensor parallelism
    ),
]


class ModelRegistry:
    """Registry of supported models with fuzzy matching."""

    def __init__(self):
        """Initialize the model registry."""
        self._models: dict[str, ModelInfo] = {}
        self._aliases: dict[str, str] = {}
        self._load_builtin_models()
        self._load_user_models()

    def _load_builtin_models(self) -> None:
        """Load built-in models."""
        for model in BUILTIN_MODELS:
            self._register(model)

    def _load_user_models(self) -> None:
        """Load user-defined models from config."""
        settings = get_settings()
        registry_file = settings.config_dir / "registry.yaml"
        if registry_file.exists():
            try:
                with open(registry_file, "r", encoding="utf-8") as f:
                    data = yaml.safe_load(f) or {}
                for name, info in data.get("models", {}).items():
                    model = ModelInfo(
                        name=name,
                        hf_repo=info.get("hf_repo", ""),
                        aliases=info.get("aliases", []),
                        type=info.get("type", "moe"),
                        gpu_vram_gb=info.get("gpu_vram_gb", 0),
                        cpu_ram_gb=info.get("cpu_ram_gb", 0),
                        default_params=info.get("default_params", {}),
                        description=info.get("description", ""),
                        description_zh=info.get("description_zh", ""),
                        max_tensor_parallel_size=info.get("max_tensor_parallel_size"),
                    )
                    self._register(model)
            except (yaml.YAMLError, OSError):
                # A malformed or unreadable user registry must not break the CLI
                pass
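
    # A user registry file (<config_dir>/registry.yaml) is expected to look
    # roughly like this. Illustrative sketch: the keys mirror the ModelInfo
    # fields read above, and "My-Custom-Model" is a hypothetical entry:
    #
    #   models:
    #     My-Custom-Model:
    #       hf_repo: my-org/My-Custom-Model
    #       aliases: [custom, mcm]
    #       type: moe
    #       default_params:
    #         kt-method: FP8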
    def _register(self, model: ModelInfo) -> None:
        """Register a model."""
        self._models[model.name.lower()] = model
        # Register aliases
        for alias in model.aliases:
            self._aliases[alias.lower()] = model.name.lower()

    def get(self, name: str) -> Optional[ModelInfo]:
        """Get a model by exact name or alias."""
        name_lower = name.lower()
        # Check direct match
        if name_lower in self._models:
            return self._models[name_lower]
        # Check aliases
        if name_lower in self._aliases:
            return self._models[self._aliases[name_lower]]
        return None
    def search(self, query: str, limit: int = 10) -> list[ModelInfo]:
        """Search for models using fuzzy matching.

        Args:
            query: Search query
            limit: Maximum number of results

        Returns:
            List of matching models, sorted by relevance
        """
        query_lower = query.lower()
        results: list[tuple[float, ModelInfo]] = []
        for model in self._models.values():
            score = self._match_score(query_lower, model)
            if score > 0:
                results.append((score, model))
        # Sort by score descending
        results.sort(key=lambda x: x[0], reverse=True)
        return [model for _, model in results[:limit]]

    def _match_score(self, query: str, model: ModelInfo) -> float:
        """Calculate match score for a model.

        Returns a score between 0 and 1, where 1 is an exact match.
        """
        # Check exact match
        if query == model.name.lower():
            return 1.0
        # Check alias exact match
        for alias in model.aliases:
            if query == alias.lower():
                return 0.95
        # Check if query is contained in name
        if query in model.name.lower():
            return 0.8
        # Check if query is contained in aliases
        for alias in model.aliases:
            if query in alias.lower():
                return 0.7
        # Check if query is contained in hf_repo
        if query in model.hf_repo.lower():
            return 0.6
        # Fuzzy matching - check if all query parts are present
        query_parts = re.split(r"[-_.\s]", query)
        name_lower = model.name.lower()
        matches = sum(1 for part in query_parts if part and part in name_lower)
        if matches > 0:
            return 0.5 * (matches / len(query_parts))
        return 0.0
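
    # Illustrative scores against the built-in DeepSeek-V3-0324 entry,
    # derived from the tiers above (assuming its default alias list):
    #   "deepseek-v3-0324" -> 1.0   (exact name)
    #   "dsv3"             -> 0.95  (exact alias)
    #   "deepseek"         -> 0.8   (substring of the name)
    #   "deepseek v3"      -> 0.5   (both split parts appear in the name)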
    def list_all(self) -> list[ModelInfo]:
        """List all registered models."""
        return list(self._models.values())
    def find_local_models(self, max_depth: int = 3) -> list[tuple[ModelInfo, Path]]:
        """Find models that are downloaded locally in any configured model path.

        Args:
            max_depth: Maximum depth to search within each model path (default: 3)

        Returns:
            List of (ModelInfo, path) tuples for local models
        """
        settings = get_settings()
        model_paths = settings.get_model_paths()
        results = []
        for model in self._models.values():
            found = False
            # Search in all configured model directories
            for models_dir in model_paths:
                if not models_dir.exists():
                    continue
                # Generate possible names to search for
                possible_names = [
                    model.name,
                    model.name.lower(),
                    model.hf_repo.split("/")[-1],
                    model.hf_repo.replace("/", "--"),
                ]
                # Search shallow-first, one level at a time up to max_depth
                # depth=0: direct children, depth=1: grandchildren, etc.
                for depth in range(max_depth):
                    for name in possible_names:
                        if depth == 0:
                            # Direct children: models_dir / name
                            search_paths = [models_dir / name]
                        else:
                            # Nested: glob at exactly this depth, e.g. "*/name", "*/*/name"
                            pattern = "/".join(["*"] * depth) + "/" + name
                            search_paths = list(models_dir.glob(pattern))
                        for path in search_paths:
                            # A directory only counts as a model if it holds a config.json
                            if path.is_dir() and (path / "config.json").exists():
                                results.append((model, path))
                                found = True
                                break
                        if found:
                            break
                    if found:
                        break
                if found:
                    break
        return results
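
# find_local_models example (illustrative, with hypothetical paths): if a
# configured model path /data/models contains hub/deepseek-ai--DeepSeek-V3-0324
# with a config.json inside, it is matched at depth 1 via the
# hf_repo.replace("/", "--") candidate name.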

# Global registry instance
_registry: Optional[ModelRegistry] = None


def get_registry() -> ModelRegistry:
    """Get the global model registry instance."""
    global _registry
    if _registry is None:
        _registry = ModelRegistry()
    return _registry
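
# Typical usage (illustrative sketch; assumes kt_kernel settings are configured):
#
#     registry = get_registry()
#     info = registry.get("dsv3")            # alias lookup -> DeepSeek-V3-0324
#     hits = registry.search("minimax")      # fuzzy search, best match first
#     local = registry.find_local_models()   # (ModelInfo, Path) pairs on disk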


# ============================================================================
# Model-specific parameter computation functions
# ============================================================================
def compute_deepseek_v3_gpu_experts(tensor_parallel_size: int, vram_per_gpu_gb: float) -> int:
    """Compute kt-num-gpu-experts for DeepSeek V3/R1 (~3 GB of spare VRAM per expert)."""
    per_gpu_gb = 16  # VRAM reserved per GPU for non-expert weights, KV cache, etc.
    if vram_per_gpu_gb < per_gpu_gb:
        return 0
    total_vram = int(tensor_parallel_size * (vram_per_gpu_gb - per_gpu_gb))
    return total_vram // 3


def compute_kimi_k2_thinking_gpu_experts(tensor_parallel_size: int, vram_per_gpu_gb: float) -> int:
    """Compute kt-num-gpu-experts for Kimi K2 Thinking (~1.5 GB of spare VRAM per expert)."""
    per_gpu_gb = 16
    if vram_per_gpu_gb < per_gpu_gb:
        return 0
    total_vram = int(tensor_parallel_size * (vram_per_gpu_gb - per_gpu_gb))
    return total_vram * 2 // 3


def compute_minimax_m2_gpu_experts(tensor_parallel_size: int, vram_per_gpu_gb: float) -> int:
    """Compute kt-num-gpu-experts for MiniMax M2/M2.1 (~1 GB of spare VRAM per expert)."""
    per_gpu_gb = 16
    if vram_per_gpu_gb < per_gpu_gb:
        return 0
    total_vram = int(tensor_parallel_size * (vram_per_gpu_gb - per_gpu_gb))
    return total_vram


# Model name to computation function mapping
MODEL_COMPUTE_FUNCTIONS: dict[str, Callable[[int, float], int]] = {
    "DeepSeek-V3-0324": compute_deepseek_v3_gpu_experts,
    "DeepSeek-V3.2": compute_deepseek_v3_gpu_experts,  # Same as V3-0324
    "DeepSeek-R1-0528": compute_deepseek_v3_gpu_experts,  # Same as V3-0324
    "Kimi-K2-Thinking": compute_kimi_k2_thinking_gpu_experts,
    "MiniMax-M2": compute_minimax_m2_gpu_experts,
    "MiniMax-M2.1": compute_minimax_m2_gpu_experts,  # Same as M2
}
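
# Worked example (illustrative): 4 GPUs with 48 GB each serving DeepSeek-V3-0324.
# Usable expert VRAM = 4 * (48 - 16) = 128 GB, so 128 // 3 = 42 GPU-resident experts:
#
#     fn = MODEL_COMPUTE_FUNCTIONS["DeepSeek-V3-0324"]
#     assert fn(4, 48) == 42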