mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2026-04-29 04:09:52 +00:00
kt-cli enhancement (#1834)
Some checks failed
Book-CI / test (push) Has been cancelled
Book-CI / test-1 (push) Has been cancelled
Book-CI / test-2 (push) Has been cancelled
Deploy / deploy (macos-latest) (push) Has been cancelled
Deploy / deploy (ubuntu-latest) (push) Has been cancelled
Deploy / deploy (windows-latest) (push) Has been cancelled
Some checks failed
Book-CI / test (push) Has been cancelled
Book-CI / test-1 (push) Has been cancelled
Book-CI / test-2 (push) Has been cancelled
Deploy / deploy (macos-latest) (push) Has been cancelled
Deploy / deploy (ubuntu-latest) (push) Has been cancelled
Deploy / deploy (windows-latest) (push) Has been cancelled
* [feat]: redesign kt run interactive configuration with i18n support - Redesign kt run with 8-step interactive flow (model selection, inference method, NUMA/CPU, GPU experts, KV cache, GPU/TP selection, parsers, host/port) - Add configuration save/load system (~/.ktransformers/run_configs.yaml) - Add i18n support for kt chat (en/zh translations) - Add universal input validators with auto-retry and Chinese comma support - Add port availability checker with auto-suggestion - Add parser configuration (--tool-call-parser, --reasoning-parser) - Remove tuna command and clean up redundant files - Fix: variable reference bug in run.py, filter to show only MoE models * [feat]: unify model selection UI and enable shared experts fusion by default - Unify kt run model selection table with kt model list display * Add Total size, MoE Size, Repo, and SHA256 status columns * Use consistent formatting and styling * Improve user decision-making with more information - Enable --disable-shared-experts-fusion by default * Change default value from False to True * Users can still override with --enable-shared-experts-fusion * [feat]: improve kt chat with performance metrics and better CJK support - Add performance metrics display after each response * Total time, TTFT (Time To First Token), TPOT (Time Per Output Token) * Accurate input/output token counts using model tokenizer * Fallback to estimation if tokenizer unavailable * Metrics shown in dim style (not prominent) - Fix Chinese character input issues * Replace Prompt.ask() with console.input() for better CJK support * Fixes backspace deletion showing half-characters - Suppress NumPy subnormal warnings * Filter "The value of the smallest subnormal" warnings * Cleaner CLI output on certain hardware environments * [fix]: correct TTFT measurement in kt chat - Move start_time initialization before API call - Previously start_time was set when receiving first chunk, causing TTFT ≈ 0ms - Now correctly measures time from request sent to 
first token received * [docs]: 添加 Clawdbot 集成指南 - KTransformers 企业级 AI 助手部署方案 * [docs]: 强调推荐使用 Kimi K2.5 作为核心模型,突出企业级推理能力 * [docs]: 添加 Clawdbot 飞书接入教程链接 * [feat]: improve CLI table display, model verification, and chat experience - Add sequence number (#) column to all model tables by default - Filter kt edit to show only MoE GPU models (exclude AMX) - Extend kt model verify to check *.json and *.py files in addition to weights - Fix re-verification bug where repaired files caused false failures - Suppress tokenizer debug output in kt chat token counting * [fix]: fix cpu cores. --------- Co-authored-by: skqliao <skqliao@gmail.com>
This commit is contained in:
parent
4f64665758
commit
56cbd69ac4
23 changed files with 10327 additions and 781 deletions
347
kt-kernel/python/cli/utils/quant_interactive.py
Normal file
347
kt-kernel/python/cli/utils/quant_interactive.py
Normal file
|
|
@ -0,0 +1,347 @@
|
|||
"""
|
||||
Interactive configuration for kt quant command.
|
||||
|
||||
Provides rich, multi-step interactive configuration for model quantization.
|
||||
"""
|
||||
|
||||
from typing import Optional, Dict, Any
|
||||
from pathlib import Path
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
from rich.panel import Panel
|
||||
from rich.prompt import Prompt, Confirm, IntPrompt
|
||||
from kt_kernel.cli.i18n import t
|
||||
|
||||
|
||||
# Module-level Rich console shared by every prompt and print in this module.
console = Console()
|
||||
|
||||
|
||||
def select_model_to_quantize() -> Optional[Any]:
    """Interactively pick a model eligible for quantization.

    Eligible models are safetensors-format, MoE, and not already AMX
    weights. Returns the selected registry entry, or None when no model
    qualifies or the user enters an out-of-range index.
    """
    from kt_kernel.cli.utils.user_model_registry import UserModelRegistry
    from kt_kernel.cli.commands.model import is_amx_weights, SHA256_STATUS_MAP
    from kt_kernel.cli.utils.model_table_builder import build_moe_gpu_table

    # Collect only safetensors MoE models that are not AMX-converted.
    candidates = []
    for entry in UserModelRegistry().list_models():
        if entry.format != "safetensors":
            continue
        amx, _ = is_amx_weights(entry.path)
        if amx or not entry.is_moe:
            continue
        candidates.append(entry)

    if not candidates:
        console.print(f"[yellow]{t('quant_no_moe_models')}[/yellow]")
        console.print()
        console.print(f" {t('quant_only_moe')}")
        console.print()
        console.print(f" {t('quant_add_models', command='kt model scan')}")
        console.print(f" {t('quant_add_models', command='kt model add <path>')}")
        return None

    console.print()
    console.print(f"[bold green]{t('quant_moe_available')}[/bold green]")
    console.print()

    # Shared table builder keeps this listing consistent with `kt model list`.
    table, shown = build_moe_gpu_table(
        models=candidates, status_map=SHA256_STATUS_MAP, show_index=True, start_index=1
    )
    console.print(table)
    console.print()

    picked = IntPrompt.ask(t("quant_select_model"), default=1, show_choices=False)

    if not 1 <= picked <= len(shown):
        console.print(f"[red]{t('quant_invalid_choice')}[/red]")
        return None

    return shown[picked - 1]
|
||||
|
||||
|
||||
def configure_quantization_method() -> Dict[str, str]:
    """Prompt for the quantization method and the source weight dtype.

    Returns:
        Dict with "method" ("int4"/"int8") and "input_type"
        ("fp8"/"fp16"/"bf16").
    """
    console.print()
    console.print(Panel(f"[bold cyan]{t('quant_step2_method')}[/bold cyan]", expand=False))
    console.print()

    # Quantization method menu.
    console.print(f"[bold]{t('quant_method_label')}[/bold]")
    console.print(f" [cyan][1][/cyan] {t('quant_int4_desc')}")
    console.print(f" [cyan][2][/cyan] {t('quant_int8_desc')}")
    console.print()

    picked = Prompt.ask(t("quant_select_method"), choices=["1", "2"], default="1")
    method = {"1": "int4", "2": "int8"}[picked]

    # Input dtype menu.
    console.print()
    console.print(f"[bold]{t('quant_input_type_label')}[/bold]")
    console.print(f" [cyan][1][/cyan] {t('quant_fp8_desc')}")
    console.print(f" [cyan][2][/cyan] {t('quant_fp16_desc')}")
    console.print(f" [cyan][3][/cyan] {t('quant_bf16_desc')}")
    console.print()

    picked = Prompt.ask(t("quant_select_input_type"), choices=["1", "2", "3"], default="1")
    input_type = {"1": "fp8", "2": "fp16", "3": "bf16"}[picked]

    return {"method": method, "input_type": input_type}
|
||||
|
||||
|
||||
def configure_cpu_params(max_cores: int, max_numa: int) -> Dict[str, Any]:
    """Prompt for CPU thread count, NUMA node count, and GPU usage.

    Args:
        max_cores: Upper bound for worker threads (logical core count).
        max_numa: Upper bound for NUMA nodes.

    Returns:
        Dict with "cpu_threads" (int), "numa_nodes" (int), "use_gpu" (bool).

    Out-of-range answers silently fall back to the default value instead of
    re-prompting.
    """
    console.print()
    console.print(Panel(f"[bold cyan]{t('quant_step3_cpu')}[/bold cyan]", expand=False))
    console.print()

    def _in_range_or_default(value: int, min_val: int, max_val: int, default: int) -> int:
        """Return value if within [min_val, max_val], else the default."""
        # The previous "clamp" helper contained dead code: the inner
        # max(min_val, min(value, max_val)) could only run when value was
        # already in range, so it never clamped. The actual behavior —
        # kept here — is "accept in-range input, otherwise use default".
        return value if min_val <= value <= max_val else default

    # Default to 80% of logical cores to leave headroom for the system.
    default_threads = int(max_cores * 0.8)
    cpu_threads = IntPrompt.ask(t("quant_cpu_threads_prompt", max=max_cores), default=default_threads)
    cpu_threads = _in_range_or_default(cpu_threads, 1, max_cores, default_threads)

    numa_nodes = IntPrompt.ask(t("quant_numa_nodes_prompt", max=max_numa), default=max_numa)
    numa_nodes = _in_range_or_default(numa_nodes, 1, max_numa, max_numa)

    # Ask about GPU usage (GPU acceleration speeds up quantization).
    console.print()
    console.print(f"[bold]{t('quant_use_gpu_label')}[/bold]")
    console.print(f" [dim]{t('quant_gpu_speedup')}[/dim]")
    console.print()
    use_gpu = Confirm.ask(t("quant_enable_gpu"), default=True)

    return {"cpu_threads": cpu_threads, "numa_nodes": numa_nodes, "use_gpu": use_gpu}
|
||||
|
||||
|
||||
def configure_output_path(model: Any, method: str, numa_nodes: int) -> Path:
    """Ask where to write the quantized weights.

    Default base directory priority: configured weights dir, then the first
    model storage path, then the source model's parent directory. The user
    may accept the default or type a custom path.
    """
    from kt_kernel.cli.config.settings import get_settings

    console.print()
    console.print(Panel(f"[bold cyan]{t('quant_step4_output')}[/bold cyan]", expand=False))
    console.print()

    src = Path(model.path)
    out_name = f"{src.name}-AMX{method.upper()}-NUMA{numa_nodes}"
    settings = get_settings()

    # Priority: paths.weights > paths.models[0] > model's parent directory
    weights_dir = settings.weights_dir
    if weights_dir and weights_dir.exists():
        base = weights_dir
    else:
        model_paths = settings.get_model_paths()
        if model_paths and model_paths[0].exists():
            base = model_paths[0]
        else:
            base = src.parent
    default_output = base / out_name

    console.print(f"[dim]{t('quant_default_path')}[/dim]", default_output)
    console.print()

    if Confirm.ask(t("quant_use_default"), default=True):
        return default_output

    custom = Prompt.ask(t("quant_custom_path"), default=str(default_output))
    return Path(custom)
|
||||
|
||||
|
||||
def calculate_quantized_size(source_path: Path, input_type: str, quant_method: str) -> tuple[float, float]:
    """
    Compute the source model size and an estimate of the quantized size.

    The estimate scales the on-disk *.safetensors size by the ratio of
    output bits to input bits (e.g. fp16 -> int4 is 4/16 = 0.25).

    Args:
        source_path: Path to source model
        input_type: Input type (fp8, fp16, bf16)
        quant_method: Quantization method (int4, int8)

    Returns:
        Tuple of (source_size_gb, estimated_quant_size_gb); (0.0, 0.0) on
        any filesystem error.
    """
    gib = 1024**3

    # Sum only regular *.safetensors files in the model directory.
    try:
        size_bytes = 0
        for shard in source_path.glob("*.safetensors"):
            if shard.is_file():
                size_bytes += shard.stat().st_size
        source_size_gb = size_bytes / gib
    except Exception:
        return 0.0, 0.0

    # Unknown dtypes fall back to 16-bit input / 4-bit output.
    input_bit = {"fp8": 8, "fp16": 16, "bf16": 16}.get(input_type, 16)
    quant_bit = {"int4": 4, "int8": 8}.get(quant_method, 4)

    estimated_size_gb = source_size_gb * (quant_bit / input_bit)
    return source_size_gb, estimated_size_gb
|
||||
|
||||
|
||||
def check_disk_space(output_path: Path, required_size_gb: float) -> tuple[float, bool]:
    """
    Report free disk space for the filesystem that will hold output_path.

    Args:
        output_path: Target output path
        required_size_gb: Required space in GB

    Returns:
        Tuple of (available_gb, is_sufficient); "sufficient" means free
        space covers the requirement plus a 20% buffer. Returns
        (0.0, False) on any failure.
    """
    import shutil

    try:
        # The output directory usually does not exist yet, so walk up to
        # the nearest existing ancestor before querying disk usage.
        probe = output_path if output_path.exists() else output_path.parent
        while not probe.exists() and probe != probe.parent:
            probe = probe.parent

        available_gb = shutil.disk_usage(probe).free / (1024**3)

        # Require a 20% buffer over the estimated size.
        return available_gb, available_gb >= required_size_gb * 1.2
    except Exception:
        return 0.0, False
|
||||
|
||||
|
||||
def interactive_quant_config() -> Optional[Dict[str, Any]]:
    """
    Run the full multi-step interactive configuration for ``kt quant``.

    Steps: model selection -> optional pre-quantization verification ->
    quantization method/input type -> CPU/NUMA/GPU settings -> output path
    (made unique if it already exists) -> disk-space analysis -> summary
    and final confirmation.

    Returns:
        Configuration dict with keys "model", "method", "input_type",
        "cpu_threads", "numa_nodes", "use_gpu", "output_path", or None if
        the user cancels at any point.
    """
    from kt_kernel.cli.utils.environment import detect_cpu_info

    # Get CPU info (thread and NUMA counts drive the defaults below).
    cpu_info = detect_cpu_info()

    # Step 1: Select model
    model = select_model_to_quantize()
    if not model:
        return None

    # Step 1.5: Pre-quantization verification (optional) — only runs for
    # safetensors-format entries found in the user registry.
    from kt_kernel.cli.utils.user_model_registry import UserModelRegistry
    from kt_kernel.cli.utils.model_verifier import pre_operation_verification

    user_registry = UserModelRegistry()
    user_model_obj = user_registry.find_by_path(model.path)

    if user_model_obj and user_model_obj.format == "safetensors":
        pre_operation_verification(user_model_obj, user_registry, operation_name="quantizing")

    # Step 2: Configure quantization method
    quant_config = configure_quantization_method()

    # Step 3: Configure CPU parameters
    cpu_config = configure_cpu_params(cpu_info.threads, cpu_info.numa_nodes)  # Use logical threads

    # Step 4: Configure output path
    output_path = configure_output_path(model, quant_config["method"], cpu_config["numa_nodes"])

    # Step 4.5: Check if output path already exists and generate unique name
    if output_path.exists():
        console.print()
        console.print(t("quant_output_exists_warn", path=str(output_path)))
        console.print()

        # Generate unique name by appending "-2", "-3", ... until free.
        original_name = output_path.name
        parent_dir = output_path.parent
        counter = 2

        while output_path.exists():
            new_name = f"{original_name}-{counter}"
            output_path = parent_dir / new_name
            counter += 1

        console.print(t("quant_using_unique_name", path=str(output_path)))
        console.print()

    # Step 5: Calculate space requirements and check availability
    console.print()
    console.print(Panel(f"[bold cyan]{t('quant_disk_analysis')}[/bold cyan]", expand=False))
    console.print()

    source_size_gb, estimated_size_gb = calculate_quantized_size(
        Path(model.path), quant_config["input_type"], quant_config["method"]
    )

    available_gb, is_sufficient = check_disk_space(output_path, estimated_size_gb)

    console.print(f" {t('quant_source_size'):<26} [cyan]{source_size_gb:.2f} GB[/cyan]")
    console.print(f" {t('quant_estimated_size'):<26} [yellow]{estimated_size_gb:.2f} GB[/yellow]")
    console.print(
        f" {t('quant_available_space'):<26} [{'green' if is_sufficient else 'red'}]{available_gb:.2f} GB[/{'green' if is_sufficient else 'red'}]"
    )
    console.print()

    # Insufficient space is a warning, not a hard stop: the user may
    # explicitly choose to continue anyway.
    if not is_sufficient:
        required_with_buffer = estimated_size_gb * 1.2
        console.print(f"[bold red]⚠ {t('quant_insufficient_space')}[/bold red]")
        console.print()
        console.print(f" {t('quant_required_space'):<26} [yellow]{required_with_buffer:.2f} GB[/yellow]")
        console.print(f" {t('quant_available_space'):<26} [red]{available_gb:.2f} GB[/red]")
        console.print(f" {t('quant_shortage'):<26} [red]{required_with_buffer - available_gb:.2f} GB[/red]")
        console.print()
        console.print(f" {t('quant_may_fail')}")
        console.print()

        if not Confirm.ask(f"[yellow]{t('quant_continue_anyway')}[/yellow]", default=False):
            console.print(f"[yellow]{t('quant_cancelled')}[/yellow]")
            return None
        console.print()

    # Summary and final confirmation before any work starts.
    console.print()
    console.print(Panel(f"[bold cyan]{t('quant_config_summary')}[/bold cyan]", expand=False))
    console.print()
    console.print(f" {t('quant_summary_model'):<15} {model.name}")
    console.print(f" {t('quant_summary_method'):<15} {quant_config['method'].upper()}")
    console.print(f" {t('quant_summary_input_type'):<15} {quant_config['input_type'].upper()}")
    console.print(f" {t('quant_summary_cpu_threads'):<15} {cpu_config['cpu_threads']}")
    console.print(f" {t('quant_summary_numa'):<15} {cpu_config['numa_nodes']}")
    console.print(f" {t('quant_summary_gpu'):<15} {t('yes') if cpu_config['use_gpu'] else t('no')}")
    console.print(f" {t('quant_summary_output'):<15} {output_path}")
    console.print()

    if not Confirm.ask(f"[bold green]{t('quant_start_question')}[/bold green]", default=True):
        console.print(f"[yellow]{t('quant_cancelled')}[/yellow]")
        return None

    return {
        "model": model,
        "method": quant_config["method"],
        "input_type": quant_config["input_type"],
        "cpu_threads": cpu_config["cpu_threads"],
        "numa_nodes": cpu_config["numa_nodes"],
        "use_gpu": cpu_config["use_gpu"],
        "output_path": output_path,
    }
|
||||
Loading…
Add table
Add a link
Reference in a new issue