kt-cli enhancement (#1834)

* [feat]: redesign kt run interactive configuration with i18n support

- Redesign kt run with 8-step interactive flow (model selection, inference method, NUMA/CPU, GPU experts, KV cache, GPU/TP selection, parsers, host/port)
- Add configuration save/load system (~/.ktransformers/run_configs.yaml)
- Add i18n support for kt chat (en/zh translations)
- Add universal input validators with auto-retry and Chinese comma support
- Add port availability checker with auto-suggestion
- Add parser configuration (--tool-call-parser, --reasoning-parser)
- Remove tuna command and clean up redundant files
- Fix a variable reference bug in run.py; filter the model list to show only MoE models
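The port availability checker with auto-suggestion can be sketched roughly as below. This is a minimal illustration only: the helper name `find_available_port`, the loopback bind probe, and the linear retry window are assumptions, not the actual kt-cli implementation.

```python
import socket


def find_available_port(preferred: int, max_tries: int = 20) -> int:
    """Return `preferred` if it is free, else suggest the next free port.

    Hypothetical helper: probes availability by attempting a loopback bind.
    """
    for port in range(preferred, preferred + max_tries):
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            try:
                s.bind(("127.0.0.1", port))
                return port  # bind succeeded, so the port is free
            except OSError:
                continue  # port in use, try the next one
    raise RuntimeError(f"No free port in {preferred}-{preferred + max_tries - 1}")
```

A caller would then confirm the suggested port with the user before starting the server.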

* [feat]: unify model selection UI and enable shared experts fusion by default

- Unify kt run model selection table with kt model list display
  * Add Total size, MoE Size, Repo, and SHA256 status columns
  * Use consistent formatting and styling
  * Improve user decision-making with more information

- Enable --disable-shared-experts-fusion by default
  * Change default value from False to True
  * Users can still override with --enable-shared-experts-fusion

* [feat]: improve kt chat with performance metrics and better CJK support

- Add performance metrics display after each response
  * Total time, TTFT (Time To First Token), TPOT (Time Per Output Token)
  * Accurate input/output token counts using model tokenizer
  * Fallback to estimation if tokenizer unavailable
  * Metrics shown in dim style (not prominent)

- Fix Chinese character input issues
  * Replace Prompt.ask() with console.input() for better CJK support
  * Fixes backspace deletion showing half-characters

- Suppress NumPy subnormal warnings
  * Filter "The value of the smallest subnormal" warnings
  * Cleaner CLI output on certain hardware environments
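Suppressing that warning amounts to a single `warnings` filter matched against the message prefix quoted above; a sketch (the wrapper function is hypothetical, not the actual kt chat code):

```python
import warnings


def suppress_subnormal_warnings() -> None:
    """Hide NumPy's 'smallest subnormal' UserWarning (hypothetical wrapper).

    The message argument is a regex matched against the start of the warning
    text, so the prefix alone is enough to catch the per-dtype variants.
    """
    warnings.filterwarnings(
        "ignore",
        message="The value of the smallest subnormal",
    )
```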

* [fix]: correct TTFT measurement in kt chat

- Move start_time initialization before API call
- Previously start_time was set when receiving first chunk, causing TTFT ≈ 0ms
- Now correctly measures time from request sent to first token received
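The corrected ordering can be sketched as follows. This is a simplified stand-in for the streaming loop in kt chat, assuming a chunk iterator in place of the real API response; names and structure are illustrative:

```python
import time


def stream_with_metrics(chunks):
    """Consume a streaming response and compute TTFT/TPOT (illustrative).

    start_time is taken BEFORE iteration begins (i.e. when the request is
    sent), not when the first chunk arrives -- otherwise TTFT collapses to ~0.
    """
    start_time = time.perf_counter()  # set before the API call / first chunk
    first_token_time = None
    tokens = []
    for chunk in chunks:  # stand-in for the streaming API response
        if first_token_time is None:
            first_token_time = time.perf_counter()
        tokens.append(chunk)
    end_time = time.perf_counter()

    ttft = first_token_time - start_time  # time to first token
    n_out = len(tokens)
    tpot = (end_time - first_token_time) / max(n_out - 1, 1)  # time per output token
    return tokens, ttft, tpot
```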

* [docs]: add Clawdbot integration guide - enterprise-grade AI assistant deployment with KTransformers

* [docs]: recommend Kimi K2.5 as the core model, highlighting enterprise-grade inference capability

* [docs]: add link to the Clawdbot Feishu (Lark) integration tutorial

* [feat]: improve CLI table display, model verification, and chat experience

- Add sequence number (#) column to all model tables by default
- Filter kt edit to show only MoE GPU models (exclude AMX)
- Extend kt model verify to check *.json and *.py files in addition to weights
- Fix re-verification bug where repaired files caused false failures
- Suppress tokenizer debug output in kt chat token counting

* [fix]: correct CPU core detection.

---------

Co-authored-by: skqliao <skqliao@gmail.com>
Oql 2026-02-04 16:44:54 +08:00 committed by GitHub
parent 4f64665758
commit 56cbd69ac4
23 changed files with 10327 additions and 781 deletions


@@ -35,12 +35,12 @@ class QuantMethod(str, Enum):
 def quant(
-    model: str = typer.Argument(
-        ...,
+    model: Optional[str] = typer.Argument(
+        None,
         help="Model name or path to quantize",
     ),
-    method: QuantMethod = typer.Option(
-        QuantMethod.INT4,
+    method: Optional[QuantMethod] = typer.Option(
+        None,
         "--method",
         "-m",
         help="Quantization method",
@@ -51,8 +51,8 @@ def quant(
         "-o",
         help="Output path for quantized weights",
     ),
-    input_type: str = typer.Option(
-        "fp8",
+    input_type: Optional[str] = typer.Option(
+        None,
         "--input-type",
         "-i",
         help="Input weight type (fp8, fp16, bf16)",
@@ -72,6 +72,11 @@ def quant(
         "--no-merge",
         help="Don't merge safetensor files",
     ),
+    gpu: bool = typer.Option(
+        False,
+        "--gpu",
+        help="Use GPU for conversion (faster)",
+    ),
     yes: bool = typer.Option(
         False,
         "--yes",
@@ -79,54 +84,231 @@ def quant(
         help="Skip confirmation prompts",
     ),
 ) -> None:
-    """Quantize model weights for CPU inference."""
-    settings = get_settings()
-    console.print()
-
-    # Resolve input path
-    input_path = _resolve_input_path(model, settings)
-    if input_path is None:
-        print_error(t("quant_input_not_found", path=model))
-        raise typer.Exit(1)
-
-    print_info(t("quant_input_path", path=str(input_path)))
-
-    # Resolve output path
-    if output is None:
-        output = input_path.parent / f"{input_path.name}-{method.value.upper()}"
-    print_info(t("quant_output_path", path=str(output)))
-    print_info(t("quant_method", method=method.value.upper()))
-
-    # Detect CPU configuration
-    cpu = detect_cpu_info()
-    final_cpu_threads = cpu_threads or cpu.cores
-    final_numa_nodes = numa_nodes or cpu.numa_nodes
-    print_info(f"CPU threads: {final_cpu_threads}")
-    print_info(f"NUMA nodes: {final_numa_nodes}")
-
-    # Check if output exists
-    if output.exists():
-        print_warning(f"Output path already exists: {output}")
-        console.print()
-        if not yes:
-            if not confirm("Overwrite?", default=False):
-                raise typer.Exit(1)
-
-    # Confirm
-    if not yes:
-        console.print()
-        console.print("[bold]Quantization Settings:[/bold]")
-        console.print(f"  Input: {input_path}")
-        console.print(f"  Output: {output}")
-        console.print(f"  Method: {method.value.upper()}")
-        console.print(f"  Input type: {input_type}")
-        console.print()
-        print_warning("Quantization may take 30-60 minutes depending on model size.")
-        console.print()
-        if not confirm(t("prompt_continue")):
-            raise typer.Abort()
+    """Quantize model weights for CPU inference.
+
+    If no model is specified, interactive mode will be activated.
+    """
+    settings = get_settings()
+
+    # Check if we should use interactive mode
+    # Interactive mode triggers when: no model, or missing critical parameters
+    needs_interactive = model is None or method is None or cpu_threads is None or numa_nodes is None
+    is_interactive = False
+
+    if needs_interactive and sys.stdin.isatty():
+        # Use interactive configuration (includes verification in Step 1.5)
+        from kt_kernel.cli.utils.quant_interactive import interactive_quant_config
+
+        console.print()
+        console.print(f"[bold cyan]═══ {t('quant_interactive_title')} ═══[/bold cyan]")
+        console.print()
+        console.print(f"[yellow]{t('quant_new_model_notice')}[/yellow]")
+        console.print()
+
+        config = interactive_quant_config()
+        if config is None:
+            # User cancelled
+            raise typer.Exit(0)
+
+        # Extract configuration
+        model_obj = config["model"]
+        model = model_obj.id
+        input_path = Path(model_obj.path)
+        method = QuantMethod(config["method"])
+        input_type = config["input_type"]
+        cpu_threads = config["cpu_threads"]
+        numa_nodes = config["numa_nodes"]
+        output = config["output_path"]
+        gpu = config["use_gpu"]
+        is_interactive = True
+
+        console.print()
+        print_success(t("quant_config_complete"))
+        console.print()
+    else:
+        # Non-interactive mode - require model parameter
+        if model is None:
+            print_error("Model argument is required in non-interactive mode")
+            console.print()
+            console.print("Usage: kt quant <model>")
+            console.print("   Or: kt quant  (for interactive mode)")
+            raise typer.Exit(1)
+
+        # Set defaults for optional parameters
+        method = method or QuantMethod.INT4
+        input_type = input_type or "fp8"
+
+        console.print()
+
+        # Resolve input path
+        input_path = _resolve_input_path(model, settings)
+        if input_path is None:
+            print_error(t("quant_input_not_found", path=model))
+            raise typer.Exit(1)
+
+        # Pre-quantization verification (only in non-interactive mode)
+        # Interactive mode already did verification in interactive_quant_config()
+        from kt_kernel.cli.utils.user_model_registry import UserModelRegistry
+        from kt_kernel.cli.utils.model_verifier import pre_operation_verification
+
+        user_registry = UserModelRegistry()
+        user_model_obj = user_registry.find_by_path(str(input_path))
+        if user_model_obj and user_model_obj.format == "safetensors":
+            pre_operation_verification(user_model_obj, user_registry, operation_name="quantizing")
+
+    # Get user model info for both modes (needed later for registering quantized model)
+    from kt_kernel.cli.utils.user_model_registry import UserModelRegistry
+
+    user_registry = UserModelRegistry()
+    user_model_obj = user_registry.find_by_path(str(input_path))
+
+    # Validate that it's a MoE model (not AMX or GGUF)
+    from kt_kernel.cli.commands.model import is_amx_weights
+
+    # Check if it's AMX (already quantized)
+    is_amx, _ = is_amx_weights(str(input_path))
+    if is_amx:
+        print_error("Cannot quantize AMX models (already quantized)")
+        console.print()
+        console.print(f"  The model at {input_path} is already in AMX format.")
+        raise typer.Exit(1)
+
+    # Check if it's a MoE model
+    from kt_kernel.cli.utils.analyze_moe_model import analyze_moe_model
+
+    moe_result = None  # Store for later use when registering quantized model
+    try:
+        moe_result = analyze_moe_model(str(input_path), use_cache=True)
+        if not moe_result or not moe_result.get("is_moe"):
+            print_error("Only MoE models can be quantized to AMX format")
+            console.print()
+            console.print(f"  The model at {input_path} is not a MoE model.")
+            console.print("  AMX quantization is designed for MoE models (e.g., DeepSeek-V3).")
+            raise typer.Exit(1)
+    except Exception as e:
+        print_warning(f"Could not detect MoE information: {e}")
+        console.print()
+        if not yes:
+            if not confirm("Continue quantization anyway?", default=False):
+                raise typer.Exit(1)
+
+    # Detect CPU configuration and resolve output path (only needed in non-interactive mode)
+    if not is_interactive:
+        print_info(t("quant_input_path", path=str(input_path)))
+
+        # Detect CPU configuration (needed for output path)
+        cpu = detect_cpu_info()
+        final_cpu_threads = cpu_threads or cpu.cores
+        final_numa_nodes = numa_nodes or cpu.numa_nodes
+
+        # Resolve output path
+        if output is None:
+            # Priority: paths.weights > paths.models[0] > model's parent directory
+            weights_dir = settings.weights_dir
+            if weights_dir and weights_dir.exists():
+                # Use configured weights directory (highest priority)
+                output = weights_dir / f"{input_path.name}-AMX{method.value.upper()}-NUMA{final_numa_nodes}"
+            else:
+                # Use first model storage path
+                model_paths = settings.get_model_paths()
+                if model_paths and model_paths[0].exists():
+                    output = model_paths[0] / f"{input_path.name}-AMX{method.value.upper()}-NUMA{final_numa_nodes}"
+                else:
+                    # Fallback to model's parent directory
+                    output = input_path.parent / f"{input_path.name}-AMX{method.value.upper()}-NUMA{final_numa_nodes}"
+
+        print_info(t("quant_output_path", path=str(output)))
+        print_info(t("quant_method", method=method.value.upper()))
+        print_info(t("quant_cpu_threads", threads=final_cpu_threads))
+        print_info(t("quant_numa_nodes", nodes=final_numa_nodes))
+
+        # Calculate space requirements
+        console.print()
+        console.print(f"[bold cyan]{t('quant_disk_analysis')}[/bold cyan]")
+        console.print()
+
+        # Calculate source model size
+        try:
+            total_bytes = sum(f.stat().st_size for f in input_path.glob("*.safetensors") if f.is_file())
+            source_size_gb = total_bytes / (1024**3)
+        except Exception:
+            source_size_gb = 0.0
+
+        # Estimate quantized size
+        input_bits = {"fp8": 8, "fp16": 16, "bf16": 16}
+        quant_bits = {"int4": 4, "int8": 8}
+        input_bit = input_bits.get(input_type, 16)
+        quant_bit = quant_bits.get(method.value, 4)
+        ratio = quant_bit / input_bit
+        estimated_size_gb = source_size_gb * ratio
+
+        # Check available space
+        import shutil
+
+        try:
+            check_path = output.parent if not output.exists() else output
+            while not check_path.exists() and check_path != check_path.parent:
+                check_path = check_path.parent
+            stat = shutil.disk_usage(check_path)
+            available_gb = stat.free / (1024**3)
+        except Exception:
+            available_gb = 0.0
+
+        is_sufficient = available_gb >= (estimated_size_gb * 1.2)
+
+        console.print(f"  {t('quant_source_size'):<26} {source_size_gb:.2f} GB")
+        console.print(f"  {t('quant_estimated_size'):<26} {estimated_size_gb:.2f} GB")
+        console.print(f"  {t('quant_available_space'):<26} {available_gb:.2f} GB")
+        console.print()
+
+        if not is_sufficient:
+            required_with_buffer = estimated_size_gb * 1.2
+            print_warning(t("quant_insufficient_space"))
+            console.print()
+            console.print(f"  {t('quant_required_space'):<26} {required_with_buffer:.2f} GB")
+            console.print(f"  {t('quant_available_space'):<26} {available_gb:.2f} GB")
+            console.print(f"  {t('quant_shortage'):<26} {required_with_buffer - available_gb:.2f} GB")
+            console.print()
+            console.print(f"  {t('quant_may_fail')}")
+            console.print()
+            if not yes:
+                if not confirm(t("quant_continue_anyway"), default=False):
+                    raise typer.Abort()
+            console.print()
+
+        # Check if output exists and generate unique name
+        if output.exists():
+            print_warning(t("quant_output_exists", path=str(output)))
+            console.print()
+            # Generate unique name by adding suffix
+            original_name = output.name
+            parent_dir = output.parent
+            counter = 2
+            while output.exists():
+                new_name = f"{original_name}-{counter}"
+                output = parent_dir / new_name
+                counter += 1
+            print_success(t("quant_using_unique", path=str(output)))
+            console.print()
+
+        # Confirm (only show if not using --yes flag)
+        if not yes:
+            console.print()
+            print_warning(t("quant_time_warning"))
+            console.print()
+            if not confirm(t("prompt_continue")):
+                raise typer.Abort()
+    else:
+        # Interactive mode: cpu_threads and numa_nodes already set
+        final_cpu_threads = cpu_threads
+        final_numa_nodes = numa_nodes

     # Find conversion script
     kt_kernel_path = _find_kt_kernel_path()
@@ -141,37 +323,145 @@ def quant(
     # Build command
     cmd = [
-        sys.executable, str(script_path),
-        "--input-path", str(input_path),
-        "--input-type", input_type,
-        "--output", str(output),
-        "--quant-method", method.value,
-        "--cpuinfer-threads", str(final_cpu_threads),
-        "--threadpool-count", str(final_numa_nodes),
+        sys.executable,
+        str(script_path),
+        "--input-path",
+        str(input_path),
+        "--input-type",
+        input_type,
+        "--output",
+        str(output),
+        "--quant-method",
+        method.value,
+        "--cpuinfer-threads",
+        str(final_cpu_threads),
+        "--threadpool-count",
+        str(final_numa_nodes),
     ]
     if no_merge:
         cmd.append("--no-merge-safetensor")
+    if gpu:
+        cmd.append("--gpu")

     # Run quantization
     console.print()
     print_step(t("quant_starting"))
     console.print()
+    console.print(f"[dim]$ {' '.join(cmd)}[/dim]")
+    console.print()
+    console.print("[dim]" + "=" * 80 + "[/dim]")
+    console.print()

     try:
-        process = subprocess.run(cmd)
+        # Run with real-time stdout/stderr output
+        import os
+        import time
+
+        env = os.environ.copy()
+        env["PYTHONUNBUFFERED"] = "1"  # Disable Python output buffering
+
+        # Record start time
+        start_time = time.time()
+
+        process = subprocess.run(
+            cmd,
+            stdout=None,  # Inherit parent's stdout (real-time output)
+            stderr=None,  # Inherit parent's stderr (real-time output)
+            env=env,
+        )
+
+        # Calculate elapsed time
+        elapsed_time = time.time() - start_time
+        hours = int(elapsed_time // 3600)
+        minutes = int((elapsed_time % 3600) // 60)
+        seconds = int(elapsed_time % 60)
+
+        console.print()
+        console.print("[dim]" + "=" * 80 + "[/dim]")
+        console.print()

         if process.returncode == 0:
             console.print()
             print_success(t("quant_complete"))
             console.print()
+            # Display elapsed time
+            if hours > 0:
+                time_str = f"{hours}h {minutes}m {seconds}s"
+            elif minutes > 0:
+                time_str = f"{minutes}m {seconds}s"
+            else:
+                time_str = f"{seconds}s"
+            console.print(f"  [cyan]{t('quant_time_elapsed')} {time_str}[/cyan]")
+            console.print()
             console.print(f"  Quantized weights saved to: {output}")
             console.print()
             console.print("  Use with:")
             console.print(f"    kt run {model} --weights-path {output}")
             console.print()
+
+            # Auto-register the quantized model
+            try:
+                from kt_kernel.cli.utils.user_model_registry import UserModel
+
+                # Generate model name from output path
+                base_name = output.name
+                suggested_name = user_registry.suggest_name(base_name)
+
+                # Determine MoE information and source model name
+                if user_model_obj:
+                    is_moe_val = user_model_obj.is_moe
+                    num_experts = user_model_obj.moe_num_experts
+                    num_active = user_model_obj.moe_num_experts_per_tok
+                    repo_type_val = user_model_obj.repo_type
+                    repo_id_val = user_model_obj.repo_id
+                    source_model_name = user_model_obj.name  # Store source model name
+                elif moe_result:
+                    is_moe_val = moe_result.get("is_moe", True)
+                    num_experts = moe_result.get("num_experts")
+                    num_active = moe_result.get("num_experts_per_tok")
+                    repo_type_val = None
+                    repo_id_val = None
+                    source_model_name = input_path.name  # Use folder name as fallback
+                else:
+                    is_moe_val = None
+                    num_experts = None
+                    num_active = None
+                    repo_type_val = None
+                    repo_id_val = None
+                    source_model_name = input_path.name  # Use folder name as fallback
+
+                # Create new model entry (AMX format uses "safetensors" format, detected by is_amx_weights())
+                new_model = UserModel(
+                    name=suggested_name,
+                    path=str(output),
+                    format="safetensors",  # AMX files are safetensors format
+                    repo_type=repo_type_val,
+                    repo_id=repo_id_val,
+                    sha256_status="not_checked",  # AMX weights don't need verification
+                    # Inherit MoE information from source model
+                    is_moe=is_moe_val,
+                    moe_num_experts=num_experts,
+                    moe_num_experts_per_tok=num_active,
+                    # AMX quantization metadata
+                    amx_source_model=source_model_name,
+                    amx_quant_method=method.value,  # "int4" or "int8"
+                    amx_numa_nodes=final_numa_nodes,
+                )
+                user_registry.add_model(new_model)
+
+                console.print()
+                print_success(t("quant_registered", name=suggested_name))
+                console.print()
+                console.print(f"  {t('quant_view_with')} [cyan]kt model list[/cyan]")
+                console.print(f"  {t('quant_use_with')} [cyan]kt run {suggested_name}[/cyan]")
+                console.print()
+            except Exception as e:
+                # Non-fatal error - quantization succeeded but registration failed
+                console.print()
+                print_warning(t("quant_register_failed", error=str(e)))
+                console.print()
+                console.print(f"  {t('quant_use_with')}")
+                console.print(f"    kt run {model} --weights-path {output}")
+                console.print()
         else:
             print_error(f"Quantization failed with exit code {process.returncode}")
             raise typer.Exit(process.returncode)
@@ -221,6 +511,7 @@ def _find_kt_kernel_path() -> Optional[Path]:
     """Find the kt-kernel installation path."""
     try:
         import kt_kernel
+
         return Path(kt_kernel.__file__).parent.parent
     except ImportError:
         pass