""" Run command for kt-cli. Starts the model inference server using SGLang + kt-kernel. """ import os import subprocess import sys from pathlib import Path from typing import Optional import click import typer from kt_kernel.cli.config.settings import get_settings from kt_kernel.cli.i18n import t from kt_kernel.cli.utils.console import ( confirm, console, print_api_info, print_error, print_info, print_server_info, print_step, print_success, print_warning, prompt_choice, ) from kt_kernel.cli.utils.environment import detect_cpu_info, detect_gpus, detect_ram_gb from kt_kernel.cli.utils.user_model_registry import UserModelRegistry @click.command( context_settings={"ignore_unknown_options": True, "allow_extra_args": True}, add_help_option=False, # We'll handle help manually to avoid conflicts ) @click.argument("model", required=False, default=None) @click.option("--host", "-H", default=None, help="Server host address") @click.option("--port", "-p", type=int, default=None, help="Server port") @click.option("--gpu-experts", type=int, default=None, help="Number of GPU experts per layer") @click.option("--cpu-threads", type=int, default=None, help="Number of CPU inference threads") @click.option("--numa-nodes", type=int, default=None, help="Number of NUMA nodes") @click.option( "--tensor-parallel-size", "--tp", "tensor_parallel_size", type=int, default=None, help="Tensor parallel size" ) @click.option("--model-path", type=click.Path(), default=None, help="Custom model path") @click.option("--weights-path", type=click.Path(), default=None, help="Custom quantized weights path") @click.option("--kt-method", default=None, help="KT quantization method") @click.option( "--kt-gpu-prefill-threshold", "kt_gpu_prefill_threshold", type=int, default=None, help="GPU prefill token threshold" ) @click.option("--attention-backend", default=None, help="Attention backend") @click.option("--max-total-tokens", "max_total_tokens", type=int, default=None, help="Maximum total tokens") @click.option("--max-running-requests", "max_running_requests", type=int, default=None, help="Maximum running requests") @click.option("--chunked-prefill-size", "chunked_prefill_size", type=int, default=None, help="Chunked prefill size") @click.option("--mem-fraction-static", "mem_fraction_static", type=float, default=None, help="Memory fraction static") @click.option("--watchdog-timeout", "watchdog_timeout", type=int, default=None, help="Watchdog timeout") @click.option("--served-model-name", "served_model_name", default=None, help="Served model name") @click.option( "--disable-shared-experts-fusion", "disable_shared_experts_fusion", is_flag=True, default=None, help="Disable shared experts fusion", ) @click.option( "--enable-shared-experts-fusion", "enable_shared_experts_fusion", is_flag=True, default=False, help="Enable shared experts fusion", ) @click.option("--quantize", "-q", is_flag=True, default=False, help="Quantize model") @click.option("--advanced", is_flag=True, default=False, help="Show advanced options") @click.option("--dry-run", "dry_run", is_flag=True, default=False, help="Show command without executing") @click.pass_context def run( ctx: click.Context, model: Optional[str], host: Optional[str], port: Optional[int], gpu_experts: Optional[int], cpu_threads: Optional[int], numa_nodes: Optional[int], tensor_parallel_size: Optional[int], model_path: Optional[str], weights_path: Optional[str], kt_method: Optional[str], kt_gpu_prefill_threshold: Optional[int], attention_backend: Optional[str], max_total_tokens: Optional[int], max_running_requests: Optional[int], chunked_prefill_size: Optional[int], mem_fraction_static: Optional[float], watchdog_timeout: Optional[int], served_model_name: Optional[str], disable_shared_experts_fusion: Optional[bool], enable_shared_experts_fusion: bool, quantize: bool, advanced: bool, dry_run: bool, ) -> None: """Start model inference server. \b Examples: kt run deepseek-v3 | kt run m2 --tensor-parallel-size 2 | kt run /path/to/model --gpu-experts 4 \b Custom Options: Pass any SGLang server option directly (e.g., kt run m2 --fp8-gemm-backend triton). Common: --fp8-gemm-backend, --tool-call-parser, --reasoning-parser, --dp-size, --enable-ma For full list: python -m sglang.launch_server --help """ # Handle --help manually since we disabled it # Check sys.argv for --help or -h since ctx.args may not be set yet if "--help" in sys.argv or "-h" in sys.argv: click.echo(ctx.get_help()) return # Handle disable/enable shared experts fusion flags if enable_shared_experts_fusion: disable_shared_experts_fusion = False # Convert Path objects from click model_path_obj = Path(model_path) if model_path else None weights_path_obj = Path(weights_path) if weights_path else None # Get extra args that weren't parsed (unknown options) # click stores these in ctx.args when ignore_unknown_options=True extra_cli_args = list(ctx.args) if ctx.args else [] # Remove --help from extra args if present (already handled) extra_cli_args = [arg for arg in extra_cli_args if arg not in ["--help", "-h"]] # Call the actual run function implementation _run_impl( model=model, host=host, port=port, gpu_experts=gpu_experts, cpu_threads=cpu_threads, numa_nodes=numa_nodes, tensor_parallel_size=tensor_parallel_size, model_path=model_path_obj, weights_path=weights_path_obj, kt_method=kt_method, kt_gpu_prefill_threshold=kt_gpu_prefill_threshold, attention_backend=attention_backend, max_total_tokens=max_total_tokens, max_running_requests=max_running_requests, chunked_prefill_size=chunked_prefill_size, mem_fraction_static=mem_fraction_static, watchdog_timeout=watchdog_timeout, served_model_name=served_model_name, disable_shared_experts_fusion=disable_shared_experts_fusion, quantize=quantize, advanced=advanced, dry_run=dry_run, extra_cli_args=extra_cli_args, ) def _run_impl( model: Optional[str], host: Optional[str], port: Optional[int], gpu_experts: Optional[int], cpu_threads: Optional[int], numa_nodes: Optional[int], tensor_parallel_size: Optional[int], model_path: Optional[Path], weights_path: Optional[Path], kt_method: Optional[str], kt_gpu_prefill_threshold: Optional[int], attention_backend: Optional[str], max_total_tokens: Optional[int], max_running_requests: Optional[int], chunked_prefill_size: Optional[int], mem_fraction_static: Optional[float], watchdog_timeout: Optional[int], served_model_name: Optional[str], disable_shared_experts_fusion: Optional[bool], quantize: bool, advanced: bool, dry_run: bool, extra_cli_args: list[str], ) -> None: """Actual implementation of run command.""" # Check if SGLang is installed before proceeding from kt_kernel.cli.utils.sglang_checker import ( check_sglang_installation, check_sglang_kt_kernel_support, print_sglang_install_instructions, print_sglang_kt_kernel_instructions, ) sglang_info = check_sglang_installation() if not sglang_info["installed"]: console.print() print_error(t("sglang_not_found")) console.print() print_sglang_install_instructions() raise typer.Exit(1) # Check if SGLang supports kt-kernel (has --kt-gpu-prefill-token-threshold parameter) kt_kernel_support = check_sglang_kt_kernel_support() if not kt_kernel_support["supported"]: console.print() print_error(t("sglang_kt_kernel_not_supported")) console.print() print_sglang_kt_kernel_instructions() raise typer.Exit(1) settings = get_settings() user_registry = UserModelRegistry() # Check if we should use interactive mode # Interactive mode triggers when: # 1. No model specified, OR # 2. Model specified but missing critical parameters (gpu_experts, tensor_parallel_size, etc.) use_interactive = False if model is None: use_interactive = True elif ( gpu_experts is None or tensor_parallel_size is None or cpu_threads is None or numa_nodes is None or max_total_tokens is None ): # Model specified but some parameters missing - use interactive use_interactive = True if use_interactive and sys.stdin.isatty(): # Use new interactive configuration flow from kt_kernel.cli.utils.run_interactive import interactive_run_config console.print() console.print("[bold cyan]═══ Interactive Run Configuration ═══[/bold cyan]") console.print() config = interactive_run_config() if config is None: # User cancelled raise typer.Exit(0) # Extract configuration from new format user_model_obj = config["model"] model = user_model_obj.id resolved_model_path = Path(config["model_path"]) resolved_weights_path = Path(config["weights_path"]) # Extract parameters gpu_experts = config["gpu_experts"] cpu_threads = config["cpu_threads"] numa_nodes = config["numa_nodes"] tensor_parallel_size = config["tp_size"] # Get kt-method and other method-specific settings kt_method = config["kt_method"] # KV cache settings (may be None for non-raw methods) max_total_tokens = config.get("kv_cache", 32768) chunked_prefill_size = config.get("chunk_prefill", 32768) kt_gpu_prefill_threshold = config.get("gpu_prefill_threshold", 500) # Memory settings mem_fraction_static = config["mem_fraction_static"] # Parser settings (optional) tool_call_parser = config.get("tool_call_parser") reasoning_parser = config.get("reasoning_parser") # Server settings host = config.get("host", "0.0.0.0") port = config.get("port", 30000) # Set CUDA_VISIBLE_DEVICES for selected GPUs selected_gpus = config["selected_gpus"] os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(gpu_id) for gpu_id in selected_gpus) # Detect hardware for parameter resolution (needed for resolve() function later) gpus = detect_gpus() cpu = detect_cpu_info() console.print() print_info(f"[green]✓[/green] Configuration complete") console.print() else: # Non-interactive mode - use traditional flow console.print() # Initialize variables that may have been set by interactive mode # These will be None in non-interactive mode and will use defaults via resolve() # If no model specified, show old interactive selection if model is None: model = _interactive_model_selection(user_registry, settings) if model is None: raise typer.Exit(0) # Detect hardware (needed for defaults) gpus = detect_gpus() cpu = detect_cpu_info() ram = detect_ram_gb() if gpus: gpu_info = f"{gpus[0].name} ({gpus[0].vram_gb}GB VRAM)" if len(gpus) > 1: gpu_info += f" + {len(gpus) - 1} more" print_info(t("run_gpu_info", name=gpus[0].name, vram=gpus[0].vram_gb)) else: print_warning(t("doctor_gpu_not_found")) gpu_info = "None" print_info(t("run_cpu_info", name=cpu.name, cores=cpu.cores, numa=cpu.numa_nodes)) print_info(t("run_ram_info", total=int(ram))) # Step 2: Resolve model console.print() print_step(t("run_checking_model")) user_model_obj = None resolved_model_path = model_path # Check if model is a path if Path(model).exists(): resolved_model_path = Path(model) print_info(t("run_model_path", path=str(resolved_model_path))) # Try to find in user registry by path user_model_obj = user_registry.find_by_path(str(resolved_model_path)) if user_model_obj: print_info(f"Using registered model: {user_model_obj.name}") else: print_warning("Using unregistered model path. Consider adding it with 'kt model add'") else: # Search in user registry by name user_model_obj = user_registry.get_model(model) if not user_model_obj: print_error(t("run_model_not_found", name=model)) console.print() # Show available models all_models = user_registry.list_models() if all_models: console.print("Available registered models:") for m in all_models[:5]: console.print(f" - {m.name}") if len(all_models) > 5: console.print(f" ... and {len(all_models) - 5} more") else: console.print("No models registered yet.") console.print() console.print(f"Add your model with: [cyan]kt model add /path/to/model[/cyan]") console.print(f"Or scan for models: [cyan]kt model scan[/cyan]") raise typer.Exit(1) # Use model path from registry resolved_model_path = Path(user_model_obj.path) # Verify path exists if not resolved_model_path.exists(): print_error(f"Model path does not exist: {resolved_model_path}") console.print() console.print(f"Run 'kt model refresh' to check all models") raise typer.Exit(1) print_info(t("run_model_path", path=str(resolved_model_path))) # Step 2.5: Pre-run verification (optional integrity check) if user_model_obj and user_model_obj.format == "safetensors": from kt_kernel.cli.utils.model_verifier import pre_operation_verification pre_operation_verification(user_model_obj, user_registry, operation_name="running") # Step 3: Check quantized weights (only if explicitly requested) resolved_weights_path = None # Only use quantized weights if explicitly specified by user if weights_path is not None: # User explicitly specified weights path resolved_weights_path = weights_path if not resolved_weights_path.exists(): print_error(t("run_weights_not_found")) console.print(f" Path: {resolved_weights_path}") raise typer.Exit(1) print_info(f"Using quantized weights: {resolved_weights_path}") elif quantize: # User requested quantization console.print() print_step(t("run_quantizing")) # TODO: Implement quantization print_warning("Quantization not yet implemented. Please run 'kt quant' manually.") raise typer.Exit(1) else: # Default: use original precision model without quantization console.print() print_info("Using original precision model (no quantization)") # Step 4: Build command # Helper to resolve parameter with fallback chain: CLI > config > default def resolve(cli_val, config_key, default): if cli_val is not None: return cli_val config_val = settings.get(config_key) return config_val if config_val is not None else default # Server configuration final_host = resolve(host, "server.host", "0.0.0.0") final_port = resolve(port, "server.port", 30000) # Tensor parallel size: CLI > config > auto-detect from GPUs final_tensor_parallel_size = resolve( tensor_parallel_size, "inference.tensor_parallel_size", len(gpus) if gpus else 1 ) # CPU/GPU configuration with smart defaults total_threads = cpu.threads # Use logical threads instead of physical cores final_cpu_threads = resolve(cpu_threads, "inference.cpu_threads", int(total_threads * 0.8)) final_numa_nodes = resolve(numa_nodes, "inference.numa_nodes", cpu.numa_nodes) final_gpu_experts = resolve(gpu_experts, "inference.gpu_experts", 1) # KT-kernel options final_kt_method = resolve(kt_method, "inference.kt_method", "AMXINT4") final_kt_gpu_prefill_threshold = resolve(kt_gpu_prefill_threshold, "inference.kt_gpu_prefill_token_threshold", 4096) # SGLang options final_attention_backend = resolve(attention_backend, "inference.attention_backend", "flashinfer") final_max_total_tokens = resolve(max_total_tokens, "inference.max_total_tokens", 40000) final_max_running_requests = resolve(max_running_requests, "inference.max_running_requests", 32) final_chunked_prefill_size = resolve(chunked_prefill_size, "inference.chunked_prefill_size", 4096) final_mem_fraction_static = resolve(mem_fraction_static, "inference.mem_fraction_static", 0.98) final_watchdog_timeout = resolve(watchdog_timeout, "inference.watchdog_timeout", 3000) final_served_model_name = resolve(served_model_name, "inference.served_model_name", "") # Performance flags final_disable_shared_experts_fusion = resolve( disable_shared_experts_fusion, "inference.disable_shared_experts_fusion", True ) # Pass extra CLI parameters extra_params = {} # Parser parameters (from interactive mode or None in non-interactive mode) final_tool_call_parser = None final_reasoning_parser = None if "tool_call_parser" in locals() and tool_call_parser: final_tool_call_parser = tool_call_parser if "reasoning_parser" in locals() and reasoning_parser: final_reasoning_parser = reasoning_parser cmd = _build_sglang_command( model_path=resolved_model_path, weights_path=resolved_weights_path, host=final_host, port=final_port, gpu_experts=final_gpu_experts, cpu_threads=final_cpu_threads, numa_nodes=final_numa_nodes, tensor_parallel_size=final_tensor_parallel_size, kt_method=final_kt_method, kt_gpu_prefill_threshold=final_kt_gpu_prefill_threshold, attention_backend=final_attention_backend, max_total_tokens=final_max_total_tokens, max_running_requests=final_max_running_requests, chunked_prefill_size=final_chunked_prefill_size, mem_fraction_static=final_mem_fraction_static, watchdog_timeout=final_watchdog_timeout, served_model_name=final_served_model_name, disable_shared_experts_fusion=final_disable_shared_experts_fusion, tool_call_parser=final_tool_call_parser, reasoning_parser=final_reasoning_parser, settings=settings, extra_model_params=extra_params, extra_cli_args=extra_cli_args, ) # Prepare environment variables env = os.environ.copy() # Add environment variables from advanced.env env.update(settings.get_env_vars()) # Add environment variables from inference.env inference_env = settings.get("inference.env", {}) if isinstance(inference_env, dict): env.update({k: str(v) for k, v in inference_env.items()}) # Step 5: Show configuration summary console.print() print_step("Configuration") # Display model name model_display_name = user_model_obj.name if user_model_obj else resolved_model_path.name console.print(f" Model: [bold]{model_display_name}[/bold]") console.print(f" Path: [dim]{resolved_model_path}[/dim]") # Key parameters console.print() console.print(f" GPU Experts: [cyan]{final_gpu_experts}[/cyan] per layer") console.print(f" CPU Threads (kt-cpuinfer): [cyan]{final_cpu_threads}[/cyan]") console.print(f" NUMA Nodes (kt-threadpool-count): [cyan]{final_numa_nodes}[/cyan]") console.print(f" Tensor Parallel: [cyan]{final_tensor_parallel_size}[/cyan]") console.print(f" Method: [cyan]{final_kt_method}[/cyan]") console.print(f" Attention: [cyan]{final_attention_backend}[/cyan]") # Weights info if resolved_weights_path: console.print() console.print(f" Quantized weights: [yellow]{resolved_weights_path}[/yellow]") console.print() console.print(f" Server: [green]http://{final_host}:{final_port}[/green]") console.print() # Step 6: Show or execute if dry_run: console.print() console.print("[bold]Command:[/bold]") console.print() console.print(f" [dim]{' '.join(cmd)}[/dim]") console.print() return # Execute with prepared environment variables # Don't print "Server started" or API info here - let sglang's logs speak for themselves # The actual startup takes time and these messages are misleading # Print the command being executed console.print() console.print("[bold]Launching server with command:[/bold]") console.print() console.print(f" [dim]{' '.join(cmd)}[/dim]") console.print() try: # Execute directly without intercepting output or signals # This allows direct output to terminal and Ctrl+C to work naturally process = subprocess.run(cmd, env=env) sys.exit(process.returncode) except FileNotFoundError: from kt_kernel.cli.utils.sglang_checker import print_sglang_install_instructions print_error(t("sglang_not_found")) console.print() print_sglang_install_instructions() raise typer.Exit(1) except Exception as e: print_error(f"Failed to start server: {e}") raise typer.Exit(1) # Dead code removed: _find_model_path() and _find_weights_path() # These functions were part of the old builtin model system def _build_sglang_command( model_path: Path, weights_path: Optional[Path], host: str, port: int, gpu_experts: int, cpu_threads: int, numa_nodes: int, tensor_parallel_size: int, kt_method: str, kt_gpu_prefill_threshold: int, attention_backend: str, max_total_tokens: int, max_running_requests: int, chunked_prefill_size: int, mem_fraction_static: float, watchdog_timeout: int, served_model_name: str, disable_shared_experts_fusion: bool, tool_call_parser: Optional[str], reasoning_parser: Optional[str], settings, extra_model_params: Optional[dict] = None, # New parameter for additional params extra_cli_args: Optional[list[str]] = None, # Extra args from CLI to pass to sglang ) -> list[str]: """Build the SGLang launch command.""" cmd = [ sys.executable, "-m", "sglang.launch_server", "--host", host, "--port", str(port), "--model", str(model_path), ] # Add kt-kernel options # kt-kernel is needed for: # 1. Quantized models (when weights_path is provided) # 2. MoE models with CPU offloading (when kt-cpuinfer > 0 or kt-num-gpu-experts is configured) use_kt_kernel = False # Check if we should use kt-kernel if weights_path: # Quantized model - always use kt-kernel use_kt_kernel = True elif cpu_threads > 0 or gpu_experts > 1: # CPU offloading configured - use kt-kernel use_kt_kernel = True if use_kt_kernel: # Add kt-weight-path: use quantized weights if available, otherwise use model path weight_path_to_use = weights_path if weights_path else model_path # Add kt-kernel configuration cmd.extend( [ "--kt-weight-path", str(weight_path_to_use), "--kt-cpuinfer", str(cpu_threads), "--kt-threadpool-count", str(numa_nodes), "--kt-num-gpu-experts", str(gpu_experts), "--kt-method", kt_method, "--kt-gpu-prefill-token-threshold", str(kt_gpu_prefill_threshold), "--kt-enable-dynamic-expert-update", # Enable dynamic expert updates ] ) # Add SGLang options cmd.extend( [ "--attention-backend", attention_backend, "--trust-remote-code", "--mem-fraction-static", str(mem_fraction_static), "--chunked-prefill-size", str(chunked_prefill_size), "--max-running-requests", str(max_running_requests), "--max-total-tokens", str(max_total_tokens), "--watchdog-timeout", str(watchdog_timeout), "--enable-mixed-chunk", "--tensor-parallel-size", str(tensor_parallel_size), "--enable-p2p-check", ] ) # Add served model name if specified if served_model_name: cmd.extend(["--served-model-name", served_model_name]) # Add performance flags if disable_shared_experts_fusion: cmd.append("--disable-shared-experts-fusion") # Add FP8 backend if using FP8 method if "FP8" in kt_method.upper(): cmd.extend(["--fp8-gemm-backend", "triton"]) # Add parsers if specified if tool_call_parser: cmd.extend(["--tool-call-parser", tool_call_parser]) if reasoning_parser: cmd.extend(["--reasoning-parser", reasoning_parser]) # Add any extra parameters from model defaults that weren't explicitly handled if extra_model_params: # List of parameters already handled above handled_params = { "kt-num-gpu-experts", "kt-cpuinfer", "kt-threadpool-count", "kt-method", "kt-gpu-prefill-token-threshold", "attention-backend", "tensor-parallel-size", "max-total-tokens", "max-running-requests", "chunked-prefill-size", "mem-fraction-static", "watchdog-timeout", "served-model-name", "disable-shared-experts-fusion", } for key, value in extra_model_params.items(): if key not in handled_params: # Add unhandled parameters dynamically cmd.append(f"--{key}") if isinstance(value, bool): # Boolean flags don't need a value if not value: # For False boolean, skip the flag entirely cmd.pop() # Remove the flag we just added else: cmd.append(str(value)) # Add extra args from settings extra_args = settings.get("advanced.sglang_args", []) if extra_args: cmd.extend(extra_args) # Add extra CLI args (user-provided options not defined in kt CLI) if extra_cli_args: cmd.extend(extra_cli_args) return cmd def _interactive_model_selection(user_registry, settings) -> Optional[str]: """Show interactive model selection interface. Returns: Selected model name or None if cancelled. """ from rich.panel import Panel from rich.prompt import Prompt # Get all user models all_models = user_registry.list_models() if not all_models: console.print() print_warning("No models registered.") console.print() console.print(f" Add models with: [cyan]kt model scan[/cyan]") console.print(f" Or manually: [cyan]kt model add /path/to/model[/cyan]") console.print() return None console.print() console.print( Panel.fit( "Select a model to run", border_style="cyan", ) ) console.print() # Build choices list choices = [] choice_map = {} # index -> model name # Show all user models console.print(f"[bold green]Available Models:[/bold green]") console.print() for i, model in enumerate(all_models, 1): # Check if path exists path_status = "✓" if model.path_exists() else "✗ Missing" console.print(f" [cyan][{i}][/cyan] [bold]{model.name}[/bold] [{path_status}]") console.print(f" [dim]{model.format} - {model.path}[/dim]") choices.append(str(i)) choice_map[str(i)] = model.name console.print() # Add cancel option cancel_idx = str(len(choices) + 1) console.print(f" [cyan][{cancel_idx}][/cyan] [dim]Cancel[/dim]") choices.append(cancel_idx) console.print() # Prompt for selection try: selection = Prompt.ask( "Select model", choices=choices, default="1" if choices else cancel_idx, ) except KeyboardInterrupt: console.print() return None if selection == cancel_idx: return None return choice_map.get(selection)