Mirror of https://github.com/kvcache-ai/ktransformers.git, synced 2026-04-29 04:09:52 +00:00
kt-cli enhancement (#1834)
Some checks failed
Book-CI / test (push) Has been cancelled
Book-CI / test-1 (push) Has been cancelled
Book-CI / test-2 (push) Has been cancelled
Deploy / deploy (macos-latest) (push) Has been cancelled
Deploy / deploy (ubuntu-latest) (push) Has been cancelled
Deploy / deploy (windows-latest) (push) Has been cancelled
* [feat]: redesign kt run interactive configuration with i18n support
  - Redesign kt run with an 8-step interactive flow (model selection, inference method, NUMA/CPU, GPU experts, KV cache, GPU/TP selection, parsers, host/port)
  - Add configuration save/load system (~/.ktransformers/run_configs.yaml)
  - Add i18n support for kt chat (en/zh translations)
  - Add universal input validators with auto-retry and Chinese comma support
  - Add port availability checker with auto-suggestion
  - Add parser configuration (--tool-call-parser, --reasoning-parser)
  - Remove tuna command and clean up redundant files
  - Fix: variable reference bug in run.py; filter to show only MoE models

* [feat]: unify model selection UI and enable shared experts fusion by default
  - Unify the kt run model selection table with the kt model list display
    * Add Total size, MoE Size, Repo, and SHA256 status columns
    * Use consistent formatting and styling
    * Improve user decision-making with more information
  - Enable --disable-shared-experts-fusion by default
    * Change the default value from False to True
    * Users can still override with --enable-shared-experts-fusion

* [feat]: improve kt chat with performance metrics and better CJK support
  - Add performance metrics display after each response
    * Total time, TTFT (Time To First Token), TPOT (Time Per Output Token)
    * Accurate input/output token counts using the model tokenizer
    * Fall back to estimation if the tokenizer is unavailable
    * Metrics shown in dim style (not prominent)
  - Fix Chinese character input issues
    * Replace Prompt.ask() with console.input() for better CJK support
    * Fixes backspace deletion showing half-characters
  - Suppress NumPy subnormal warnings
    * Filter "The value of the smallest subnormal" warnings
    * Cleaner CLI output on certain hardware environments

* [fix]: correct TTFT measurement in kt chat
  - Move start_time initialization before the API call
  - Previously start_time was set when the first chunk was received, causing TTFT ≈ 0ms
  - Now correctly measures the time from request sent to first token received

* [docs]: Add Clawdbot integration guide - a KTransformers enterprise AI assistant deployment scheme

* [docs]: Emphasize Kimi K2.5 as the recommended core model, highlighting enterprise-grade inference capability

* [docs]: Add a link to the Clawdbot Feishu integration tutorial

* [feat]: improve CLI table display, model verification, and chat experience
  - Add a sequence number (#) column to all model tables by default
  - Filter kt edit to show only MoE GPU models (exclude AMX)
  - Extend kt model verify to check *.json and *.py files in addition to weights
  - Fix a re-verification bug where repaired files caused false failures
  - Suppress tokenizer debug output in kt chat token counting

* [fix]: fix CPU cores.

---------

Co-authored-by: skqliao <skqliao@gmail.com>
This commit is contained in:
parent
4f64665758
commit
56cbd69ac4
23 changed files with 10327 additions and 781 deletions
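The TTFT fix called out in the commit message comes down to starting the timer before the request is sent rather than when the first chunk arrives. A minimal, self-contained sketch of that measurement pattern against an OpenAI-compatible endpoint (the base URL, model id, and prompt below are illustrative placeholders, not values taken from this commit):

```python
# Sketch only: measuring TTFT/TPOT around a streaming chat completion.
import time

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")  # placeholder endpoint

start_time = time.time()  # start before the request; starting it on the first chunk made TTFT read as ~0 ms
first_token_time = None
chunk_count = 0

stream = client.chat.completions.create(
    model="placeholder-model",  # placeholder model id
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta if chunk.choices else None
    if delta and delta.content:
        if first_token_time is None:
            first_token_time = time.time()  # first token actually received
        chunk_count += 1

end_time = time.time()
if first_token_time and chunk_count > 1:
    ttft = first_token_time - start_time                      # time to first token
    tpot = (end_time - first_token_time) / (chunk_count - 1)  # time per output token (chunk-based)
    print(f"TTFT: {ttft * 1000:.0f}ms | TPOT: {tpot * 1000:.1f}ms")
```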
@@ -96,9 +96,9 @@ def chat(
         kt chat -t 0.9 --max-tokens 4096 # Adjust generation parameters
     """
     if not HAS_OPENAI:
-        print_error("OpenAI Python SDK is required for chat functionality.")
+        print_error(t("chat_openai_required"))
         console.print()
-        console.print("Install it with:")
+        console.print(t("chat_install_hint"))
         console.print(" pip install openai")
         raise typer.Exit(1)

@@ -114,10 +114,10 @@ def chat(
     console.print()
     console.print(
         Panel.fit(
-            f"[bold cyan]KTransformers Chat[/bold cyan]\n\n"
-            f"Server: [yellow]{final_host}:{final_port}[/yellow]\n"
-            f"Temperature: [cyan]{temperature}[/cyan] | Max tokens: [cyan]{max_tokens}[/cyan]\n\n"
-            f"[dim]Type '/help' for commands, '/quit' to exit[/dim]",
+            f"[bold cyan]{t('chat_title')}[/bold cyan]\n\n"
+            f"{t('chat_server')}: [yellow]{final_host}:{final_port}[/yellow]\n"
+            f"{t('chat_temperature')}: [cyan]{temperature}[/cyan] | {t('chat_max_tokens')}: [cyan]{max_tokens}[/cyan]\n\n"
+            f"[dim]{t('chat_help_hint')}[/dim]",
             border_style="cyan",
         )
     )

@@ -152,31 +152,44 @@ def chat(
        )

        # Test connection
-       print_info("Connecting to server...")
+       print_info(t("chat_connecting"))
        models = client.models.list()
        available_models = [m.id for m in models.data]

        if not available_models:
-           print_error("No models available on server")
+           print_error(t("chat_no_models"))
            raise typer.Exit(1)

        # Select model
        if model:
            if model not in available_models:
-               print_warning(f"Model '{model}' not found. Available models: {', '.join(available_models)}")
+               print_warning(t("chat_model_not_found", model=model, available=", ".join(available_models)))
                selected_model = available_models[0]
            else:
                selected_model = model
        else:
            selected_model = available_models[0]

-       print_success(f"Connected to model: {selected_model}")
+       print_success(t("chat_connected", model=selected_model))
        console.print()

+       # Load tokenizer for accurate token counting
+       tokenizer = None
+       try:
+           from transformers import AutoTokenizer
+
+           # selected_model is the model path
+           tokenizer = AutoTokenizer.from_pretrained(selected_model, trust_remote_code=True)
+           console.print(f"[dim]Loaded tokenizer from {selected_model}[/dim]")
+           console.print()
+       except Exception as e:
+           console.print(f"[dim yellow]Warning: Could not load tokenizer, token counts will be estimated[/dim]")
+           console.print()
+
    except Exception as e:
-       print_error(f"Failed to connect to server: {e}")
+       print_error(t("chat_connect_failed", error=str(e)))
        console.print()
-       console.print("Make sure the model server is running:")
+       console.print(t("chat_server_not_running"))
        console.print(" kt run <model>")
        raise typer.Exit(1)

@@ -201,12 +214,12 @@ def chat(
     # Main chat loop
     try:
         while True:
-            # Get user input
+            # Get user input - use console.input() for better CJK character support
             try:
-                user_input = Prompt.ask("[bold green]You[/bold green]")
+                user_input = console.input(f"[bold green]{t('chat_user_prompt')}[/bold green]: ")
             except (EOFError, KeyboardInterrupt):
                 console.print()
-                print_info("Goodbye!")
+                print_info(t("chat_goodbye"))
                 break

             if not user_input.strip():

@@ -224,15 +237,19 @@ def chat(

             # Generate response
             console.print()
-            console.print("[bold cyan]Assistant[/bold cyan]")
+            console.print(f"[bold cyan]{t('chat_assistant_prompt')}[/bold cyan]")

             try:
                 if stream:
                     # Streaming response
-                    response_content = _stream_response(client, selected_model, messages, temperature, max_tokens)
+                    response_content = _stream_response(
+                        client, selected_model, messages, temperature, max_tokens, tokenizer
+                    )
                 else:
                     # Non-streaming response
-                    response_content = _generate_response(client, selected_model, messages, temperature, max_tokens)
+                    response_content = _generate_response(
+                        client, selected_model, messages, temperature, max_tokens, tokenizer
+                    )

                 # Add assistant response to history
                 messages.append({"role": "assistant", "content": response_content})

@@ -240,7 +257,7 @@ def chat(
                 console.print()

             except Exception as e:
-                print_error(f"Error generating response: {e}")
+                print_error(t("chat_generation_error", error=str(e)))
                 # Remove the user message that caused the error
                 messages.pop()
                 continue

@@ -252,12 +269,12 @@ def chat(
     except KeyboardInterrupt:
         console.print()
         console.print()
-        print_info("Chat interrupted. Goodbye!")
+        print_info(t("chat_interrupted"))

     # Final history save
     if save_history and messages:
         _save_history(history_file, messages, selected_model)
-        console.print(f"[dim]History saved to: {history_file}[/dim]")
+        console.print(f"[dim]{t('chat_history_saved', path=str(history_file))}[/dim]")
         console.print()


@@ -267,12 +284,22 @@ def _stream_response(
     messages: list,
     temperature: float,
     max_tokens: int,
+    tokenizer=None,
 ) -> str:
     """Generate streaming response and display in real-time."""
+    import time
+
     response_content = ""
     reasoning_content = ""

+    # Performance tracking
+    first_token_time = None
+    chunk_count = 0
+
     try:
+        # Start timing before sending request
+        start_time = time.time()
+
         stream = client.chat.completions.create(
             model=model,
             messages=messages,

@@ -282,33 +309,120 @@ def _stream_response(
         )

         for chunk in stream:
-            delta = chunk.choices[0].delta
-            reasoning_delta = getattr(delta, "reasoning_content", None)
-            if reasoning_delta:
-                reasoning_content += reasoning_delta
-                console.print(reasoning_delta, end="", style="dim")
-            if delta.content:
-                content = delta.content
-                response_content += content
-                console.print(content, end="")
+            delta = chunk.choices[0].delta if chunk.choices else None
+            if delta:
+                reasoning_delta = getattr(delta, "reasoning_content", None)
+                if reasoning_delta:
+                    if first_token_time is None:
+                        first_token_time = time.time()
+                    reasoning_content += reasoning_delta
+                    console.print(reasoning_delta, end="", style="dim")
+                    chunk_count += 1
+
+                if delta.content:
+                    if first_token_time is None:
+                        first_token_time = time.time()
+                    content = delta.content
+                    response_content += content
+                    console.print(content, end="")
+                    chunk_count += 1

         console.print() # Newline after streaming

+        # Display performance metrics
+        end_time = time.time()
+        if first_token_time and chunk_count > 0:
+            ttft = first_token_time - start_time
+            total_time = end_time - start_time
+
+            # Calculate TPOT based on chunks
+            if chunk_count > 1:
+                generation_time = total_time - ttft
+                tpot = generation_time / (chunk_count - 1)
+            else:
+                tpot = 0
+
+            # Calculate accurate token counts using tokenizer
+            if tokenizer:
+                input_tokens = _count_tokens_with_tokenizer(messages, tokenizer)
+                output_tokens = _count_tokens_with_tokenizer(
+                    [{"role": "assistant", "content": response_content}], tokenizer
+                )
+                token_prefix = ""
+            else:
+                # Fallback to estimation
+                input_tokens = _estimate_tokens(messages)
+                output_tokens = _estimate_tokens([{"role": "assistant", "content": response_content}])
+                token_prefix = "~"
+
+            # Build metrics display
+            metrics = f"[dim]Total: {total_time*1000:.0f}ms | TTFT: {ttft*1000:.0f}ms"
+            if tpot > 0:
+                metrics += f" | TPOT: {tpot*1000:.1f}ms"
+            metrics += f" | In: {token_prefix}{input_tokens} | Out: {token_prefix}{output_tokens}"
+            metrics += "[/dim]"
+
+            console.print(metrics)
+
     except Exception as e:
         raise Exception(f"Streaming error: {e}")

     return response_content


+def _count_tokens_with_tokenizer(messages: list, tokenizer) -> int:
+    """Count tokens accurately using the model's tokenizer."""
+    try:
+        # Concatenate all message content
+        text = ""
+        for msg in messages:
+            role = msg.get("role", "")
+            content = msg.get("content", "")
+            # Simple format: role + content
+            text += f"{role}: {content}\n"
+
+        # Encode and count tokens - suppress any debug output from custom tokenizers
+        import os
+        import sys
+        from contextlib import redirect_stdout, redirect_stderr
+
+        with open(os.devnull, "w") as devnull:
+            with redirect_stdout(devnull), redirect_stderr(devnull):
+                tokens = tokenizer.encode(text, add_special_tokens=True)
+        return len(tokens)
+    except Exception:
+        # Fallback to estimation if tokenizer fails
+        return _estimate_tokens(messages)
+
+
+def _estimate_tokens(messages: list) -> int:
+    """Estimate token count for messages (rough approximation)."""
+    total_chars = 0
+    for msg in messages:
+        content = msg.get("content", "")
+        total_chars += len(content)
+
+    # Rough estimation:
+    # - English: ~4 chars per token
+    # - Chinese: ~1.5 chars per token
+    # Use 2.5 as average
+    return max(1, int(total_chars / 2.5))
+
+
 def _generate_response(
     client: "OpenAI",
     model: str,
     messages: list,
     temperature: float,
     max_tokens: int,
+    tokenizer=None,
 ) -> str:
     """Generate non-streaming response."""
+    import time
+
     try:
+        start_time = time.time()
+
         response = client.chat.completions.create(
             model=model,
             messages=messages,

@@ -317,12 +431,36 @@ def _generate_response(
             stream=False,
         )

+        end_time = time.time()
+        total_time = end_time - start_time
+
         content = response.choices[0].message.content

         # Display as markdown
         md = Markdown(content)
         console.print(md)

+        # Calculate accurate token counts using tokenizer
+        if tokenizer:
+            input_tokens = _count_tokens_with_tokenizer(messages, tokenizer)
+            output_tokens = _count_tokens_with_tokenizer([{"role": "assistant", "content": content}], tokenizer)
+            token_prefix = ""
+        else:
+            # Fallback to API usage or estimation
+            input_tokens = response.usage.prompt_tokens if response.usage else _estimate_tokens(messages)
+            output_tokens = (
+                response.usage.completion_tokens
+                if response.usage
+                else _estimate_tokens([{"role": "assistant", "content": content}])
+            )
+            token_prefix = "" if response.usage else "~"
+
+        # Display performance metrics
+        console.print(
+            f"[dim]Time: {total_time*1000:.0f}ms | "
+            f"In: {token_prefix}{input_tokens} | Out: {token_prefix}{output_tokens}[/dim]"
+        )
+
         return content

     except Exception as e:

@@ -335,20 +473,14 @@ def _handle_command(command: str, messages: list, temperature: float, max_tokens

     if cmd in ["/quit", "/exit", "/q"]:
         console.print()
-        print_info("Goodbye!")
+        print_info(t("chat_goodbye"))
         return False

     elif cmd in ["/help", "/h"]:
         console.print()
         console.print(
             Panel(
-                "[bold]Available Commands:[/bold]\n\n"
-                "/help, /h - Show this help message\n"
-                "/quit, /exit, /q - Exit chat\n"
-                "/clear, /c - Clear conversation history\n"
-                "/history, /hist - Show conversation history\n"
-                "/info, /i - Show current settings\n"
-                "/retry, /r - Regenerate last response",
+                f"[bold]{t('chat_help_title')}[/bold]\n\n{t('chat_help_content')}",
                 title="Help",
                 border_style="cyan",
             )

@@ -359,19 +491,19 @@ def _handle_command(command: str, messages: list, temperature: float, max_tokens
     elif cmd in ["/clear", "/c"]:
         messages.clear()
         console.print()
-        print_success("Conversation history cleared")
+        print_success(t("chat_history_cleared"))
         console.print()
         return True

     elif cmd in ["/history", "/hist"]:
         console.print()
         if not messages:
-            print_info("No conversation history")
+            print_info(t("chat_no_history"))
         else:
             console.print(
                 Panel(
                     _format_history(messages),
-                    title=f"History ({len(messages)} messages)",
+                    title=t("chat_history_title", count=len(messages)),
                     border_style="cyan",
                 )
             )

@@ -382,10 +514,7 @@ def _handle_command(command: str, messages: list, temperature: float, max_tokens
         console.print()
         console.print(
             Panel(
-                f"[bold]Current Settings:[/bold]\n\n"
-                f"Temperature: [cyan]{temperature}[/cyan]\n"
-                f"Max tokens: [cyan]{max_tokens}[/cyan]\n"
-                f"Messages: [cyan]{len(messages)}[/cyan]",
+                f"[bold]{t('chat_info_title')}[/bold]\n\n{t('chat_info_content', temperature=temperature, max_tokens=max_tokens, messages=len(messages))}",
                 title="Info",
                 border_style="cyan",
             )

@@ -397,16 +526,16 @@ def _handle_command(command: str, messages: list, temperature: float, max_tokens
         if len(messages) >= 2 and messages[-1]["role"] == "assistant":
             # Remove last assistant response
             messages.pop()
-            print_info("Retrying last response...")
+            print_info(t("chat_retrying"))
             console.print()
         else:
-            print_warning("No previous response to retry")
+            print_warning(t("chat_no_retry"))
             console.print()
         return True

     else:
-        print_warning(f"Unknown command: {command}")
-        console.print("[dim]Type /help for available commands[/dim]")
+        print_warning(t("chat_unknown_command", command=command))
+        console.print(f"[dim]{t('chat_unknown_hint')}[/dim]")
         console.print()
         return True