diff --git a/Makefile b/Makefile
index a8a6095bf..f355367aa 100644
--- a/Makefile
+++ b/Makefile
@@ -766,9 +766,9 @@ clean:
 	rm -vrf llguidance

 # useful tools
-main: tools/main/main.cpp common/arg.cpp common/download.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
+main: tools/completion/completion.cpp common/arg.cpp common/download.cpp build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_default.o llava.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-mainvk: tools/main/main.cpp common/arg.cpp common/download.cpp build-info.h ggml_v4_vulkan.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o ggml-vulkan.o ggml-vulkan-shaders.o ggml-repack.o $(OBJS_FULL) $(OBJS) lib/vulkan-1.lib
+mainvk: tools/completion/completion.cpp common/arg.cpp common/download.cpp build-info.h ggml_v4_vulkan.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o llavaclip_vulkan.o llava.o ggml-backend_vulkan.o ggml-backend-reg_vulkan.o ggml-vulkan.o ggml-vulkan-shaders.o ggml-repack.o $(OBJS_FULL) $(OBJS) lib/vulkan-1.lib
 	$(CXX) $(CXXFLAGS) -DGGML_USE_VULKAN -DSD_USE_VULKAN $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 sdmain: otherarch/sdcpp/util.cpp otherarch/sdcpp/main.cpp otherarch/sdcpp/stable-diffusion.cpp otherarch/sdcpp/upscaler.cpp otherarch/sdcpp/model.cpp otherarch/sdcpp/name_conversion.cpp otherarch/sdcpp/tokenize_util.cpp otherarch/sdcpp/thirdparty/zip.c build-info.h ggml.o ggml-cpu.o ggml-ops.o ggml-vec.o ggml-binops.o ggml-unops.o llama.o console.o ggml-backend_default.o ggml-backend-reg_default.o ggml-repack.o $(OBJS_FULL) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
diff --git a/common/arg.cpp b/common/arg.cpp
index bd64da20b..05f41036e 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -53,6 +53,7 @@ using json = nlohmann::ordered_json;
 static std::initializer_list<enum llama_example> mmproj_examples = {
     LLAMA_EXAMPLE_MTMD,
     LLAMA_EXAMPLE_SERVER,
+    LLAMA_EXAMPLE_CLI,
 };

 static std::string read_file(const std::string & fname) {
@@ -470,6 +471,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         ));
     }

+    common_log_set_verbosity_thold(params.verbosity);
+
     return true;
 }

@@ -792,7 +795,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.display_prompt = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"-co", "--color"}, "[on|off|auto]",
         "Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')\n"
         [](common_params & params, const std::string & value) {
                 string_format("error: unknown value for --color: '%s'\n", value.c_str()));
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
     add_opt(common_arg(
         {"-t", "--threads"}, "N",
         string_format("number of CPU threads to use during generation (default: %d)", params.cpuparams.n_threads),
@@ -942,7 +945,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"-n", "--predict", "--n-predict"}, "N",
         string_format(
-            ex == LLAMA_EXAMPLE_MAIN
+            ex == LLAMA_EXAMPLE_COMPLETION
                 ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
                 : "number of tokens to predict (default: %d, -1 = infinity)",
             params.n_predict),
@@ -986,7 +989,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.n_ctx_checkpoints = value;
         }
-    ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--cache-ram", "-cram"}, "N",
         string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
@@ -994,7 +997,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.cache_ram_mib = value;
         }
-    ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--kv-unified", "-kvu"},
         string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
@@ -1009,14 +1012,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.ctx_shift = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
     add_opt(common_arg(
         {"--context-shift"},
         string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
         [](common_params & params) {
             params.ctx_shift = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
     add_opt(common_arg(
         {"--chunks"}, "N",
         string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -1052,7 +1055,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.system_prompt = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
     add_opt(common_arg(
         {"--no-perf"},
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@@ -1061,6 +1064,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sampling.no_perf = true;
         }
     ).set_env("LLAMA_ARG_NO_PERF"));
+    add_opt(common_arg(
+        {"--no-show-timings"},
+        string_format("disable timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
+        [](common_params & params) {
+            params.show_timings = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_NO_SHOW_TIMINGS"));
     add_opt(common_arg(
         {"-f", "--file"}, "FNAME",
         "a file containing the prompt (default: none)",
@@ -1082,7 +1092,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.system_prompt.pop_back();
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
     add_opt(common_arg(
         {"--in-file"}, "FNAME",
         "an input file (repeat to specify multiple files)",
@@ -1130,42 +1140,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.n_print = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"--prompt-cache"}, "FNAME",
         "file to cache prompt state for faster startup (default: none)",
         [](common_params & params, const std::string & value) {
             params.path_prompt_cache = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"--prompt-cache-all"},
         "if specified, saves user input and generations to cache as well\n",
         [](common_params & params) {
             params.prompt_cache_all = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"--prompt-cache-ro"},
         "if specified, uses the prompt cache but does not update it",
         [](common_params & params) {
             params.prompt_cache_ro = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"-r", "--reverse-prompt"}, "PROMPT",
         "halt generation at PROMPT, return control in interactive mode\n",
         [](common_params & params, const std::string & value) {
             params.antiprompt.emplace_back(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-sp", "--special"},
         string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
         [](common_params & params) {
             params.special = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-cnv", "--conversation"},
         "run in conversation mode:\n"
@@ -1175,14 +1185,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"-no-cnv", "--no-conversation"},
         "force disable conversation mode (default: false)",
         [](common_params & params) {
             params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"-st", "--single-turn"},
         "run conversation for a single turn only, then exit when done\n"
@@ -1191,28 +1201,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.single_turn = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"-i", "--interactive"},
         string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
         [](common_params & params) {
             params.interactive = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"-if", "--interactive-first"},
         string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
         [](common_params & params) {
             params.interactive_first = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"-mli", "--multiline-input"},
         "allows you to write or paste multiple lines without ending each in '\\'",
         [](common_params & params) {
             params.multiline_input = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--in-prefix-bos"},
         "prefix BOS to user inputs, preceding the `--in-prefix` string",
@@ -1220,7 +1230,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.input_prefix_bos = true;
             params.enable_chat_template = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"--in-prefix"}, "STRING",
         "string to prefix user inputs with (default: empty)",
@@ -1228,7 +1238,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.input_prefix = value;
             params.enable_chat_template = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"--in-suffix"}, "STRING",
         "string to suffix after user inputs with (default: empty)",
@@ -1236,14 +1246,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.input_suffix = value;
             params.enable_chat_template = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"--no-warmup"},
         "skip warming up the model with an empty run",
         [](common_params & params) {
             params.warmup = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--spm-infill"},
         string_format(
@@ -1634,14 +1644,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.grp_attn_n = value;
         }
-    ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY}));
+    ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_PASSKEY}));
     add_opt(common_arg(
         {"-gaw", "--grp-attn-w"}, "N",
         string_format("group-attention width (default: %d)", params.grp_attn_w),
         [](common_params & params, int value) {
             params.grp_attn_w = value;
         }
-    ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"-nkvo", "--no-kv-offload"},
         "disable KV offload",
@@ -1831,7 +1841,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.image.emplace_back(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_MTMD}));
+    ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--image-min-tokens"}, "N",
         "minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
@@ -1924,7 +1934,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "override tensor buffer type for draft model",
         [](common_params & params, const std::string & value) {
             parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--cpu-moe", "-cmoe"},
         "keep all Mixture of Experts (MoE) weights in the CPU",
@@ -1953,7 +1963,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
     add_opt(common_arg(
         {"--n-cpu-moe-draft", "-ncmoed"}, "N",
         "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
@@ -1967,7 +1977,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
@@ -2446,7 +2456,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.default_template_kwargs[item.key()] = item.value().dump();
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
     add_opt(common_arg(
         {"-to", "--timeout"}, "N",
         string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
@@ -2555,14 +2565,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.use_jinja = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--no-jinja"},
         string_format("disable jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
         [](common_params & params) {
             params.use_jinja = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
@@ -2573,7 +2583,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.reasoning_format = common_reasoning_format_from_name(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK"));
     add_opt(common_arg(
         {"--reasoning-budget"}, "N",
         "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
@@ -2581,7 +2591,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
             params.reasoning_budget = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         string_format(
@@ -2593,7 +2603,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.chat_template = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
     add_opt(common_arg(
         {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
         string_format(
@@ -2605,7 +2615,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.chat_template = read_file(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
     add_opt(common_arg(
         {"--no-prefill-assistant"},
         string_format(
@@ -2636,7 +2646,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.simple_io = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--positive-file"}, "FNAME",
         string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
@@ -2719,7 +2729,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
         [](common_params & params) {
             params.verbosity = INT_MAX;
-            common_log_set_verbosity_thold(INT_MAX);
         }
     ));
     add_opt(common_arg(
@@ -2740,7 +2749,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "(default: %d)\n", params.verbosity),
         [](common_params & params, int value) {
             params.verbosity = value;
-            common_log_set_verbosity_thold(value);
         }
     ).set_env("LLAMA_LOG_VERBOSITY"));
     add_opt(common_arg(
@@ -2873,14 +2881,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.speculative.n_max = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MAX"));
     add_opt(common_arg(
         {"--draft-min", "--draft-n-min"}, "N",
         string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
         [](common_params & params, int value) {
             params.speculative.n_min = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MIN"));
     add_opt(common_arg(
         {"--draft-p-split"}, "P",
         string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
@@ -2894,14 +2902,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.speculative.p_min = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
     add_opt(common_arg(
         {"-cd", "--ctx-size-draft"}, "N",
         string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
         [](common_params & params, int value) {
             params.speculative.n_ctx = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
     add_opt(common_arg(
         {"-devd", "--device-draft"}, "<dev1,dev2,..>",
         "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -2909,7 +2917,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.speculative.devices = parse_device_list(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
         "number of layers to store in VRAM for the draft model",
@@ -2921,21 +2929,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
     add_opt(common_arg(
         {"-md", "--model-draft"}, "FNAME",
         "draft model for speculative decoding (default: unused)",
         [](common_params & params, const std::string & value) {
             params.speculative.model.path = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_MODEL_DRAFT"));
     add_opt(common_arg(
         {"--spec-replace"}, "TARGET", "DRAFT",
         "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
         [](common_params & params, const std::string & tgt, const std::string & dft) {
             params.speculative.replacements.push_back({ tgt, dft });
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"-ctkd", "--cache-type-k-draft"}, "TYPE",
         string_format(
@@ -3199,7 +3207,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_jinja = true;
             //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));

     add_opt(common_arg(
         {"--gpt-oss-120b-default"},
@@ -3218,7 +3226,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_jinja = true;
             //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));

     add_opt(common_arg(
         {"--vision-gemma-4b-default"},
@@ -3229,7 +3237,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_ctx = 0;
             params.use_jinja = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));

     add_opt(common_arg(
         {"--vision-gemma-12b-default"},
@@ -3240,7 +3248,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_ctx = 0;
             params.use_jinja = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));

     return ctx_arg;
 }
diff --git a/common/common.h b/common/common.h
index df920c6db..3f99817de 100644
--- a/common/common.h
+++ b/common/common.h
@@ -78,7 +78,8 @@ int32_t cpu_get_num_math();
 enum llama_example {
     LLAMA_EXAMPLE_COMMON,
     LLAMA_EXAMPLE_SPECULATIVE,
-    LLAMA_EXAMPLE_MAIN,
+    LLAMA_EXAMPLE_COMPLETION,
+    LLAMA_EXAMPLE_CLI,
     LLAMA_EXAMPLE_EMBEDDING,
     LLAMA_EXAMPLE_PERPLEXITY,
     LLAMA_EXAMPLE_RETRIEVAL,
@@ -402,6 +403,7 @@ struct common_params {
     bool simple_io     = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true;  // insert new sequences for decoding on-the-fly
     bool no_perf       = false; // disable performance metrics
+    bool show_timings  = true;  // show timing information on CLI
     bool ctx_shift     = false; // context shift on infinite text generation
     bool swa_full      = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     bool kv_unified    = false; // enable unified KV cache
diff --git a/common/console.cpp b/common/console.cpp
index 5e9901e4a..2ea178f81 100644
--- a/common/console.cpp
+++ b/common/console.cpp
@@ -1,4 +1,5 @@
 #include "console.h"
+#include "log.h"
 #include
 #include
 #include
@@ -6,6 +7,10 @@
 #include
 #include
 #include
+#include <chrono>
+#include <condition_variable>
+#include <mutex>
+#include <thread>

 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
@@ -35,6 +40,7 @@
 #define ANSI_COLOR_BLUE    "\x1b[34m"
 #define ANSI_COLOR_MAGENTA "\x1b[35m"
 #define ANSI_COLOR_CYAN    "\x1b[36m"
+#define ANSI_COLOR_GRAY    "\x1b[90m"
 #define ANSI_COLOR_RESET   "\x1b[0m"
 #define ANSI_BOLD          "\x1b[1m"

@@ -61,17 +67,17 @@ namespace console {
     //
 #endif

-    static bool      advanced_display = false;
-    static bool      simple_io        = true;
-    static display_t current_display  = reset;
+    static bool         advanced_display = false;
+    static bool         simple_io        = true;
+    static display_type current_display  = DISPLAY_TYPE_RESET;

-    static FILE*     out = stdout;
+    static FILE*        out = stdout;

 #if defined (_WIN32)
-    static void*     hConsole;
+    static void*        hConsole;
 #else
-    static FILE*     tty = nullptr;
-    static termios   initial_state;
+    static FILE*        tty = nullptr;
+    static termios      initial_state;
 #endif

     //
@@ -142,7 +148,7 @@ namespace console {

     void cleanup() {
         // Reset console display
-        set_display(reset);
+        set_display(DISPLAY_TYPE_RESET);

 #if !defined(_WIN32)
         // Restore settings on POSIX systems
@@ -162,20 +168,26 @@ namespace console {
     //

     // Keep track of current display and only emit ANSI code if it changes
-    void set_display(display_t display) {
+    void set_display(display_type display) {
         if (advanced_display && current_display != display) {
-            fflush(stdout);
+            common_log_flush(common_log_main());
             switch(display) {
-                case reset:
+                case DISPLAY_TYPE_RESET:
                     fprintf(out, ANSI_COLOR_RESET);
                     break;
-                case prompt:
+                case DISPLAY_TYPE_INFO:
+                    fprintf(out, ANSI_COLOR_MAGENTA);
+                    break;
+                case DISPLAY_TYPE_PROMPT:
                     fprintf(out, ANSI_COLOR_YELLOW);
                     break;
-                case user_input:
+                case DISPLAY_TYPE_REASONING:
+                    fprintf(out, ANSI_COLOR_GRAY);
+                    break;
+                case DISPLAY_TYPE_USER_INPUT:
                     fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN);
                     break;
-                case error:
+                case DISPLAY_TYPE_ERROR:
                     fprintf(out, ANSI_BOLD ANSI_COLOR_RED);
             }
             current_display = display;
@@ -778,7 +790,6 @@ namespace console {
             }

             if (is_special_char) {
-                set_display(user_input);
                 replace_last(line.back());
                 is_special_char = false;
             }
@@ -961,7 +972,6 @@ namespace console {
             }

             if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
-                set_display(prompt);
                 replace_last(line.back());
                 is_special_char = true;
             }
@@ -1046,12 +1056,82 @@ namespace console {
     }

     bool readline(std::string & line, bool multiline_input) {
-        set_display(user_input);
-
         if (simple_io) {
             return readline_simple(line, multiline_input);
         }
         return readline_advanced(line, multiline_input);
     }
+
+    namespace spinner {
+        static const char LOADING_CHARS[] = {'|', '/', '-', '\\'};
+        static std::condition_variable cv_stop;
+        static std::thread th;
+        static size_t frame   = 0; // only modified by one thread
+        static bool   running = false;
+        static std::mutex mtx;
+        static auto wait_time = std::chrono::milliseconds(100);
+        static void draw_next_frame() {
+            // don't need lock because only one thread modifies running
+            frame = (frame + 1) % sizeof(LOADING_CHARS);
+            replace_last(LOADING_CHARS[frame]);
+            fflush(out);
+        }
+        void start() {
+            std::unique_lock lock(mtx);
+            if (simple_io || running) {
+                return;
+            }
+            common_log_flush(common_log_main());
+            fprintf(out, "%c", LOADING_CHARS[0]);
+            fflush(out);
+            frame   = 1;
+            running = true;
+            th = std::thread([]() {
+                std::unique_lock lock(mtx);
+                while (true) {
+                    if (cv_stop.wait_for(lock, wait_time, []{ return !running; })) {
+                        break;
+                    }
+                    draw_next_frame();
+                }
+            });
+        }
+        void stop() {
+            {
+                std::unique_lock lock(mtx);
+                if (simple_io || !running) {
+                    return;
+                }
+                running = false;
+                cv_stop.notify_all();
+            }
+            if (th.joinable()) {
+                th.join();
+            }
+            replace_last(' ');
+            pop_cursor();
+            fflush(out);
+        }
+    }
+
+    void log(const char * fmt, ...) {
+        va_list args;
+        va_start(args, fmt);
+        vfprintf(out, fmt, args);
+        va_end(args);
+    }
+
+    void error(const char * fmt, ...) {
+        va_list args;
+        va_start(args, fmt);
+        display_type cur = current_display;
+        set_display(DISPLAY_TYPE_ERROR);
+        vfprintf(out, fmt, args);
+        set_display(cur); // restore previous color
+        va_end(args);
+    }
+
+    void flush() {
+        fflush(out);
+    }
 }
diff --git a/common/console.h b/common/console.h
index ec175269b..fad6d3953 100644
--- a/common/console.h
+++ b/common/console.h
@@ -2,18 +2,40 @@

 #pragma once

+#include "common.h"
+
 #include <string>

-namespace console {
-    enum display_t {
-        reset = 0,
-        prompt,
-        user_input,
-        error
-    };
+enum display_type {
+    DISPLAY_TYPE_RESET = 0,
+    DISPLAY_TYPE_INFO,
+    DISPLAY_TYPE_PROMPT,
+    DISPLAY_TYPE_REASONING,
+    DISPLAY_TYPE_USER_INPUT,
+    DISPLAY_TYPE_ERROR
+};

+namespace console {
     void init(bool use_simple_io, bool use_advanced_display);
     void cleanup();
-    void set_display(display_t display);
+    void set_display(display_type display);
     bool readline(std::string & line, bool multiline_input);
+
+    namespace spinner {
+        void start();
+        void stop();
+    }
+
+    // note: the logging API below outputs directly to stdout
+    // it can negatively impact performance if used on the inference thread
+    // only use it in a dedicated CLI thread
+    // for logging in the inference thread, use log.h instead

+    LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+    void log(const char * fmt, ...);
+
+    LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+    void error(const char * fmt, ...);
+
+    void flush();
 }
diff --git a/common/log.cpp b/common/log.cpp
index 00a03f158..b17d2b62c 100644
--- a/common/log.cpp
+++ b/common/log.cpp
@@ -420,6 +420,11 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps) {
     log->set_timestamps(timestamps);
 }

+void common_log_flush(struct common_log * log) {
+    log->pause();
+    log->resume();
+}
+
 static int common_get_verbosity(enum ggml_log_level level) {
     switch (level) {
         case GGML_LOG_LEVEL_DEBUG: return LOG_LEVEL_DEBUG;
diff --git a/common/log.h b/common/log.h
index b24f5f000..f0f8471b5 100644
--- a/common/log.h
+++ b/common/log.h
@@ -84,6 +84,7 @@ void common_log_set_file      (struct common_log * log, const char * file); // not thread-safe
 void common_log_set_colors    (struct common_log * log, log_colors colors); // not thread-safe
 void common_log_set_prefix    (struct common_log * log, bool prefix);       // whether to output prefix to each log
 void common_log_set_timestamps(struct common_log * log, bool timestamps);   // whether to output timestamps in the prefix
+void common_log_flush         (struct common_log * log);                    // flush all pending log messages

 // helper macros for logging
 // use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index bc0053f5a..cb9fe4056 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -4326,6 +4326,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                 case GGML_UNARY_OP_EXPM1:
                 case GGML_UNARY_OP_SOFTPLUS:
                 case GGML_UNARY_OP_ELU:
+                case GGML_UNARY_OP_XIELU:
                 case GGML_UNARY_OP_FLOOR:
                 case GGML_UNARY_OP_CEIL:
                 case GGML_UNARY_OP_ROUND:
diff --git a/tools/cli/CMakeLists.txt b/tools/cli/CMakeLists.txt
new file mode 100644
index 000000000..b08fff4c2
--- /dev/null
+++ b/tools/cli/CMakeLists.txt
@@ -0,0 +1,10 @@
+set(TARGET llama-cli)
+add_executable(${TARGET} cli.cpp)
+target_link_libraries(${TARGET} PRIVATE server-context PUBLIC common ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+include_directories(../server)
+
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} RUNTIME)
+endif()
diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp
new file mode 100644
index 000000000..8a8639207
--- /dev/null
+++ b/tools/cli/cli.cpp
@@ -0,0 +1,395 @@
+#include "common.h"
+#include "arg.h"
+#include "console.h"
+// #include "log.h"
+
+#include "server-context.h"
+#include "server-task.h"
+
+#include <atomic>
+#include <fstream>
+#include <string>
+#include <thread>
+
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#    define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
+const char * LLAMA_ASCII_LOGO = R"(
+▄▄ ▄▄
+██ ██
+██ ██ ▀▀█▄ ███▄███▄ ▀▀█▄ ▄████ ████▄ ████▄
+██ ██ ▄█▀██ ██ ██ ██ ▄█▀██ ██ ██ ██ ██ ██
+██ ██ ▀█▄██ ██ ██ ██ ▀█▄██ ██ ▀████ ████▀ ████▀
+                                    ██         ██
+                                    ▀▀         ▀▀
+)";
+
+static std::atomic<bool> g_is_interrupted = false;
+static bool should_stop() {
+    return g_is_interrupted.load();
+}
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
+static void signal_handler(int) {
+    if (g_is_interrupted.load()) {
+        // second Ctrl+C - exit immediately
+        // make sure to clear colors before exiting (not using LOG or console.cpp here to avoid deadlock)
+        fprintf(stdout, "\033[0m\n");
+        fflush(stdout);
+        std::exit(130);
+    }
+    g_is_interrupted.store(true);
+}
+#endif
+
+struct cli_context {
+    server_context ctx_server;
+    json messages = json::array();
+    std::vector<raw_buffer> input_files;
+    task_params defaults;
+
+    // thread for showing "loading" animation
+    std::atomic<bool> loading_show;
+
+    cli_context(const common_params & params) {
+        defaults.sampling    = params.sampling;
+        defaults.speculative = params.speculative;
+        defaults.n_keep      = params.n_keep;
+        defaults.n_predict   = params.n_predict;
+        defaults.antiprompt  = params.antiprompt;
+
+        defaults.stream            = true; // make sure we always use streaming mode
+        defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way
+        // defaults.return_progress = true; // TODO: show progress
+        defaults.oaicompat_chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    }
+
+    std::string generate_completion(result_timings & out_timings) {
+        server_response_reader rd = ctx_server.get_response_reader();
+        {
+            // TODO: reduce some copies here in the future
+            server_task task = server_task(SERVER_TASK_TYPE_COMPLETION);
+            task.id        = rd.get_new_id();
+            task.index     = 0;
+            task.params    = defaults;    // copy
+            task.cli_input = messages;    // copy
+            task.cli_files = input_files; // copy
+            rd.post_task({std::move(task)});
+        }
+
+        // wait for first result
+        console::spinner::start();
+        server_task_result_ptr result = rd.next(should_stop);
+        console::spinner::stop();
+
+        std::string curr_content;
+        bool is_thinking = false;
+
+        while (result) {
+            if (should_stop()) {
+                break;
+            }
+            if (result->is_error()) {
+                json err_data = result->to_json();
+                if (err_data.contains("message")) {
+                    console::error("Error: %s\n", err_data["message"].get<std::string>().c_str());
+                } else {
+                    console::error("Error: %s\n", err_data.dump().c_str());
+                }
+                return curr_content;
+            }
+            auto res_partial = dynamic_cast<server_task_result_cmpl_partial *>(result.get());
+            if (res_partial) {
+                out_timings = std::move(res_partial->timings);
+                for (const auto & diff : res_partial->oaicompat_msg_diffs) {
+                    if (!diff.content_delta.empty()) {
+                        if (is_thinking) {
+                            console::log("\n[End thinking]\n\n");
+                            console::set_display(DISPLAY_TYPE_RESET);
+                            is_thinking = false;
+                        }
+                        curr_content += diff.content_delta;
+                        console::log("%s", diff.content_delta.c_str());
+                        console::flush();
+                    }
+                    if (!diff.reasoning_content_delta.empty()) {
+                        console::set_display(DISPLAY_TYPE_REASONING);
+                        if (!is_thinking) {
+                            console::log("[Start thinking]\n");
+                        }
+                        is_thinking = true;
+                        console::log("%s", diff.reasoning_content_delta.c_str());
+                        console::flush();
+                    }
+                }
+            }
+            auto res_final = dynamic_cast<server_task_result_cmpl_final *>(result.get());
+            if (res_final) {
+                out_timings = std::move(res_final->timings);
+                break;
+            }
+            result = rd.next(should_stop);
+        }
+        g_is_interrupted.store(false);
+        // server_response_reader automatically cancels pending tasks upon destruction
+        return curr_content;
+    }
+
+    // TODO: support remote files in the future (http, https, etc)
+    std::string load_input_file(const std::string & fname, bool is_media) {
+        std::ifstream file(fname, std::ios::binary);
+        if (!file) {
+            return "";
+        }
+        if (is_media) {
+            raw_buffer buf;
+            buf.assign((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+            input_files.push_back(std::move(buf));
+            return mtmd_default_marker();
+        } else {
+            std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+            return content;
+        }
+    }
+};
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    params.verbosity = LOG_LEVEL_ERROR; // by default, less verbose logs
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CLI)) {
+        return 1;
+    }
+
+    // TODO: maybe support it later?
+    if (params.conversation_mode == COMMON_CONVERSATION_MODE_DISABLED) {
+        console::error("--no-conversation is not supported by llama-cli\n");
+        console::error("please use llama-completion instead\n");
+    }
+
+    common_init();
+
+    // struct that contains llama context and inference
+    cli_context ctx_cli(params);
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // TODO: avoid using atexit() here by making `console` a singleton
+    console::init(params.simple_io, params.use_color);
+    atexit([]() { console::cleanup(); });
+
+    console::set_display(DISPLAY_TYPE_RESET);
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+    struct sigaction sigint_action;
+    sigint_action.sa_handler = signal_handler;
+    sigemptyset (&sigint_action.sa_mask);
+    sigint_action.sa_flags = 0;
+    sigaction(SIGINT,  &sigint_action, NULL);
+    sigaction(SIGTERM, &sigint_action, NULL);
+#elif defined (_WIN32)
+    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
+    };
+    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+#endif
+
+    console::log("\nLoading model... "); // followed by loading animation
+    console::spinner::start();
+    if (!ctx_cli.ctx_server.load_model(params)) {
+        console::spinner::stop();
+        console::error("\nFailed to load the model\n");
+        return 1;
+    }
+
+    ctx_cli.ctx_server.init();
+
+    console::spinner::stop();
+    console::log("\n");
+
+    std::thread inference_thread([&ctx_cli]() {
+        ctx_cli.ctx_server.start_loop();
+    });
+
+    auto inf = ctx_cli.ctx_server.get_info();
+    std::string modalities = "text";
+    if (inf.has_inp_image) {
+        modalities += ", vision";
+    }
+    if (inf.has_inp_audio) {
+        modalities += ", audio";
+    }
+
+    if (!params.system_prompt.empty()) {
+        ctx_cli.messages.push_back({
+            {"role",    "system"},
+            {"content", params.system_prompt}
+        });
+    }
+
+    console::log("\n");
+    console::log("%s\n", LLAMA_ASCII_LOGO);
+    console::log("build      : %s\n", inf.build_info.c_str());
+    console::log("model      : %s\n", inf.model_name.c_str());
+    console::log("modalities : %s\n", modalities.c_str());
+    if (!params.system_prompt.empty()) {
+        console::log("using custom system prompt\n");
+    }
+    console::log("\n");
+    console::log("available commands:\n");
+    console::log("  /exit or Ctrl+C    stop or exit\n");
+    console::log("  /regen             regenerate the last response\n");
+    console::log("  /clear             clear the chat history\n");
+    console::log("  /read              add a text file\n");
+    if (inf.has_inp_image) {
+        console::log("  /image             add an image file\n");
+    }
+    if (inf.has_inp_audio) {
+        console::log("  /audio             add an audio file\n");
+    }
+    console::log("\n");
+
+    // interactive loop
+    std::string cur_msg;
+    while (true) {
+        std::string buffer;
+        console::set_display(DISPLAY_TYPE_USER_INPUT);
+        if (params.prompt.empty()) {
+            console::log("\n> ");
+            std::string line;
+            bool another_line = true;
+            do {
+                another_line = console::readline(line, params.multiline_input);
+                buffer += line;
+            } while (another_line);
+        } else {
+            // process input prompt from args
+            for (auto & fname : params.image) {
+                std::string marker = ctx_cli.load_input_file(fname, true);
+                if (marker.empty()) {
+                    console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str());
+                    break;
+                }
+                console::log("Loaded media from '%s'\n", fname.c_str());
+                cur_msg += marker;
+            }
+            buffer = params.prompt;
+            if (buffer.size() > 500) {
+                console::log("\n> %s ... (truncated)\n", buffer.substr(0, 500).c_str());
+            } else {
+                console::log("\n> %s\n", buffer.c_str());
+            }
+            params.prompt.clear(); // only use it once
+        }
+        console::set_display(DISPLAY_TYPE_RESET);
+        console::log("\n");
+
+        if (should_stop()) {
+            g_is_interrupted.store(false);
+            break;
+        }
+
+        // remove trailing newline
+        if (!buffer.empty() && buffer.back() == '\n') {
+            buffer.pop_back();
+        }
+
+        // skip empty messages
+        if (buffer.empty()) {
+            continue;
+        }
+
+        bool add_user_msg = true;
+
+        // process commands
+        if (string_starts_with(buffer, "/exit")) {
+            break;
+        } else if (string_starts_with(buffer, "/regen")) {
+            if (ctx_cli.messages.size() >= 2) {
+                size_t last_idx = ctx_cli.messages.size() - 1;
+                ctx_cli.messages.erase(last_idx);
+                add_user_msg = false;
+            } else {
+                console::error("No message to regenerate.\n");
+                continue;
+            }
+        } else if (string_starts_with(buffer, "/clear")) {
+            ctx_cli.messages.clear();
+            ctx_cli.input_files.clear();
+            console::log("Chat history cleared.\n");
+            continue;
+        } else if (
+                (string_starts_with(buffer, "/image ") && inf.has_inp_image) ||
+                (string_starts_with(buffer, "/audio ") && inf.has_inp_audio)) {
+            // just in case (bad copy-paste for example), we strip all trailing/leading spaces
+            std::string fname = string_strip(buffer.substr(7));
+            std::string marker = ctx_cli.load_input_file(fname, true);
+            if (marker.empty()) {
+                console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str());
+                continue;
+            }
+            cur_msg += marker;
+            console::log("Loaded media from '%s'\n", fname.c_str());
+            continue;
+        } else if (string_starts_with(buffer, "/read ")) {
+            std::string fname = string_strip(buffer.substr(6));
+            std::string marker = ctx_cli.load_input_file(fname, false);
+            if (marker.empty()) {
+                console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str());
+                continue;
+            }
+            cur_msg += marker;
+            console::log("Loaded text from '%s'\n", fname.c_str());
+            continue;
+        } else {
+            // not a command
+            cur_msg += buffer;
+        }
+
+        // generate response
+        if (add_user_msg) {
+            ctx_cli.messages.push_back({
+                {"role",    "user"},
+                {"content", cur_msg}
+            });
+            cur_msg.clear();
+        }
+        result_timings timings;
+        std::string assistant_content = ctx_cli.generate_completion(timings);
+        ctx_cli.messages.push_back({
+            {"role",    "assistant"},
+            {"content", assistant_content}
+        });
+        console::log("\n");
+
+        if (params.show_timings) {
+            console::set_display(DISPLAY_TYPE_INFO);
+            console::log("\n");
+            console::log("[ Prompt: %.1f t/s | Generation: %.1f t/s ]\n", timings.prompt_per_second, timings.predicted_per_second);
+            console::set_display(DISPLAY_TYPE_RESET);
+        }
+
+        if (params.single_turn) {
+            break;
+        }
+    }
+
+    console::set_display(DISPLAY_TYPE_RESET);
+
+    console::log("\nExiting...\n");
+    ctx_cli.ctx_server.terminate();
+    inference_thread.join();
+
+    // bump the log level to display timings
+    common_log_set_verbosity_thold(LOG_LEVEL_INFO);
+    llama_memory_breakdown_print(ctx_cli.ctx_server.get_llama_context());
+
+    return 0;
+}
diff --git a/tools/main/main.cpp b/tools/completion/completion.cpp
similarity index 98%
rename from tools/main/main.cpp
rename to tools/completion/completion.cpp
index 02889bd19..426e0436e 100644
--- a/tools/main/main.cpp
+++ b/tools/completion/completion.cpp
@@ -87,7 +87,7 @@ static void sigint_handler(int signo) {
 int main(int argc, char ** argv) {
     common_params params;
     g_params = &params;
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMPLETION, print_usage)) {
         return 1;
     }

@@ -522,12 +522,6 @@ int main(int argc, char ** argv) {
         is_interacting = params.interactive_first;
     }

-    LOG_WRN("*****************************\n");
-    LOG_WRN("IMPORTANT: The current llama-cli will be moved to llama-completion in the near future\n");
-    LOG_WRN("           New llama-cli will have enhanced features and improved user experience\n");
-    LOG_WRN("           More info: https://github.com/ggml-org/llama.cpp/discussions/17618\n");
-    LOG_WRN("*****************************\n");
-
    bool is_antiprompt = false;
    bool input_echo    = true;
    bool display       = true;
@@ -544,7 +538,7 @@ int main(int argc, char ** argv) {
     std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode

     // the first thing we will do is to output the prompt, so set color accordingly
-    console::set_display(console::prompt);
+    console::set_display(DISPLAY_TYPE_PROMPT);
     display = params.display_prompt;

     std::vector<llama_token> embd;
@@ -589,9 +583,9 @@ int main(int argc, char ** argv) {
                 const int skipped_tokens = (int) embd.size() - max_embd_size;
                 embd.resize(max_embd_size);

-                console::set_display(console::error);
+                console::set_display(DISPLAY_TYPE_ERROR);
                 LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
-                console::set_display(console::reset);
+                console::set_display(DISPLAY_TYPE_RESET);
             }

             if (ga_n == 1) {
@@ -773,7 +767,7 @@ int main(int argc, char ** argv) {

             // reset color to default if there is no pending user input
             if (input_echo && (int) embd_inp.size() == n_consumed) {
-                console::set_display(console::reset);
+                console::set_display(DISPLAY_TYPE_RESET);
                 display = true;
             }

@@ -869,7 +863,7 @@ int main(int argc, char ** argv) {
                 }

                 // color user input only
-                console::set_display(console::user_input);
+                console::set_display(DISPLAY_TYPE_USER_INPUT);
                 display = params.display_prompt;

                 std::string line;
@@ -880,7 +874,7 @@ int main(int argc, char ** argv) {
                 } while (another_line);

                 // done taking input, reset color
-                console::set_display(console::reset);
+                console::set_display(DISPLAY_TYPE_RESET);
                 display = true;

                 if (buffer.empty()) { // Ctrl+D on empty line exits
diff --git a/tools/gguf-split/tests.sh b/tools/gguf-split/tests.sh
index e8677018f..c8dd0b007 100755
--- a/tools/gguf-split/tests.sh
+++ b/tools/gguf-split/tests.sh
@@ -19,7 +19,7 @@ fi
 set -x

 SPLIT=$1/llama-gguf-split
-MAIN=$1/llama-cli
+MAIN=$1/llama-completion
 WORK_PATH=$TMP_DIR/gguf-split
 ROOT_DIR=$(realpath $(dirname $0)/../../)

diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index b5bbc6536..a75af406c 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -310,6 +310,9 @@ int main(int argc, char ** argv) {

     if (g_is_interrupted) return 130;

+    LOG_WRN("WARN: This is an experimental CLI for testing multimodal capability.\n");
+    LOG_WRN("      For normal use cases, please use the standard llama-cli\n");
+
     if (is_single_turn) {
         g_is_generating = true;
         if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
@@ -349,11 +352,11 @@ int main(int argc, char ** argv) {
         while (!g_is_interrupted) {
             g_is_generating = false;
             LOG("\n> ");
-            console::set_display(console::user_input);
+            console::set_display(DISPLAY_TYPE_USER_INPUT);
             std::string line;
             console::readline(line, false);
             if (g_is_interrupted) break;
-            console::set_display(console::reset);
+            console::set_display(DISPLAY_TYPE_RESET);
             line = string_strip(line);
             if (line.empty()) {
                 continue;
diff --git a/tools/quantize/tests.sh b/tools/quantize/tests.sh
index acc54fd9b..2cae588e9 100644
--- a/tools/quantize/tests.sh
+++ b/tools/quantize/tests.sh
@@ -20,7 +20,7 @@ set -x

 SPLIT=$1/llama-gguf-split
 QUANTIZE=$1/llama-quantize
-MAIN=$1/llama-cli
+MAIN=$1/llama-completion
 WORK_PATH=$TMP_DIR/quantize
 ROOT_DIR=$(realpath $(dirname $0)/../../)

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 4578f8d7a..5a67f508d 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -1474,6 +1474,44 @@ struct server_context_impl {
     // Functions to process the task
     //

+    // tokenize the input if it's set by CLI, return false on error
+    bool tokenize_cli_input(server_task & task) {
+        if (task.cli_input == nullptr) {
+            return true; // nothing to do
+        }
+        try {
+            auto & opt = oai_parser_opt;
+            common_chat_templates_inputs inputs;
+            inputs.messages              = common_chat_msgs_parse_oaicompat(task.cli_input);
+            inputs.tools                 = {}; // TODO
+            inputs.tool_choice           = COMMON_CHAT_TOOL_CHOICE_NONE;
+            inputs.json_schema           = ""; // TODO
+            inputs.grammar               = ""; // TODO
+            inputs.use_jinja             = opt.use_jinja;
+            inputs.parallel_tool_calls   = false;
+            inputs.add_generation_prompt = true;
+            inputs.reasoning_format      = opt.reasoning_format;
+            inputs.enable_thinking       = opt.enable_thinking;
+
+            // Apply chat template to the list of messages
+            auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);
+
+            // tokenize the resulting prompt
+            auto & prompt = chat_params.prompt;
+            if (mctx != nullptr) {
+                task.tokens = process_mtmd_prompt(mctx, prompt, task.cli_files);
+            } else {
+                task.tokens = std::move(tokenize_input_prompts(vocab, mctx, prompt, true, true)[0]);
+            }
+            task.cli_input.clear();
+            task.cli_files.clear();
+        } catch (const std::exception & e) {
+            send_error(task, std::string("Failed to format input: ") + e.what(), ERROR_TYPE_INVALID_REQUEST);
+            return false;
+        }
+        return true;
+    }
+
     void process_single_task(server_task && task) {
         switch (task.type) {
             case SERVER_TASK_TYPE_COMPLETION:
@@ -1481,6 +1519,10 @@ struct server_context_impl {
             case SERVER_TASK_TYPE_EMBEDDING:
             case SERVER_TASK_TYPE_RERANK:
                 {
+                    if (!tokenize_cli_input(task)) {
+                        break;
+                    }
+
                     const int id_slot = task.id_slot;

                     server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
@@ -1690,7 +1732,6 @@ struct server_context_impl {
                     res->id = task.id;
                     queue_results.send(std::move(res));
                 } break;
-            }
         }
     }

@@ -2626,6 +2667,15 @@ server_response_reader server_context::get_response_reader() {
     return impl->get_response_reader();
 }

+server_context_info server_context::get_info() const {
+    return server_context_info {
+        /* build_info    */ build_info,
+        /* model_name    */ impl->model_name,
+        /* has_inp_image */ impl->oai_parser_opt.allow_image,
+        /* has_inp_audio */ impl->oai_parser_opt.allow_audio,
+    };
+}
+
 //
 // generator-like API for HTTP response generation
 //
diff --git a/tools/server/server-context.h b/tools/server/server-context.h
index eaa138087..230b25952 100644
--- a/tools/server/server-context.h
+++ b/tools/server/server-context.h
@@ -9,6 +9,13 @@

 struct server_context_impl; // private implementation

+struct server_context_info {
+    std::string build_info;
+    std::string model_name;
+    bool has_inp_image;
+    bool has_inp_audio;
+};
+
 struct server_context {
     std::unique_ptr<server_context_impl> impl;

@@ -33,6 +40,10 @@ struct server_context {

     // get a new response reader, used by CLI application
     server_response_reader get_response_reader();
+
+    // get server info
+    // used by CLI application
+    server_context_info get_info() const;
 };
diff --git a/tools/server/server-queue.h b/tools/server/server-queue.h
index 726eadf4e..8780d7fe1 100644
--- a/tools/server/server-queue.h
+++ b/tools/server/server-queue.h
@@ -135,7 +135,10 @@ struct server_response_reader {
         stop();
     }

-    void post_task(server_task && tasks);
+    int get_new_id() {
+        return queue_tasks.get_new_id();
+    }
+    void post_task(server_task && task);
     void post_tasks(std::vector<server_task> && tasks);
     bool has_next() const;
diff --git a/tools/server/server-task.h b/tools/server/server-task.h
index 9011ff944..0759094a0 100644
--- a/tools/server/server-task.h
+++ b/tools/server/server-task.h
@@ -120,6 +120,10 @@ struct server_task {
     task_params params;
     server_tokens tokens;

+    // only used by CLI, this delegates the tokenization to the server
+    json cli_input = nullptr;
+    std::vector<raw_buffer> cli_files;
+
     server_task_type type;

     // used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE