Merge branch 'upstream' into concedo_experimental

# Conflicts:
# .github/workflows/build.yml
# .github/workflows/release.yml
# .github/workflows/server.yml
# README.md
# docs/build.md
# docs/install.md
# ggml/src/ggml-cpu/CMakeLists.txt
# ggml/src/ggml-opencl/CMakeLists.txt
# ggml/src/ggml-opencl/ggml-opencl.cpp
# ggml/src/ggml-sycl/ggml-sycl.cpp
# ggml/src/ggml-sycl/mmvq.cpp
# ggml/src/ggml-sycl/vecdotq.hpp
# tests/test-backend-ops.cpp
# tests/test-chat.cpp
2026-05-21 18:52:02 +00:00 · 2025-06-05 11:03:34 +08:00 · 2025-06-05 11:03:34 +08:00 · bc89b465a8
commit bc89b465a8
parent a341188f84 0d3984424f
35 changed files with 1070 additions and 288 deletions
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@ -70,6 +70,7 @@ struct mtmd_cli_context {
    llama_model       * model;
    llama_context     * lctx;
    const llama_vocab * vocab;
+    common_sampler    * smpl;
    llama_batch         batch;
    int                 n_batch;

@ -89,8 +90,9 @@ struct mtmd_cli_context {
        model = llama_init.model.get();
        lctx = llama_init.context.get();
        vocab = llama_model_get_vocab(model);
+        smpl = common_sampler_init(model, params.sampling);
        n_threads = params.cpuparams.n_threads;
-        batch = llama_batch_init(params.n_batch, 0, 1);
+        batch = llama_batch_init(1, 0, 1); // batch for next token generation
        n_batch = params.n_batch;

        if (!model || !lctx) {
@ -118,6 +120,11 @@ struct mtmd_cli_context {
        }
    }

+    ~mtmd_cli_context() {
+        llama_batch_free(batch);
+        common_sampler_free(smpl);
+    }
+
    void init_vision_context(common_params & params) {
        const char * clip_path = params.mmproj.path.c_str();
        mtmd_context_params mparams = mtmd_context_params_default();
@ -153,7 +160,7 @@ struct mtmd_cli_context {
    }
 };

-static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int n_predict) {
+static int generate_response(mtmd_cli_context & ctx, int n_predict) {
    llama_tokens generated_tokens;
    for (int i = 0; i < n_predict; i++) {
        if (i > n_predict || !g_is_generating || g_is_interrupted) {
@ -161,9 +168,9 @@ static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int
            break;
        }

-        llama_token token_id = common_sampler_sample(smpl, ctx.lctx, -1);
+        llama_token token_id = common_sampler_sample(ctx.smpl, ctx.lctx, -1);
        generated_tokens.push_back(token_id);
-        common_sampler_accept(smpl, token_id, true);
+        common_sampler_accept(ctx.smpl, token_id, true);

        if (llama_vocab_is_eog(ctx.vocab, token_id) || ctx.check_antiprompt(generated_tokens)) {
            LOG("\n");
@ -261,7 +268,6 @@ int main(int argc, char ** argv) {

    bool is_single_turn = !params.prompt.empty() && !params.image.empty();

-    struct common_sampler * smpl = common_sampler_init(ctx.model, params.sampling);
    int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict;

    // Ctrl+C handling
@ -300,7 +306,7 @@ int main(int argc, char ** argv) {
        if (eval_message(ctx, msg, true)) {
            return 1;
        }
-        if (!g_is_interrupted && generate_response(ctx, smpl, n_predict)) {
+        if (!g_is_interrupted && generate_response(ctx, n_predict)) {
            return 1;
        }

@ -366,7 +372,7 @@ int main(int argc, char ** argv) {
                return 1;
            }
            if (g_is_interrupted) break;
-            if (generate_response(ctx, smpl, n_predict)) {
+            if (generate_response(ctx, n_predict)) {
                return 1;
            }
            content.clear();
--- a/tools/mtmd/mtmd-helper.cpp
+++ b/tools/mtmd/mtmd-helper.cpp
@ -311,6 +311,7 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
        GGML_ABORT("chunk type not supported");
    }

+    llama_batch_free(text_batch);
    return 0;
 }

--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@ -360,7 +360,7 @@ struct server_task {
                params.oaicompat_chat_syntax.format = defaults.oaicompat_chat_syntax.format;
            }
            params.oaicompat_chat_syntax.reasoning_format = params_base.reasoning_format;
-            params.oaicompat_chat_syntax.reasoning_in_content = params.stream;
+            params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (params_base.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
            params.oaicompat_chat_syntax.thinking_forced_open = json_value(data, "thinking_forced_open", false);
            params.oaicompat_chat_syntax.parse_tool_calls = json_value(data, "parse_tool_calls", false);
        }
@ -2016,6 +2016,11 @@ struct server_context {
                params_base.n_cache_reuse = 0;
                SRV_WRN("%s\n", "cache_reuse is not supported by this context, it will be disabled");
            }
+
+            if (!params_base.speculative.model.path.empty()) {
+                SRV_ERR("%s\n", "err: speculative decode is not supported by this context");
+                return false;
+            }
        }

        return true;
@ -3203,9 +3208,7 @@ struct server_context {
                                }
                            } else {
                                // if we don't cache the prompt, we have to remove the entire KV cache
-                                llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
                                slot.n_past = 0;
-                                slot.cache_tokens.clear(); // TODO: not needed, will be cleared later via "keep_first()"
                            }

                            if (slot.n_past > 0 && slot.n_past < (int) slot.cache_tokens.size()) {
@ -3220,7 +3223,6 @@ struct server_context {
                                    SLT_WRN(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", slot.n_past, (int) slot.cache_tokens.size(), slot.id, pos_min, n_swa);
                                    SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA, see %s)\n",
                                            "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
-                                    llama_kv_self_seq_rm(ctx, slot.id, 0, -1);
                                    slot.n_past = 0;
                                }
                            }
--- a/tools/server/tests/unit/test_tool_call.py
+++ b/tools/server/tests/unit/test_tool_call.py
@ -499,13 +499,12 @@ def do_test_calc_result(server: ServerProcess, result_override: str | None, n_pr


@pytest.mark.slow
-@pytest.mark.parametrize("n_predict,reasoning_format,stream,expect_reasoning_content,expect_content,hf_repo,template_override", [
-    (128, 'deepseek',   CompletionMode.NORMAL,   None, "^The sum of 102 and 7 is 109[\\s\\S]*",                                       "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",       None),
-    (128,  None,        CompletionMode.NORMAL,   None, "^The sum of 102 and 7 is 109[\\s\\S]*",                                       "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",       None),
-    (1024, 'deepseek',  CompletionMode.NORMAL,   "I need to calculate the sum of 102 and 7[\\s\\S]*", "To find the sum of[\\s\\S]*",  "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
-    (1024, 'deepseek',  CompletionMode.STREAMED, None, "^<think>I need to calculate [\\s\\S]*?</think>To find the sum of [\\s\\S]*",  "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
-    (1024, 'deepseek',  CompletionMode.NORMAL,   "First, I [\\s\\S]*", "To find the sum of[\\s\\S]*",                                 "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
-    (1024, 'deepseek',  CompletionMode.STREAMED, None, "^<think>First, I [\\s\\S]*?</think>To find the sum of[\\s\\S]*",              "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
+@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
+@pytest.mark.parametrize("n_predict,reasoning_format,expect_reasoning_content,expect_content,hf_repo,template_override", [
+    (128, 'deepseek',   None, "^The sum of 102 and 7 is 109[\\s\\S]*",                                       "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",       None),
+    (128,  None,        None, "^The sum of 102 and 7 is 109[\\s\\S]*",                                       "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M",       None),
+    (1024, 'deepseek',  "I need to calculate the sum of 102 and 7[\\s\\S]*", "To find the sum of[\\s\\S]*",  "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
+    (1024, 'deepseek',  "First, I [\\s\\S]*", "To find the sum of[\\s\\S]*",                                 "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
    # (1024, 'none',      CompletionMode.NORMAL,   None, "^(<think>\\s*)?I need[\\s\\S]*?</think>\\s*To find[\\s\\S]*",                 "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
    # (128,  'deepseek',  None, "^Okay, let me figure out the sum of 102 and 7[\\s\\S]*",                      "bartowski/Qwen_QwQ-32B-GGUF:Q4_K_M",                None),
 ])
--- a/tools/server/tests/utils.py
+++ b/tools/server/tests/utils.py
@ -308,10 +308,12 @@ class ServerProcess:
        stream = data.get('stream', False)
        if stream:
            content: list[str] = []
+            reasoning_content: list[str] = []
            tool_calls: list[dict] = []
            finish_reason: Optional[str] = None

            content_parts = 0
+            reasoning_content_parts = 0
            tool_call_parts = 0
            arguments_parts = 0

@ -322,6 +324,10 @@ class ServerProcess:
                    assert len(choice['delta']['content']) > 0, f'Expected non empty content delta!'
                    content.append(choice['delta']['content'])
                    content_parts += 1
+                if choice['delta'].get('reasoning_content') is not None:
+                    assert len(choice['delta']['reasoning_content']) > 0, f'Expected non empty reasoning_content delta!'
+                    reasoning_content.append(choice['delta']['reasoning_content'])
+                    reasoning_content_parts += 1
                if choice['delta'].get('finish_reason') is not None:
                    finish_reason = choice['delta']['finish_reason']
                for tc in choice['delta'].get('tool_calls', []):
@ -349,8 +355,10 @@ class ServerProcess:
                        tool_call['function']['name'] = tool_call['function'].get('name', '') + fct['name']
                    if fct.get('arguments') is not None:
                        tool_call['function']['arguments'] += fct['arguments']
+                        arguments_parts += 1
+                    tool_call_parts += 1

-            print(f'Streamed response had {content_parts} content parts, {tool_call_parts} tool call parts incl. {arguments_parts} arguments parts')
+            print(f'Streamed response had {content_parts} content parts, {reasoning_content_parts} reasoning_content parts, {tool_call_parts} tool call parts incl. {arguments_parts} arguments parts')
            result = dict(
                choices=[
                    dict(
@ -359,6 +367,7 @@ class ServerProcess:
                        message=dict(
                            role='assistant',
                            content=''.join(content) if content else None,
+                            reasoning_content=''.join(reasoning_content) if reasoning_content else None,
                            tool_calls=tool_calls if tool_calls else None,
                        ),
                    )