From e877ad8bd9e2f147d8f8244511b852cdade27953 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 22 Feb 2026 08:07:46 +0100 Subject: [PATCH 1/9] ci : fix rocm release path [no ci] (#19784) --- .github/workflows/release.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 2addbb660..ddd1ece3f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -609,7 +609,7 @@ jobs: - name: Upload artifacts uses: actions/upload-artifact@v6 with: - path: llama-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz + path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz name: llama-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz windows-hip: @@ -965,7 +965,7 @@ jobs: **Linux:** - [Ubuntu x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.tar.gz) - [Ubuntu x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz) - - [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-x64.tar.gz) + - [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz) - [Ubuntu s390x (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-s390x.tar.gz) **Windows:** From 34ec1c3f182712302f55deca023e465a18a4897c Mon Sep 17 00:00:00 2001 From: Aldehir Rojas Date: Sun, 22 Feb 2026 07:11:31 -0600 Subject: [PATCH 2/9] server : merge contiguous Responses input items into a single assistant message (#19773) * server : merge contiguous input items into a single assistant message * cont : simplify tool call msg * cont : reduce and combine content * cont : fix merging content items --- tools/server/server-common.cpp | 84 ++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 40 deletions(-) diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index d717fb669..88b6e77d8 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -1105,6 +1105,8 @@ json convert_responses_to_chatcmpl(const json & response_body) { }; for (json item : input_value) { + bool merge_prev = !chatcmpl_messages.empty() && chatcmpl_messages.back().value("role", "") == "assistant"; + if (exists_and_is_string(item, "content")) { // #responses_create-input-input_item_list-input_message-content-text_input // Only "Input message" contains item["content"]::string @@ -1193,7 +1195,7 @@ json convert_responses_to_chatcmpl(const json & response_body) { item.at("type") == "message" ) { // #responses_create-input-input_item_list-item-output_message - std::vector chatcmpl_content; + auto chatcmpl_content = json::array(); for (const auto & output_text : item.at("content")) { const std::string type = json_value(output_text, "type", std::string()); @@ -1210,10 +1212,19 @@ json convert_responses_to_chatcmpl(const json & response_body) { }); } - item.erase("status"); - item.erase("type"); - item["content"] = chatcmpl_content; - chatcmpl_messages.push_back(item); + if (merge_prev) { + 
auto & prev_msg = chatcmpl_messages.back(); + if (!exists_and_is_array(prev_msg, "content")) { + prev_msg["content"] = json::array(); + } + auto & prev_content = prev_msg["content"]; + prev_content.insert(prev_content.end(), chatcmpl_content.begin(), chatcmpl_content.end()); + } else { + item.erase("status"); + item.erase("type"); + item["content"] = chatcmpl_content; + chatcmpl_messages.push_back(item); + } } else if (exists_and_is_string(item, "arguments") && exists_and_is_string(item, "call_id") && exists_and_is_string(item, "name") && @@ -1221,24 +1232,27 @@ json convert_responses_to_chatcmpl(const json & response_body) { item.at("type") == "function_call" ) { // #responses_create-input-input_item_list-item-function_tool_call - json msg = json { - {"role", "assistant"}, - {"tool_calls", json::array({ json { - {"function", json { - {"arguments", item.at("arguments")}, - {"name", item.at("name")}, - }}, - {"id", item.at("call_id")}, - {"type", "function"}, - }})}, + json tool_call = { + {"function", json { + {"arguments", item.at("arguments")}, + {"name", item.at("name")}, + }}, + {"id", item.at("call_id")}, + {"type", "function"}, }; - if (!chatcmpl_messages.empty() && chatcmpl_messages.back().contains("reasoning_content")) { - // Move reasoning content from dummy message to tool call message - msg["reasoning_content"] = chatcmpl_messages.back().at("reasoning_content"); - chatcmpl_messages.pop_back(); + if (merge_prev) { + auto & prev_msg = chatcmpl_messages.back(); + if (!exists_and_is_array(prev_msg, "tool_calls")) { + prev_msg["tool_calls"] = json::array(); + } + prev_msg["tool_calls"].push_back(tool_call); + } else { + chatcmpl_messages.push_back(json { + {"role", "assistant"}, + {"tool_calls", json::array({tool_call})} + }); } - chatcmpl_messages.push_back(msg); } else if (exists_and_is_string(item, "call_id") && (exists_and_is_string(item, "output") || exists_and_is_array(item, "output")) && exists_and_is_string(item, "type") && @@ -1282,12 +1296,16 @@ json convert_responses_to_chatcmpl(const json & response_body) { throw std::invalid_argument("item['content']['text'] is not a string"); } - // Pack reasoning content in dummy message - chatcmpl_messages.push_back(json { - {"role", "assistant"}, - {"content", json::array()}, - {"reasoning_content", item.at("content")[0].at("text")}, - }); + if (merge_prev) { + auto & prev_msg = chatcmpl_messages.back(); + prev_msg["reasoning_content"] = item.at("content")[0].at("text"); + } else { + chatcmpl_messages.push_back(json { + {"role", "assistant"}, + {"content", json::array()}, + {"reasoning_content", item.at("content")[0].at("text")}, + }); + } } else { throw std::invalid_argument("Cannot determine type of 'item'"); } @@ -1296,20 +1314,6 @@ json convert_responses_to_chatcmpl(const json & response_body) { throw std::invalid_argument("'input' must be a string or array of objects"); } - // Remove unused dummy message which contains - // reasoning content not followed by tool call - chatcmpl_messages.erase(std::remove_if( - chatcmpl_messages.begin(), - chatcmpl_messages.end(), - [](const json & x){ return x.contains("role") && - x.at("role") == "assistant" && - x.contains("content") && - x.at("content") == json::array() && - x.contains("reasoning_content"); - }), - chatcmpl_messages.end() - ); - chatcmpl_body["messages"] = chatcmpl_messages; if (response_body.contains("tools")) { From 9f0684f003f0feae3436293d9d9686f190105729 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 22 Feb 2026 16:14:37 +0100 Subject: 
[PATCH 3/9] ci : fix rocm archive name [no ci] (#19808) --- .github/workflows/release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ddd1ece3f..860acc6b1 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -604,7 +604,7 @@ jobs: id: pack_artifacts run: | cp LICENSE ./build/bin/ - tar -czvf llama-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . - name: Upload artifacts uses: actions/upload-artifact@v6 From ae2368e74eb2c280629ce6cf80edb88d72d23495 Mon Sep 17 00:00:00 2001 From: HelloKS Date: Mon, 23 Feb 2026 00:15:02 +0900 Subject: [PATCH 4/9] model : add Kanana-2 model support (#19803) * model: Add Kanana-2 model support * lint: adjust spacing --- convert_hf_to_gguf.py | 3 +++ convert_hf_to_gguf_update.py | 1 + src/llama-model.cpp | 4 ++-- src/llama-vocab.cpp | 3 ++- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 31acd5bb4..e03810959 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1274,6 +1274,9 @@ class TextModel(ModelBase): if chkhsh == "b4b8ca1f9769494fbd956ebc4c249de6131fb277a4a3345a7a92c7dd7a55808d": # ref: https://huggingface.co/jdopensource/JoyAI-LLM-Flash res = "joyai-llm" + if chkhsh == "e4d54df1ebc1f2b91acd986c5b51aa50837d5faf7c7398e73c1f9e9ee5d19869": + # ref: https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601 + res = "kanana2" if res is None: logger.warning("\n") diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 8f7443d1b..53a73759e 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -152,6 +152,7 @@ models = [ {"name": "exaone-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B", }, {"name": "qwen35", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3.5-9B-Instruct", }, {"name": "joyai-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jdopensource/JoyAI-LLM-Flash", }, + {"name": "kanana2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", }, ] # some models are known to be broken upstream, so we will skip them as exceptions diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 764839b9b..c93e29555 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1703,8 +1703,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { } break; case LLM_ARCH_DEEPSEEK2: { - // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B - const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26); + // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B, Kanana-2-30B-A3B + const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256)); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 69b25a1bf..9c118eab7 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -2027,7 +2027,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { 
pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2; } else if ( tokenizer_pre == "gpt-4o" || - tokenizer_pre == "llama4") { + tokenizer_pre == "llama4" || + tokenizer_pre == "kanana2") { pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O; clean_spaces = false; } else if ( From cacc371f99fb3b5b431d3fa89ac0c752bbd62a3b Mon Sep 17 00:00:00 2001 From: Kilian Krampf Date: Sun, 22 Feb 2026 16:26:33 +0100 Subject: [PATCH 5/9] Fix wrong cli-argument in documentation (#19804) --- tools/server/webui/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/server/webui/README.md b/tools/server/webui/README.md index 98b01fdcd..6fc908e27 100644 --- a/tools/server/webui/README.md +++ b/tools/server/webui/README.md @@ -101,7 +101,7 @@ In a separate terminal, start the backend server: ./llama-server -m model.gguf # Multi-model (ROUTER mode) -./llama-server --model-store /path/to/models +./llama-server --models-dir /path/to/models ``` ### 3. Start Development Servers From ed4837891d3a142d8806c3879afb5752f1254e98 Mon Sep 17 00:00:00 2001 From: Aldehir Rojas Date: Sun, 22 Feb 2026 10:34:54 -0600 Subject: [PATCH 6/9] common : fix improper trimming in XML parser on complete message (#19805) Co-authored-by: Jules LEIDELINGER <11395311+julio75012@users.noreply.github.com> --- common/chat-parser-xml-toolcall.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/chat-parser-xml-toolcall.cpp b/common/chat-parser-xml-toolcall.cpp index a80900ff8..ba359fdbf 100644 --- a/common/chat-parser-xml-toolcall.cpp +++ b/common/chat-parser-xml-toolcall.cpp @@ -803,7 +803,7 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons } // remove potential partial suffix - if (builder.pos() == builder.input().size()) { + if (builder.pos() == builder.input().size() && builder.is_partial()) { if (unclosed_reasoning_content.empty()) { rstrip(content); trim_potential_partial_word(content); From 5452d736f80efae2062d60e9392ad9225ac227ba Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Sun, 22 Feb 2026 21:08:23 +0100 Subject: [PATCH 7/9] jinja: correct stats for tojson and string filters (#19785) --- common/jinja/runtime.cpp | 15 +++++----- common/jinja/value.cpp | 32 ++++++++++++++++++++++ common/jinja/value.h | 2 ++ tests/test-jinja.cpp | 59 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 101 insertions(+), 7 deletions(-) diff --git a/common/jinja/runtime.cpp b/common/jinja/runtime.cpp index cc012c892..c93e182a7 100644 --- a/common/jinja/runtime.cpp +++ b/common/jinja/runtime.cpp @@ -85,7 +85,7 @@ value identifier::execute_impl(context & ctx) { auto builtins = global_builtins(); if (!it->is_undefined()) { if (ctx.is_get_stats) { - it->stats.used = true; + value_t::stats_t::mark_used(it); } JJ_DEBUG("Identifier '%s' found, type = %s", val.c_str(), it->type().c_str()); return it; @@ -277,7 +277,7 @@ value binary_expression::execute_impl(context & ctx) { static value try_builtin_func(context & ctx, const std::string & name, value & input, bool undef_on_missing = false) { JJ_DEBUG("Trying built-in function '%s' for type %s", name.c_str(), input->type().c_str()); if (ctx.is_get_stats) { - input->stats.used = true; + value_t::stats_t::mark_used(input); input->stats.ops.insert(name); } auto builtins = input->get_builtins(); @@ -448,7 +448,7 @@ value for_statement::execute_impl(context & ctx) { // mark the variable being iterated as used for stats if (ctx.is_get_stats) { - iterable_val->stats.used = true; + value_t::stats_t::mark_used(iterable_val); 
iterable_val->stats.ops.insert("array_access"); } @@ -470,7 +470,7 @@ value for_statement::execute_impl(context & ctx) { items.push_back(std::move(tuple)); } if (ctx.is_get_stats) { - iterable_val->stats.used = true; + value_t::stats_t::mark_used(iterable_val); iterable_val->stats.ops.insert("object_access"); } } else { @@ -480,7 +480,7 @@ value for_statement::execute_impl(context & ctx) { items.push_back(item); } if (ctx.is_get_stats) { - iterable_val->stats.used = true; + value_t::stats_t::mark_used(iterable_val); iterable_val->stats.ops.insert("array_access"); } } @@ -817,8 +817,9 @@ value member_expression::execute_impl(context & ctx) { } if (ctx.is_get_stats && val && object && property) { - val->stats.used = true; - object->stats.used = true; + value_t::stats_t::mark_used(val); + value_t::stats_t::mark_used(object); + value_t::stats_t::mark_used(property); if (is_val(property)) { object->stats.ops.insert("array_access"); } else if (is_val(property)) { diff --git a/common/jinja/value.cpp b/common/jinja/value.cpp index 9987836d1..749113124 100644 --- a/common/jinja/value.cpp +++ b/common/jinja/value.cpp @@ -161,6 +161,11 @@ static value tojson(const func_args & args) { value val_separators = args.get_kwarg_or_pos("separators", 3); value val_sort = args.get_kwarg_or_pos("sort_keys", 4); int indent = -1; + if (args.ctx.is_get_stats) { + // mark as used (recursively) for stats + auto val_input = args.get_pos(0); + value_t::stats_t::mark_used(const_cast(val_input), true); + } if (is_val(val_indent)) { indent = static_cast(val_indent->as_int()); } @@ -891,6 +896,11 @@ const func_builtins & value_array_t::get_builtins() const { }}, {"string", [](const func_args & args) -> value { args.ensure_vals(); + if (args.ctx.is_get_stats) { + // mark as used (recursively) for stats + auto val_input = args.get_pos(0); + value_t::stats_t::mark_used(const_cast(val_input), true); + } return mk_val(args.get_pos(0)->as_string()); }}, {"tojson", tojson}, @@ -1046,6 +1056,11 @@ const func_builtins & value_object_t::get_builtins() const { {"tojson", tojson}, {"string", [](const func_args & args) -> value { args.ensure_vals(); + if (args.ctx.is_get_stats) { + // mark as used (recursively) for stats + auto val_input = args.get_pos(0); + value_t::stats_t::mark_used(const_cast(val_input), true); + } return mk_val(args.get_pos(0)->as_string()); }}, {"length", [](const func_args & args) -> value { @@ -1358,4 +1373,21 @@ std::string value_to_string_repr(const value & val) { } } +// stats utility +void value_t::stats_t::mark_used(value & val, bool deep) { + val->stats.used = true; + if (deep) { + if (is_val(val)) { + for (auto & item : val->val_arr) { + mark_used(item, deep); + } + } else if (is_val(val)) { + for (auto & pair : val->val_obj) { + mark_used(pair.first, deep); + mark_used(pair.second, deep); + } + } + } +} + } // namespace jinja diff --git a/common/jinja/value.h b/common/jinja/value.h index 1c04760a0..07e447ff6 100644 --- a/common/jinja/value.h +++ b/common/jinja/value.h @@ -118,6 +118,8 @@ struct value_t { bool used = false; // ops can be builtin calls or operators: "array_access", "object_access" std::set ops; + // utility to recursively mark value and its children as used + static void mark_used(value & val, bool deep = false); } stats; value_t() = default; diff --git a/tests/test-jinja.cpp b/tests/test-jinja.cpp index f5197bd33..05ea8ca9e 100644 --- a/tests/test-jinja.cpp +++ b/tests/test-jinja.cpp @@ -32,6 +32,7 @@ static void test_string_methods(testing & t); static void test_array_methods(testing & 
t); static void test_object_methods(testing & t); static void test_hasher(testing & t); +static void test_stats(testing & t); static void test_fuzzing(testing & t); static bool g_python_mode = false; @@ -70,6 +71,7 @@ int main(int argc, char *argv[]) { t.test("object methods", test_object_methods); if (!g_python_mode) { t.test("hasher", test_hasher); + t.test("stats", test_stats); t.test("fuzzing", test_fuzzing); } @@ -1795,6 +1797,63 @@ static void test_hasher(testing & t) { }); } +static void test_stats(testing & t) { + static auto get_stats = [](const std::string & tmpl, const json & vars) -> jinja::value { + jinja::lexer lexer; + auto lexer_res = lexer.tokenize(tmpl); + + jinja::program prog = jinja::parse_from_tokens(lexer_res); + + jinja::context ctx(tmpl); + jinja::global_from_json(ctx, json{{ "val", vars }}, true); + ctx.is_get_stats = true; + + jinja::runtime runtime(ctx); + runtime.execute(prog); + + return ctx.get_val("val"); + }; + + t.test("stats", [](testing & t) { + jinja::value val = get_stats( + "{{val.num}} " + "{{val.str}} " + "{{val.arr[0]}} " + "{{val.obj.key1}} " + "{{val.nested | tojson}}", + // Note: the json below will be wrapped inside "val" in the context + json{ + {"num", 1}, + {"str", "abc"}, + {"arr", json::array({1, 2, 3})}, + {"obj", json::object({{"key1", 1}, {"key2", 2}, {"key3", 3}})}, + {"nested", json::object({ + {"inner_key1", json::array({1, 2})}, + {"inner_key2", json::object({{"a", "x"}, {"b", "y"}})} + })}, + {"mixed", json::object({ + {"used", 1}, + {"unused", 2}, + })}, + } + ); + + t.assert_true("num is used", val->at("num")->stats.used); + t.assert_true("str is used", val->at("str")->stats.used); + + t.assert_true("arr is used", val->at("arr")->stats.used); + t.assert_true("arr[0] is used", val->at("arr")->at(0)->stats.used); + t.assert_true("arr[1] is not used", !val->at("arr")->at(1)->stats.used); + + t.assert_true("obj is used", val->at("obj")->stats.used); + t.assert_true("obj.key1 is used", val->at("obj")->at("key1")->stats.used); + t.assert_true("obj.key2 is not used", !val->at("obj")->at("key2")->stats.used); + + t.assert_true("inner_key1[0] is used", val->at("nested")->at("inner_key1")->at(0)->stats.used); + t.assert_true("inner_key2.a is used", val->at("nested")->at("inner_key2")->at("a")->stats.used); + }); +} + static void test_template_cpp(testing & t, const std::string & name, const std::string & tmpl, const json & vars, const std::string & expect) { t.test(name, [&tmpl, &vars, &expect](testing & t) { jinja::lexer lexer; From e8e261699a2a93b60f307d92aa788e47b6b2ebd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 22 Feb 2026 22:33:49 +0100 Subject: [PATCH 8/9] cli : provide model with text filename (#19783) --- tools/cli/cli.cpp | 9 +++++++++ tools/server/server-context.cpp | 3 +++ tools/server/server-context.h | 3 +++ 3 files changed, 15 insertions(+) diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index ad421e632..e57bf52e3 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -380,6 +380,15 @@ int main(int argc, char ** argv) { console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); continue; } + if (inf.fim_sep_token != LLAMA_TOKEN_NULL) { + cur_msg += common_token_to_piece(ctx_cli.ctx_server.get_llama_context(), inf.fim_sep_token, true); + cur_msg += fname; + cur_msg.push_back('\n'); + } else { + cur_msg += "--- File: "; + cur_msg += fname; + cur_msg += " ---\n"; + } cur_msg += marker; console::log("Loaded text from '%s'\n", fname.c_str()); continue; diff 
--git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 8aab0d4c1..0f2f3a45a 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -2911,6 +2911,9 @@ server_context_meta server_context::get_meta() const {
         /* fim_pre_token */ llama_vocab_fim_pre(impl->vocab),
         /* fim_sub_token */ llama_vocab_fim_suf(impl->vocab),
         /* fim_mid_token */ llama_vocab_fim_mid(impl->vocab),
+        /* fim_pad_token */ llama_vocab_fim_pad(impl->vocab),
+        /* fim_rep_token */ llama_vocab_fim_rep(impl->vocab),
+        /* fim_sep_token */ llama_vocab_fim_sep(impl->vocab),
 
         /* model_vocab_type */ llama_vocab_type(impl->vocab),
         /* model_vocab_n_tokens */ llama_vocab_n_tokens(impl->vocab),
diff --git a/tools/server/server-context.h b/tools/server/server-context.h
index c0b5d373f..03c29f513 100644
--- a/tools/server/server-context.h
+++ b/tools/server/server-context.h
@@ -30,6 +30,9 @@ struct server_context_meta {
     llama_token fim_pre_token;
     llama_token fim_sub_token;
     llama_token fim_mid_token;
+    llama_token fim_pad_token;
+    llama_token fim_rep_token;
+    llama_token fim_sep_token;
 
     // model meta
     enum llama_vocab_type model_vocab_type;

From 2b6dfe824de8600c061ef91ce5cc5c307f97112c Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Mon, 23 Feb 2026 07:04:30 +0100
Subject: [PATCH 9/9] llama : remove write/read of output ids/logits/embeddings (#18862)

* llama : remove write/read of output ids/logits/embeddings

This commit removes the write/read of output ids, logits and embeddings
from the llama context state.

Refs: https://github.com/ggml-org/llama.cpp/pull/18862#issuecomment-3756330941

* completion : add replaying of session state

This commit updates the session handling in the completion tool to
handle the fact that logits are no longer stored in the session file.
Instead, we need to replay the last token to get the logits for
sampling.

* common : add common_prompt_batch_decode function

This commit adds a new function which is responsible for decoding a
prompt and optionally handling the saving of session data.

* update save-load-state.cpp to use llama_state_load_file

This commit updates the save-load-state example to utilize the
llama_state_load_file function for loading the model state from a file.
It also replays the last token after loading, since the state is now
stored before the last token is processed.

* examples : set n_seq_max = 2 for ctx3

This commit updates the save-load-state example to set the n_seq_max
parameter to 2 when initializing the ctx3 context.

The motivation for this change is that with n_parallel/n_seq_max set
to 1 the context only supports one sequence, but the test later tries
to use a second sequence, which results in the following error:

```console
main : loaded state with 4 tokens
main : seq 0 copied, 225760 bytes
main : kv cache cleared
find_slot: seq_id=1 >= n_seq_max=1 Try using a bigger --parallel value
state_read_meta: failed to find available cells in kv cache
```

This seems to only happen for recurrent/hybrid models.
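For illustration, a minimal sketch of how the new helpers are meant to fit together; the `demo_session_replay` wrapper, the `dump_state.bin` path and the include paths are made up for the example, and the authoritative usage is in the save-load-state and completion changes below:

```cpp
#include <vector>

#include "common.h"
#include "llama.h"

// Hypothetical helper (not part of the patch) showing the intended flow:
// save the state *before* the last prompt token, then restore it elsewhere
// and replay that token to regenerate the logits needed for sampling.
static bool demo_session_replay(llama_context * ctx_save, llama_context * ctx_load,
                                const std::vector<llama_token> & prompt, int n_batch) {
    if (prompt.empty()) {
        return false;
    }

    int n_past = 0;

    // Decodes all but the last token, saves the session, then decodes the last
    // token (the prompt is assumed to fit in a single batch when saving).
    if (!common_prompt_batch_decode(ctx_save, prompt, n_past, n_batch,
                                    "dump_state.bin", /*save_state=*/true)) {
        return false;
    }

    // Restore the saved state into another context.
    std::vector<llama_token> unused(prompt.size());
    size_t n_restored = 0;
    if (!llama_state_load_file(ctx_load, "dump_state.bin",
                               unused.data(), unused.size(), &n_restored)) {
        return false;
    }

    // Replay the last prompt token once; after this the restored context has
    // valid logits and generation can continue from the next position.
    return common_replay_last_token(ctx_load, prompt.back(), (int32_t) n_restored);
}
```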
--- common/common.cpp | 62 ++++++++++ common/common.h | 17 +++ examples/save-load-state/save-load-state.cpp | 93 ++++++-------- src/llama-context.cpp | 122 ------------------- tools/completion/completion.cpp | 38 +++--- 5 files changed, 132 insertions(+), 200 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 75116ed6f..53bddc4ef 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1760,3 +1760,65 @@ float lr_opt::get_lr(float epoch) const { LOG_INF("epoch %.2g lr=%.2g\n", epoch, r); return r; } + +bool common_replay_last_token(struct llama_context * ctx, llama_token last_token, int32_t pos) { + llama_batch batch = llama_batch_get_one(&last_token, 1); + batch.pos = &pos; + if (llama_decode(ctx, batch)) { + LOG_ERR("%s: failed to replay last token\n", __func__); + return false; + } + return true; +} + +bool common_prompt_batch_decode( + struct llama_context * ctx, + const std::vector & tokens, + int & n_past, + int n_batch, + std::string_view state_path, + bool save_state) { + const int n_eval = tokens.size(); + if (n_eval == 0) { + return true; + } + + if (save_state && n_eval > 1) { + const int n_tokens_before_last = n_eval - 1; + + GGML_ASSERT(n_eval <= n_batch); + + // Decode all but the last token so we can save the memory state before decoding the last token. + // This is done so we can restore the session state later and replay the last token. + // Memory implementations in recurrent/hybrid models don't support removing tokens from their + // memory, so we can't just remove the last token from the memory and replay the last token which + // is the reason for this logic. + if (llama_decode(ctx, llama_batch_get_one(const_cast(tokens.data()), n_tokens_before_last))) { + LOG_ERR("%s : failed to eval\n", __func__); + return false; + } + n_past += n_tokens_before_last; + + llama_state_save_file(ctx, state_path.data(), tokens.data(), n_tokens_before_last); + LOG_INF("saved session before last token to %s, n_tokens = %d\n", state_path.data(), n_tokens_before_last); + + llama_token last_token = tokens.back(); + llama_batch batch = llama_batch_get_one(&last_token, 1); + int32_t pos = n_past; + batch.pos = &pos; + + if (llama_decode(ctx, batch)) { + LOG_ERR("%s : failed to eval last token\n", __func__); + return false; + } + n_past++; + } else { + if (llama_decode(ctx, llama_batch_get_one(const_cast(tokens.data()), n_eval))) { + LOG_ERR("%s : failed to eval\n", __func__); + return false; + } + n_past += n_eval; + } + + return true; +} diff --git a/common/common.h b/common/common.h index a4c431172..1fa172865 100644 --- a/common/common.h +++ b/common/common.h @@ -804,6 +804,23 @@ void common_batch_add( const std::vector & seq_ids, bool logits); +// decodes a single batch of tokens for a prompt and manages session tokens +// +// Note: We save state before the last token so that we can replay it to ensure +// compatibility with all memory types. Recurrent/hybrid models cannot remove +// tokens from memory, so this approach works across all model architectures. 
+bool common_prompt_batch_decode( + struct llama_context * ctx, + const std::vector & embd, + int & n_past, + int n_batch, + std::string_view state_path, + bool save_state); + +// replays the last token after loading state to regenerate logits +// used after loading session state to ensure the sampling context has valid logits +bool common_replay_last_token(struct llama_context * ctx, llama_token last_token, int32_t pos); + // // Vocab utils // diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index 39d446466..5e35dcd60 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -5,12 +5,15 @@ #include #include + int main(int argc, char ** argv) { common_params params; params.prompt = "The quick brown fox"; params.sampling.seed = 1234; + const std::string_view state_file = "dump_state.bin"; + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } @@ -53,35 +56,16 @@ int main(int argc, char ** argv) { // tokenize prompt auto tokens = common_tokenize(ctx, params.prompt, true); - // prepare the batch - llama_batch batch = llama_batch_init(tokens.size(), 0, 1); - for (size_t i = 0; i < tokens.size(); i++) { - common_batch_add(batch, tokens[i], i, {0}, false); + const bool save_state = true; + if (!common_prompt_batch_decode(ctx, tokens, n_past, params.n_batch, state_file, save_state)) { + return 1; } - batch.logits[batch.n_tokens - 1] = true; // generate next token - - // evaluate prompt - llama_decode(ctx, batch); - n_past += batch.n_tokens; - - // save state (rng, logits, embedding and kv_cache) to file - { - std::vector state_mem(llama_state_get_size(ctx)); - const size_t written = llama_state_get_data(ctx, state_mem.data(), state_mem.size()); - - FILE *fp_write = fopen("dump_state.bin", "wb"); - fwrite(state_mem.data(), 1, written, fp_write); - fclose(fp_write); - - fprintf(stderr, "%s : serialized state into %zd out of a maximum of %zd bytes\n", __func__, written, state_mem.size()); - } - - // save state (last tokens) - const auto n_past_saved = n_past; // first run printf("\nfirst run: %s", params.prompt.c_str()); + llama_batch batch = llama_batch_init(1, 0, 1); + for (auto i = 0; i < params.n_predict; i++) { auto next_token = llama_sampler_sample(smpl, ctx, -1); auto next_token_str = common_token_to_piece(ctx, next_token); @@ -111,27 +95,23 @@ int main(int argc, char ** argv) { printf("\nsecond run: %s", params.prompt.c_str()); - // load state (rng, logits, embedding and kv_cache) from file - { - std::vector state_mem; + // load state from file + std::vector unused_sts(tokens.size()); // unused session tokens. 
+ size_t n_token_count_out = 0; - FILE * fp_read = fopen("dump_state.bin", "rb"); - fseek(fp_read, 0, SEEK_END); - state_mem.resize(ftell(fp_read)); - fseek(fp_read, 0, SEEK_SET); - const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read); - fclose(fp_read); - - if (read != llama_state_set_data(ctx2, state_mem.data(), state_mem.size())) { - fprintf(stderr, "\n%s : failed to read state\n", __func__); - return 1; - } - - fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size()); + if (!llama_state_load_file(ctx2, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { + fprintf(stderr, "\n%s : failed to load state\n", __func__); + return 1; } + fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out); + // restore state (last tokens) - n_past = n_past_saved; + n_past = n_token_count_out; + if (!common_replay_last_token(ctx2, tokens.back(), n_past)) { + return 1; + } + ++n_past; // second run for (auto i = 0; i < params.n_predict; i++) { @@ -160,7 +140,9 @@ int main(int argc, char ** argv) { } // make new context - llama_context * ctx3 = llama_init_from_model(model, common_context_params_to_llama(params)); + auto params_ctx3 = common_context_params_to_llama(params); + params_ctx3.n_seq_max = 2; + llama_context * ctx3 = llama_init_from_model(model, params_ctx3); llama_sampler * smpl3 = llama_sampler_chain_init(sparams); @@ -169,26 +151,21 @@ int main(int argc, char ** argv) { printf("\nsingle seq run: %s", params.prompt.c_str()); // load state (rng, logits, embedding and kv_cache) from file - { - std::vector state_mem; + n_token_count_out = 0; - FILE * fp_read = fopen("dump_state.bin", "rb"); - fseek(fp_read, 0, SEEK_END); - state_mem.resize(ftell(fp_read)); - fseek(fp_read, 0, SEEK_SET); - const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read); - fclose(fp_read); - - if (read != llama_state_set_data(ctx3, state_mem.data(), state_mem.size())) { - fprintf(stderr, "\n%s : failed to read state\n", __func__); - return 1; - } - - fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size()); + if (!llama_state_load_file(ctx3, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) { + fprintf(stderr, "\n%s : failed to load state\n", __func__); + return 1; } + fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out); + // restore state (last tokens) - n_past = n_past_saved; + n_past = n_token_count_out; + if (!common_replay_last_token(ctx3, tokens.back(), n_past)) { + return 1; + } + ++n_past; // save seq 0 and load into seq 1 { diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 7cd0bfc0d..98d055d34 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2440,64 +2440,6 @@ size_t llama_context::state_write_data(llama_io_write_i & io) { // TODO: add more model-specific info which should prevent loading the session file if not identical } - // write output ids - { - LLAMA_LOG_DEBUG("%s: - writing output ids\n", __func__); - - const auto n_outputs = this->n_outputs; - const auto & output_ids = this->output_ids; - - std::vector w_output_pos; - - w_output_pos.resize(n_outputs); - - // build a more compact representation of the output ids - for (size_t i = 0; i < n_batch(); ++i) { - // map an output id to a position in the batch - int64_t pos = output_ids[i]; - if (pos >= 0) { - GGML_ASSERT(pos < n_outputs); - w_output_pos[pos] = i; 
- } - } - - io.write(&n_outputs, sizeof(n_outputs)); - - if (n_outputs) { - io.write(w_output_pos.data(), n_outputs * sizeof(int32_t)); - } - } - - // [TAG_CONTEXT_STATE_LOGITS] - // write logits - { - LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__); - - const uint64_t logits_size = std::min((uint64_t) this->logits.size, (uint64_t) n_outputs * model.vocab.n_tokens()); - - io.write(&logits_size, sizeof(logits_size)); - - if (logits_size) { - io.write(logits.data, logits_size * sizeof(float)); - } - } - - // write embeddings - { - LLAMA_LOG_DEBUG("%s: - writing embeddings\n", __func__); - - const uint64_t embd_size = std::min((uint64_t) this->embd.size, (uint64_t) n_outputs * model.hparams.n_embd); - - io.write(&embd_size, sizeof(embd_size)); - - if (embd_size) { - io.write(embd.data, embd_size * sizeof(float)); - } - } - - // TODO: handle sampling buffers and samplers state ? - // https://github.com/ggml-org/llama.cpp/pull/17004 - if (memory != nullptr) { LLAMA_LOG_DEBUG("%s: - writing memory module\n", __func__); memory->state_write(io); @@ -2523,70 +2465,6 @@ size_t llama_context::state_read_data(llama_io_read_i & io) { // TODO: add more info which needs to be identical but which is not verified otherwise } - // read output ids - { - LLAMA_LOG_DEBUG("%s: - reading output ids\n", __func__); - - auto n_outputs = this->n_outputs; - io.read_to(&n_outputs, sizeof(n_outputs)); - - if (n_outputs > output_reserve(n_outputs)) { - throw std::runtime_error("could not reserve outputs"); - } - - std::vector output_pos; - - if (n_outputs) { - output_pos.resize(n_outputs); - io.read_to(output_pos.data(), n_outputs * sizeof(int32_t)); - - for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { - int32_t id = output_pos[i]; - if ((uint32_t) id >= n_batch()) { - throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, n_batch())); - } - this->output_ids[id] = i; - } - - this->n_outputs = n_outputs; - } - } - - // read logits - { - LLAMA_LOG_DEBUG("%s: - reading logits\n", __func__); - - uint64_t logits_size; - io.read_to(&logits_size, sizeof(logits_size)); - - if (this->logits.size < logits_size) { - throw std::runtime_error("logits buffer too small"); - } - - if (logits_size) { - io.read_to(this->logits.data, logits_size * sizeof(float)); - } - } - - // read embeddings - { - LLAMA_LOG_DEBUG("%s: - reading embeddings\n", __func__); - - uint64_t embd_size; - io.read_to(&embd_size, sizeof(embd_size)); - - if (this->embd.size < embd_size) { - throw std::runtime_error("embeddings buffer too small"); - } - - if (embd_size) { - io.read_to(this->embd.data, embd_size * sizeof(float)); - } - } - - // TODO: handle sampling buffers and samplers state ? - // https://github.com/ggml-org/llama.cpp/pull/17004 - if (memory) { LLAMA_LOG_DEBUG("%s: - reading memory module\n", __func__); diff --git a/tools/completion/completion.cpp b/tools/completion/completion.cpp index 977132756..aed2c0e38 100644 --- a/tools/completion/completion.cpp +++ b/tools/completion/completion.cpp @@ -387,6 +387,17 @@ int main(int argc, char ** argv) { } session_do_save = !path_session.empty() && n_match < embd_inp.size() && !params.prompt_cache_ro; + + // Logits are not stored as part of the session state so we need to + // "replay" the last token to get logits for sampling. 
+        if (!session_tokens.empty() && n_match > 0 && n_match == session_tokens.size()) {
+            if (!common_replay_last_token(ctx, session_tokens.back(), n_match)) {
+                return 1;
+            }
+
+            session_do_save = false;
+            LOG_INF("%s: replayed last token from session\n", __func__);
+        }
     }
 
     // number of tokens to keep when resetting context
@@ -675,40 +686,27 @@ int main(int argc, char ** argv) {
         }
 
         if (!embd.empty()) {
-            int n_eval = (int) embd.size();
             LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
-
-            GGML_ASSERT(n_eval <= params.n_batch);
-            if (llama_decode(ctx, llama_batch_get_one(embd.data(), n_eval))) {
-                LOG_ERR("%s : failed to eval\n", __func__);
+            const bool is_last_batch = (n_consumed >= (int) embd_inp.size());
+            const bool save_now = session_do_save && is_last_batch;
+            if (!common_prompt_batch_decode(ctx, embd, n_past, params.n_batch, path_session, save_now)) {
                 return 1;
             }
-
-            n_past += n_eval;
+            session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
+            n_session_consumed = session_tokens.size();
+            session_do_save = false;
 
             LOG_DBG("n_past = %d\n", n_past);
+
             // Display total tokens alongside total time
             if (params.n_print > 0 && n_past % params.n_print == 0) {
                 LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
             }
         }
-
-        if (!embd.empty() && !path_session.empty()) {
-            session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
-            n_session_consumed = session_tokens.size();
-        }
     }
 
     embd.clear();
 
     if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
-        // optionally save the session on first sample (for faster prompt loading next time)
-        if (session_do_save) {
-            session_do_save = false;
-            llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
-
-            LOG_DBG("saved session to %s\n", path_session.c_str());
-        }
 
         const llama_token id = common_sampler_sample(smpl, ctx, -1);