diff --git a/common/chat-parser-xml-toolcall.cpp b/common/chat-parser-xml-toolcall.cpp
index a80900ff8..ba359fdbf 100644
--- a/common/chat-parser-xml-toolcall.cpp
+++ b/common/chat-parser-xml-toolcall.cpp
@@ -803,7 +803,7 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
     }
 
     // remove potential partial suffix
-    if (builder.pos() == builder.input().size()) {
+    if (builder.pos() == builder.input().size() && builder.is_partial()) {
         if (unclosed_reasoning_content.empty()) {
             rstrip(content);
             trim_potential_partial_word(content);
diff --git a/common/common.cpp b/common/common.cpp
index 8694652f5..3687f6b57 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1767,3 +1767,65 @@ float lr_opt::get_lr(float epoch) const {
     LOG_INF("epoch %.2g lr=%.2g\n", epoch, r);
     return r;
 }
+
+bool common_replay_last_token(struct llama_context * ctx, llama_token last_token, int32_t pos) {
+    llama_batch batch = llama_batch_get_one(&last_token, 1);
+    batch.pos = &pos;
+    if (llama_decode(ctx, batch)) {
+        LOG_ERR("%s: failed to replay last token\n", __func__);
+        return false;
+    }
+    return true;
+}
+
+bool common_prompt_batch_decode(
+        struct llama_context * ctx,
+        const std::vector<llama_token> & tokens,
+        int & n_past,
+        int n_batch,
+        std::string_view state_path,
+        bool save_state) {
+    const int n_eval = tokens.size();
+    if (n_eval == 0) {
+        return true;
+    }
+
+    if (save_state && n_eval > 1) {
+        const int n_tokens_before_last = n_eval - 1;
+
+        GGML_ASSERT(n_eval <= n_batch);
+
+        // Decode all but the last token so we can save the memory state before decoding the last token.
+        // This is done so we can restore the session state later and replay the last token.
+        // Memory implementations in recurrent/hybrid models don't support removing tokens from their
+        // memory, so we can't just remove the last token from the memory and replay the last token which
+        // is the reason for this logic.
+        if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token *>(tokens.data()), n_tokens_before_last))) {
+            LOG_ERR("%s : failed to eval\n", __func__);
+            return false;
+        }
+        n_past += n_tokens_before_last;
+
+        llama_state_save_file(ctx, state_path.data(), tokens.data(), n_tokens_before_last);
+        LOG_INF("saved session before last token to %s, n_tokens = %d\n", state_path.data(), n_tokens_before_last);
+
+        llama_token last_token = tokens.back();
+        llama_batch batch = llama_batch_get_one(&last_token, 1);
+        int32_t pos = n_past;
+        batch.pos = &pos;
+
+        if (llama_decode(ctx, batch)) {
+            LOG_ERR("%s : failed to eval last token\n", __func__);
+            return false;
+        }
+        n_past++;
+    } else {
+        if (llama_decode(ctx, llama_batch_get_one(const_cast<llama_token *>(tokens.data()), n_eval))) {
+            LOG_ERR("%s : failed to eval\n", __func__);
+            return false;
+        }
+        n_past += n_eval;
+    }
+
+    return true;
+}
diff --git a/common/common.h b/common/common.h
index 4f70c4074..4a36bb5c4 100644
--- a/common/common.h
+++ b/common/common.h
@@ -801,6 +801,23 @@ void common_batch_add(
     const std::vector<llama_seq_id> & seq_ids,
                                bool   logits);
 
+// decodes a single batch of tokens for a prompt and manages session tokens
+//
+// Note: We save state before the last token so that we can replay it to ensure
+// compatibility with all memory types. Recurrent/hybrid models cannot remove
+// tokens from memory, so this approach works across all model architectures.
+bool common_prompt_batch_decode(
+        struct llama_context * ctx,
+        const std::vector<llama_token> & embd,
+        int & n_past,
+        int n_batch,
+        std::string_view state_path,
+        bool save_state);
+
+// replays the last token after loading state to regenerate logits
+// used after loading session state to ensure the sampling context has valid logits
+bool common_replay_last_token(struct llama_context * ctx, llama_token last_token, int32_t pos);
+
 //
 // Vocab utils
 //
diff --git a/common/jinja/runtime.cpp b/common/jinja/runtime.cpp
index cc012c892..c93e182a7 100644
--- a/common/jinja/runtime.cpp
+++ b/common/jinja/runtime.cpp
@@ -85,7 +85,7 @@ value identifier::execute_impl(context & ctx) {
     auto builtins = global_builtins();
     if (!it->is_undefined()) {
         if (ctx.is_get_stats) {
-            it->stats.used = true;
+            value_t::stats_t::mark_used(it);
         }
         JJ_DEBUG("Identifier '%s' found, type = %s", val.c_str(), it->type().c_str());
         return it;
@@ -277,7 +277,7 @@ value binary_expression::execute_impl(context & ctx) {
 static value try_builtin_func(context & ctx, const std::string & name, value & input, bool undef_on_missing = false) {
     JJ_DEBUG("Trying built-in function '%s' for type %s", name.c_str(), input->type().c_str());
     if (ctx.is_get_stats) {
-        input->stats.used = true;
+        value_t::stats_t::mark_used(input);
         input->stats.ops.insert(name);
     }
     auto builtins = input->get_builtins();
@@ -448,7 +448,7 @@ value for_statement::execute_impl(context & ctx) {
 
     // mark the variable being iterated as used for stats
     if (ctx.is_get_stats) {
-        iterable_val->stats.used = true;
+        value_t::stats_t::mark_used(iterable_val);
         iterable_val->stats.ops.insert("array_access");
     }
 
@@ -470,7 +470,7 @@ value for_statement::execute_impl(context & ctx) {
             items.push_back(std::move(tuple));
         }
         if (ctx.is_get_stats) {
-            iterable_val->stats.used = true;
+            value_t::stats_t::mark_used(iterable_val);
             iterable_val->stats.ops.insert("object_access");
         }
     } else {
@@ -480,7 +480,7 @@ value for_statement::execute_impl(context & ctx) {
             items.push_back(item);
         }
         if (ctx.is_get_stats) {
-            iterable_val->stats.used = true;
+            value_t::stats_t::mark_used(iterable_val);
             iterable_val->stats.ops.insert("array_access");
         }
     }
@@ -817,8 +817,9 @@ value member_expression::execute_impl(context & ctx) {
     }
 
     if (ctx.is_get_stats && val && object && property) {
-        val->stats.used = true;
-        object->stats.used = true;
+        value_t::stats_t::mark_used(val);
+        value_t::stats_t::mark_used(object);
+        value_t::stats_t::mark_used(property);
         if (is_val(property)) {
             object->stats.ops.insert("array_access");
         } else if (is_val(property)) {
diff --git a/common/jinja/value.cpp b/common/jinja/value.cpp
index 9987836d1..749113124 100644
--- a/common/jinja/value.cpp
+++ b/common/jinja/value.cpp
@@ -161,6 +161,11 @@ static value tojson(const func_args & args) {
     value val_separators = args.get_kwarg_or_pos("separators", 3);
     value val_sort = args.get_kwarg_or_pos("sort_keys", 4);
     int indent = -1;
+    if (args.ctx.is_get_stats) {
+        // mark as used (recursively) for stats
+        auto val_input = args.get_pos(0);
+        value_t::stats_t::mark_used(const_cast<value &>(val_input), true);
+    }
     if (is_val(val_indent)) {
         indent = static_cast<int>(val_indent->as_int());
     }
@@ -891,6 +896,11 @@ const func_builtins & value_array_t::get_builtins() const {
         }},
         {"string", [](const func_args & args) -> value {
             args.ensure_vals();
+            if (args.ctx.is_get_stats) {
+                // mark as used (recursively) for stats
+                auto val_input = args.get_pos(0);
+                value_t::stats_t::mark_used(const_cast<value &>(val_input), true);
+            }
             return mk_val(args.get_pos(0)->as_string());
         }},
         {"tojson", tojson},
@@ -1046,6 +1056,11 @@ const func_builtins & value_object_t::get_builtins() const {
         {"tojson", tojson},
         {"string", [](const func_args & args) -> value {
             args.ensure_vals();
+            if (args.ctx.is_get_stats) {
+                // mark as used (recursively) for stats
+                auto val_input = args.get_pos(0);
+                value_t::stats_t::mark_used(const_cast<value &>(val_input), true);
+            }
             return mk_val(args.get_pos(0)->as_string());
         }},
         {"length", [](const func_args & args) -> value {
@@ -1358,4 +1373,21 @@ std::string value_to_string_repr(const value & val) {
     }
 }
 
+// stats utility
+void value_t::stats_t::mark_used(value & val, bool deep) {
+    val->stats.used = true;
+    if (deep) {
+        if (is_val<value_array_t>(val)) {
+            for (auto & item : val->val_arr) {
+                mark_used(item, deep);
+            }
+        } else if (is_val<value_object_t>(val)) {
+            for (auto & pair : val->val_obj) {
+                mark_used(pair.first, deep);
+                mark_used(pair.second, deep);
+            }
+        }
+    }
+}
+
 } // namespace jinja
diff --git a/common/jinja/value.h b/common/jinja/value.h
index 1c04760a0..07e447ff6 100644
--- a/common/jinja/value.h
+++ b/common/jinja/value.h
@@ -118,6 +118,8 @@ struct value_t {
         bool used = false;
         // ops can be builtin calls or operators: "array_access", "object_access"
         std::set<std::string> ops;
+        // utility to recursively mark value and its children as used
+        static void mark_used(value & val, bool deep = false);
     } stats;
 
     value_t() = default;
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 31acd5bb4..e03810959 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1274,6 +1274,9 @@ class TextModel(ModelBase):
         if chkhsh == "b4b8ca1f9769494fbd956ebc4c249de6131fb277a4a3345a7a92c7dd7a55808d":
             # ref: https://huggingface.co/jdopensource/JoyAI-LLM-Flash
             res = "joyai-llm"
+        if chkhsh == "e4d54df1ebc1f2b91acd986c5b51aa50837d5faf7c7398e73c1f9e9ee5d19869":
+            # ref: https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601
+            res = "kanana2"
 
         if res is None:
             logger.warning("\n")
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index 8f7443d1b..53a73759e 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -152,6 +152,7 @@ models = [
     {"name": "exaone-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/K-EXAONE-236B-A23B", },
     {"name": "qwen35", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3.5-9B-Instruct", },
     {"name": "joyai-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jdopensource/JoyAI-LLM-Flash", },
+    {"name": "kanana2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601", },
 ]
 
 # some models are known to be broken upstream, so we will skip them as exceptions
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 27a98070d..7609e56cc 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -2450,64 +2450,6 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
         // TODO: add more model-specific info which should prevent loading the session file if not identical
     }
 
-    // write output ids
-    {
-        //LLAMA_LOG_DEBUG("%s: - writing output ids\n", __func__);
-
-        const auto n_outputs = this->n_outputs;
-        const auto & output_ids = this->output_ids;
-
-        std::vector<int32_t> w_output_pos;
-
-        w_output_pos.resize(n_outputs);
-
-        // build a more compact representation of the output ids
-        for (size_t i = 0; i < n_batch(); ++i) {
-            // map an output id to a position in the batch
-            int64_t pos = output_ids[i];
-            if (pos >= 0) {
-                GGML_ASSERT(pos < n_outputs);
-                w_output_pos[pos] = i;
-            }
-        }
-
-        io.write(&n_outputs, sizeof(n_outputs));
-
-        if (n_outputs) {
-            io.write(w_output_pos.data(), n_outputs * sizeof(int32_t));
-        }
-    }
-
-    // [TAG_CONTEXT_STATE_LOGITS]
-    // write logits
-    {
-        //LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);
-
-        const uint64_t logits_size = std::min((uint64_t) this->logits.size, (uint64_t) n_outputs * model.vocab.n_tokens());
-
-        io.write(&logits_size, sizeof(logits_size));
-
-        if (logits_size) {
-            io.write(logits.data, logits_size * sizeof(float));
-        }
-    }
-
-    // write embeddings
-    {
-        //LLAMA_LOG_DEBUG("%s: - writing embeddings\n", __func__);
-
-        const uint64_t embd_size = std::min((uint64_t) this->embd.size, (uint64_t) n_outputs * model.hparams.n_embd);
-
-        io.write(&embd_size, sizeof(embd_size));
-
-        if (embd_size) {
-            io.write(embd.data, embd_size * sizeof(float));
-        }
-    }
-
-    // TODO: handle sampling buffers and samplers state ?
-    // https://github.com/ggml-org/llama.cpp/pull/17004
-
     if (memory != nullptr) {
         LLAMA_LOG_DEBUG("%s: - writing memory module\n", __func__);
         memory->state_write(io);
@@ -2533,70 +2475,6 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
         // TODO: add more info which needs to be identical but which is not verified otherwise
     }
 
-    // read output ids
-    {
-        //LLAMA_LOG_DEBUG("%s: - reading output ids\n", __func__);
-
-        auto n_outputs = this->n_outputs;
-        io.read_to(&n_outputs, sizeof(n_outputs));
-
-        if (n_outputs > output_reserve(n_outputs)) {
-            throw std::runtime_error("could not reserve outputs");
-        }
-
-        std::vector<int32_t> output_pos;
-
-        if (n_outputs) {
-            output_pos.resize(n_outputs);
-            io.read_to(output_pos.data(), n_outputs * sizeof(int32_t));
-
-            for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
-                int32_t id = output_pos[i];
-                if ((uint32_t) id >= n_batch()) {
-                    throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, n_batch()));
-                }
-                this->output_ids[id] = i;
-            }
-
-            this->n_outputs = n_outputs;
-        }
-    }
-
-    // read logits
-    {
-        //LLAMA_LOG_DEBUG("%s: - reading logits\n", __func__);
-
-        uint64_t logits_size;
-        io.read_to(&logits_size, sizeof(logits_size));
-
-        if (this->logits.size < logits_size) {
-            throw std::runtime_error("logits buffer too small");
-        }
-
-        if (logits_size) {
-            io.read_to(this->logits.data, logits_size * sizeof(float));
-        }
-    }
-
-    // read embeddings
-    {
-        //LLAMA_LOG_DEBUG("%s: - reading embeddings\n", __func__);
-
-        uint64_t embd_size;
-        io.read_to(&embd_size, sizeof(embd_size));
-
-        if (this->embd.size < embd_size) {
-            throw std::runtime_error("embeddings buffer too small");
-        }
-
-        if (embd_size) {
-            io.read_to(this->embd.data, embd_size * sizeof(float));
-        }
-    }
-
-    // TODO: handle sampling buffers and samplers state ?
-    // https://github.com/ggml-org/llama.cpp/pull/17004
-
     if (memory) {
         LLAMA_LOG_DEBUG("%s: - reading memory module\n", __func__);
         memory->state_read(io);
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 67a3a07c3..9a05da77d 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1816,8 +1816,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_DEEPSEEK2:
             {
-                // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
-                const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
+                // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B, Kanana-2-30B-A3B
+                const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256));
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 414d24725..8de0424b0 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -2263,7 +2263,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
         } else if (
                 tokenizer_pre == "gpt-4o" ||
-                tokenizer_pre == "llama4") {
+                tokenizer_pre == "llama4" ||
+                tokenizer_pre == "kanana2") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
             clean_spaces = false;
         } else if (
diff --git a/tests/test-jinja.cpp b/tests/test-jinja.cpp
index f5197bd33..05ea8ca9e 100644
--- a/tests/test-jinja.cpp
+++ b/tests/test-jinja.cpp
@@ -32,6 +32,7 @@ static void test_string_methods(testing & t);
 static void test_array_methods(testing & t);
 static void test_object_methods(testing & t);
 static void test_hasher(testing & t);
+static void test_stats(testing & t);
 static void test_fuzzing(testing & t);
 
 static bool g_python_mode = false;
@@ -70,6 +71,7 @@ int main(int argc, char *argv[]) {
     t.test("object methods", test_object_methods);
     if (!g_python_mode) {
         t.test("hasher", test_hasher);
+        t.test("stats", test_stats);
         t.test("fuzzing", test_fuzzing);
     }
 
@@ -1795,6 +1797,63 @@ static void test_hasher(testing & t) {
     });
 }
 
+static void test_stats(testing & t) {
+    static auto get_stats = [](const std::string & tmpl, const json & vars) -> jinja::value {
+        jinja::lexer lexer;
+        auto lexer_res = lexer.tokenize(tmpl);
+
+        jinja::program prog = jinja::parse_from_tokens(lexer_res);
+
+        jinja::context ctx(tmpl);
+        jinja::global_from_json(ctx, json{{ "val", vars }}, true);
+        ctx.is_get_stats = true;
+
+        jinja::runtime runtime(ctx);
+        runtime.execute(prog);
+
+        return ctx.get_val("val");
+    };
+
+    t.test("stats", [](testing & t) {
+        jinja::value val = get_stats(
+            "{{val.num}} "
+            "{{val.str}} "
+            "{{val.arr[0]}} "
+            "{{val.obj.key1}} "
+            "{{val.nested | tojson}}",
+            // Note: the json below will be wrapped inside "val" in the context
+            json{
+                {"num", 1},
+                {"str", "abc"},
+                {"arr", json::array({1, 2, 3})},
+                {"obj", json::object({{"key1", 1}, {"key2", 2}, {"key3", 3}})},
+                {"nested", json::object({
+                    {"inner_key1", json::array({1, 2})},
+                    {"inner_key2", json::object({{"a", "x"}, {"b", "y"}})}
+                })},
+                {"mixed", json::object({
+                    {"used", 1},
+                    {"unused", 2},
+                })},
+            }
+        );
+
+        t.assert_true("num is used", val->at("num")->stats.used);
+        t.assert_true("str is used", val->at("str")->stats.used);
+
+        t.assert_true("arr is used", val->at("arr")->stats.used);
+        t.assert_true("arr[0] is used", val->at("arr")->at(0)->stats.used);
+        t.assert_true("arr[1] is not used", !val->at("arr")->at(1)->stats.used);
+
+        t.assert_true("obj is used", val->at("obj")->stats.used);
t.assert_true("obj.key1 is used", val->at("obj")->at("key1")->stats.used); + t.assert_true("obj.key2 is not used", !val->at("obj")->at("key2")->stats.used); + + t.assert_true("inner_key1[0] is used", val->at("nested")->at("inner_key1")->at(0)->stats.used); + t.assert_true("inner_key2.a is used", val->at("nested")->at("inner_key2")->at("a")->stats.used); + }); +} + static void test_template_cpp(testing & t, const std::string & name, const std::string & tmpl, const json & vars, const std::string & expect) { t.test(name, [&tmpl, &vars, &expect](testing & t) { jinja::lexer lexer; diff --git a/tools/completion/completion.cpp b/tools/completion/completion.cpp index ee476cdc4..e588fb8ff 100644 --- a/tools/completion/completion.cpp +++ b/tools/completion/completion.cpp @@ -388,6 +388,17 @@ int main(int argc, char ** argv) { } session_do_save = !path_session.empty() && n_match < embd_inp.size() && !params.prompt_cache_ro; + + // Logits are not stored as part of the session state so we need to + // "replay" the last token to get logits for sampling. + if (!session_tokens.empty() && n_match > 0 && n_match == session_tokens.size()) { + if (!common_replay_last_token(ctx, session_tokens.back(), n_match)) { + return 1; + } + + session_do_save = false; + LOG_INF("%s: replayed last token from session\n", __func__); + } } // number of tokens to keep when resetting context @@ -676,40 +687,27 @@ int main(int argc, char ** argv) { } if (!embd.empty()) { - int n_eval = (int) embd.size(); - LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str()); - - GGML_ASSERT(n_eval <= params.n_batch); - if (llama_decode(ctx, llama_batch_get_one(embd.data(), n_eval))) { - LOG_ERR("%s : failed to eval\n", __func__); + const bool is_last_batch = (n_consumed >= (int) embd_inp.size()); + const bool save_now = session_do_save && is_last_batch; + if (!common_prompt_batch_decode(ctx, embd, n_past, params.n_batch, path_session, save_now)) { return 1; } - - n_past += n_eval; + session_tokens.insert(session_tokens.end(), embd.begin(), embd.begin()); + n_session_consumed = session_tokens.size(); + session_do_save = false; LOG_DBG("n_past = %d\n", n_past); + // Display total tokens alongside total time if (params.n_print > 0 && n_past % params.n_print == 0) { LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); } } - - if (!embd.empty() && !path_session.empty()) { - session_tokens.insert(session_tokens.end(), embd.begin(), embd.end()); - n_session_consumed = session_tokens.size(); - } } embd.clear(); if ((int) embd_inp.size() <= n_consumed && !is_interacting) { - // optionally save the session on first sample (for faster prompt loading next time) - if (session_do_save) { - session_do_save = false; - llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); - - LOG_DBG("saved session to %s\n", path_session.c_str()); - } const llama_token id = common_sampler_sample(smpl, ctx, -1); diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index d717fb669..88b6e77d8 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -1105,6 +1105,8 @@ json convert_responses_to_chatcmpl(const json & response_body) { }; for (json item : input_value) { + bool merge_prev = !chatcmpl_messages.empty() && chatcmpl_messages.back().value("role", "") == "assistant"; + if (exists_and_is_string(item, "content")) { // #responses_create-input-input_item_list-input_message-content-text_input // Only "Input message" contains item["content"]::string @@ -1193,7 
+1195,7 @@ json convert_responses_to_chatcmpl(const json & response_body) { item.at("type") == "message" ) { // #responses_create-input-input_item_list-item-output_message - std::vector chatcmpl_content; + auto chatcmpl_content = json::array(); for (const auto & output_text : item.at("content")) { const std::string type = json_value(output_text, "type", std::string()); @@ -1210,10 +1212,19 @@ json convert_responses_to_chatcmpl(const json & response_body) { }); } - item.erase("status"); - item.erase("type"); - item["content"] = chatcmpl_content; - chatcmpl_messages.push_back(item); + if (merge_prev) { + auto & prev_msg = chatcmpl_messages.back(); + if (!exists_and_is_array(prev_msg, "content")) { + prev_msg["content"] = json::array(); + } + auto & prev_content = prev_msg["content"]; + prev_content.insert(prev_content.end(), chatcmpl_content.begin(), chatcmpl_content.end()); + } else { + item.erase("status"); + item.erase("type"); + item["content"] = chatcmpl_content; + chatcmpl_messages.push_back(item); + } } else if (exists_and_is_string(item, "arguments") && exists_and_is_string(item, "call_id") && exists_and_is_string(item, "name") && @@ -1221,24 +1232,27 @@ json convert_responses_to_chatcmpl(const json & response_body) { item.at("type") == "function_call" ) { // #responses_create-input-input_item_list-item-function_tool_call - json msg = json { - {"role", "assistant"}, - {"tool_calls", json::array({ json { - {"function", json { - {"arguments", item.at("arguments")}, - {"name", item.at("name")}, - }}, - {"id", item.at("call_id")}, - {"type", "function"}, - }})}, + json tool_call = { + {"function", json { + {"arguments", item.at("arguments")}, + {"name", item.at("name")}, + }}, + {"id", item.at("call_id")}, + {"type", "function"}, }; - if (!chatcmpl_messages.empty() && chatcmpl_messages.back().contains("reasoning_content")) { - // Move reasoning content from dummy message to tool call message - msg["reasoning_content"] = chatcmpl_messages.back().at("reasoning_content"); - chatcmpl_messages.pop_back(); + if (merge_prev) { + auto & prev_msg = chatcmpl_messages.back(); + if (!exists_and_is_array(prev_msg, "tool_calls")) { + prev_msg["tool_calls"] = json::array(); + } + prev_msg["tool_calls"].push_back(tool_call); + } else { + chatcmpl_messages.push_back(json { + {"role", "assistant"}, + {"tool_calls", json::array({tool_call})} + }); } - chatcmpl_messages.push_back(msg); } else if (exists_and_is_string(item, "call_id") && (exists_and_is_string(item, "output") || exists_and_is_array(item, "output")) && exists_and_is_string(item, "type") && @@ -1282,12 +1296,16 @@ json convert_responses_to_chatcmpl(const json & response_body) { throw std::invalid_argument("item['content']['text'] is not a string"); } - // Pack reasoning content in dummy message - chatcmpl_messages.push_back(json { - {"role", "assistant"}, - {"content", json::array()}, - {"reasoning_content", item.at("content")[0].at("text")}, - }); + if (merge_prev) { + auto & prev_msg = chatcmpl_messages.back(); + prev_msg["reasoning_content"] = item.at("content")[0].at("text"); + } else { + chatcmpl_messages.push_back(json { + {"role", "assistant"}, + {"content", json::array()}, + {"reasoning_content", item.at("content")[0].at("text")}, + }); + } } else { throw std::invalid_argument("Cannot determine type of 'item'"); } @@ -1296,20 +1314,6 @@ json convert_responses_to_chatcmpl(const json & response_body) { throw std::invalid_argument("'input' must be a string or array of objects"); } - // Remove unused dummy message which contains - // 
reasoning content not followed by tool call - chatcmpl_messages.erase(std::remove_if( - chatcmpl_messages.begin(), - chatcmpl_messages.end(), - [](const json & x){ return x.contains("role") && - x.at("role") == "assistant" && - x.contains("content") && - x.at("content") == json::array() && - x.contains("reasoning_content"); - }), - chatcmpl_messages.end() - ); - chatcmpl_body["messages"] = chatcmpl_messages; if (response_body.contains("tools")) { diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 8aab0d4c1..0f2f3a45a 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2911,6 +2911,9 @@ server_context_meta server_context::get_meta() const { /* fim_pre_token */ llama_vocab_fim_pre(impl->vocab), /* fim_sub_token */ llama_vocab_fim_suf(impl->vocab), /* fim_mid_token */ llama_vocab_fim_mid(impl->vocab), + /* fim_pad_token */ llama_vocab_fim_pad(impl->vocab), + /* fim_rep_token */ llama_vocab_fim_rep(impl->vocab), + /* fim_sep_token */ llama_vocab_fim_sep(impl->vocab), /* model_vocab_type */ llama_vocab_type(impl->vocab), /* model_vocab_n_tokens */ llama_vocab_n_tokens(impl->vocab), diff --git a/tools/server/server-context.h b/tools/server/server-context.h index c0b5d373f..03c29f513 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -30,6 +30,9 @@ struct server_context_meta { llama_token fim_pre_token; llama_token fim_sub_token; llama_token fim_mid_token; + llama_token fim_pad_token; + llama_token fim_rep_token; + llama_token fim_sep_token; // model meta enum llama_vocab_type model_vocab_type; diff --git a/tools/server/webui/README.md b/tools/server/webui/README.md index 98b01fdcd..6fc908e27 100644 --- a/tools/server/webui/README.md +++ b/tools/server/webui/README.md @@ -101,7 +101,7 @@ In a separate terminal, start the backend server: ./llama-server -m model.gguf # Multi-model (ROUTER mode) -./llama-server --model-store /path/to/models +./llama-server --models-dir /path/to/models ``` ### 3. Start Development Servers