From c2802af9e80e9f476b0d10fcf3001e7274007685 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Tue, 29 Apr 2025 20:50:46 +0800 Subject: [PATCH] fix qwen3, fixed sd, fixed glm4 --- gpttype_adapter.cpp | 33 +++++++++++++++ kcpp_adapters/ChatML-NoThink.json | 8 ++++ klite.embd | 62 +++++++++++++++++++--------- koboldcpp.py | 2 +- model_adapter.cpp | 9 ++++ model_adapter.h | 1 + otherarch/sdcpp/stable-diffusion.cpp | 8 ++-- 7 files changed, 99 insertions(+), 24 deletions(-) create mode 100644 kcpp_adapters/ChatML-NoThink.json diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 6461ddd50..6d649af94 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -1915,6 +1915,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in kcpp_data->n_ctx = clamped_max_context_length; max_context_limit_at_load = clamped_max_context_length; add_bos_token = !inputs.no_bos_token; + if(!add_bos_token) { printf("\n======\nBOS token prefix was disabled! Your output may be degraded unless model was designed for it!\n======\n"); } @@ -2368,6 +2369,14 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } } + //we cannot really trust the add bos in vocab. old models don't set it. + // instead, we EXPLICITLY need to find the add_bos_token key==false to automatically set it off. 
+ if(!llamamodel->vocab.get_add_bos() && add_bos_token && file_format_meta.explicitly_no_bos) + { + printf("\nThis architecture has explicitly disabled the BOS token - if you need it, you must add it manually.\n"); + add_bos_token = false; + } + //warmup at least 33 tokens to trigger batch std::vector<int> tmp; for (int i = 1; i <= 33; ++i) { @@ -3180,6 +3189,30 @@ generation_outputs gpttype_generate(const generation_inputs inputs) } } + //need to add a cursed hack to get coherency for GLM4, by ensuring injection for both sop and gmask + if (file_format == FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_GLM4) { + std::string temp = gpttype_get_chat_template(); + if (temp.find("[gMASK]") != std::string::npos) { + if (addedmemory == "") { + if (kcpp_data->prompt.rfind("[gMASK]", 0) == 0) { //check startswith + kcpp_data->prompt.erase(0, 7); + } + if (kcpp_data->prompt.rfind("<sop>", 0) == 0) { //check startswith + kcpp_data->prompt.erase(0, 5); + } + addedmemory = "<sop>"; + } else { + if (addedmemory.rfind("[gMASK]", 0) == 0) { //check startswith + addedmemory.erase(0, 7); + } + if (addedmemory.rfind("<sop>", 0) == 0) { //check startswith + addedmemory.erase(0, 5); + } + addedmemory = "<sop>" + addedmemory; + } + } + } + bool stream_sse = inputs.stream_sse; bool allow_regular_prints = (!is_quiet && debugmode!=-1); diff --git a/kcpp_adapters/ChatML-NoThink.json b/kcpp_adapters/ChatML-NoThink.json new file mode 100644 index 000000000..4fc437b05 --- /dev/null +++ b/kcpp_adapters/ChatML-NoThink.json @@ -0,0 +1,8 @@ +{ + "system_start": "<|im_start|>system\n", + "system_end": "<|im_end|>\n", + "user_start": "<|im_start|>user\n", + "user_end": "<|im_end|>\n", + "assistant_start": "<|im_start|>assistant\n", + "assistant_end": "<|im_end|>\n<think>\n\n</think>\n" +} diff --git a/klite.embd b/klite.embd index 66e84353d..86a279ad6 100644 --- a/klite.embd +++ b/klite.embd @@ -12,7 +12,7 @@ Current version indicated by LITEVER below. -->