From c2802af9e80e9f476b0d10fcf3001e7274007685 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Tue, 29 Apr 2025 20:50:46 +0800 Subject: [PATCH] fix qwen3, fixed sd, fixed glm4 --- gpttype_adapter.cpp | 33 +++++++++++++++ kcpp_adapters/ChatML-NoThink.json | 8 ++++ klite.embd | 62 +++++++++++++++++++--------- koboldcpp.py | 2 +- model_adapter.cpp | 9 ++++ model_adapter.h | 1 + otherarch/sdcpp/stable-diffusion.cpp | 8 ++-- 7 files changed, 99 insertions(+), 24 deletions(-) create mode 100644 kcpp_adapters/ChatML-NoThink.json diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 6461ddd50..6d649af94 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -1915,6 +1915,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in kcpp_data->n_ctx = clamped_max_context_length; max_context_limit_at_load = clamped_max_context_length; add_bos_token = !inputs.no_bos_token; + if(!add_bos_token) { printf("\n======\nBOS token prefix was disabled! Your output may be degraded unless model was designed for it!\n======\n"); } @@ -2368,6 +2369,14 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } } + //we cannot really trust the add bos in vocab. old models don't set it. + // instead, we EXPLICITLY need to find the add_bos_token key==false to automatically set it off. 
+ if(!llamamodel->vocab.get_add_bos() && add_bos_token && file_format_meta.explicitly_no_bos) + { + printf("\nThis architecture has explicitly disabled the BOS token - if you need it, you must add it manually.\n"); + add_bos_token = false; + } + //warmup at least 33 tokens to trigger batch std::vector<int> tmp; for (int i = 1; i <= 33; ++i) { @@ -3180,6 +3189,30 @@ generation_outputs gpttype_generate(const generation_inputs inputs) } } + //need to add a cursed hack to get coherency for GLM4, by ensuring injection for both sop and gmask + if (file_format == FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_GLM4) { + std::string temp = gpttype_get_chat_template(); + if (temp.find("[gMASK]") != std::string::npos) { + if (addedmemory == "") { + if (kcpp_data->prompt.rfind("[gMASK]", 0) == 0) { //check startswith + kcpp_data->prompt.erase(0, 7); + } + if (kcpp_data->prompt.rfind("<sop>", 0) == 0) { //check startswith + kcpp_data->prompt.erase(0, 5); + } + addedmemory = "<sop>"; + } else { + if (addedmemory.rfind("[gMASK]", 0) == 0) { //check startswith + addedmemory.erase(0, 7); + } + if (addedmemory.rfind("<sop>", 0) == 0) { //check startswith + addedmemory.erase(0, 5); + } + addedmemory = "<sop>" + addedmemory; + } + } + } + bool stream_sse = inputs.stream_sse; bool allow_regular_prints = (!is_quiet && debugmode!=-1); diff --git a/kcpp_adapters/ChatML-NoThink.json b/kcpp_adapters/ChatML-NoThink.json new file mode 100644 index 000000000..4fc437b05 --- /dev/null +++ b/kcpp_adapters/ChatML-NoThink.json @@ -0,0 +1,8 @@ +{ + "system_start": "<|im_start|>system\n", + "system_end": "<|im_end|>\n", + "user_start": "<|im_start|>user\n", + "user_end": "<|im_end|>\n", + "assistant_start": "<|im_start|>assistant\n", + "assistant_end": "<|im_end|>\n<think>\n\n</think>\n" +} diff --git a/klite.embd b/klite.embd index 66e84353d..86a279ad6 100644 --- a/klite.embd +++ b/klite.embd @@ -12,7 +12,7 @@ Current version indicated by LITEVER below. -->