Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	ggml.c
#	scripts/compare-llama-bench.py
#	tests/test-backend-ops.cpp
This commit is contained in:
Concedo 2024-05-08 18:19:28 +08:00
commit 165a56088b
13 changed files with 1291 additions and 87 deletions

View file

@ -151,6 +151,8 @@ for model in models:
# print the "pre_tokenizer" content from the tokenizer.json # print the "pre_tokenizer" content from the tokenizer.json
with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f: with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
cfg = json.load(f) cfg = json.load(f)
normalizer = cfg["normalizer"]
logger.info("normalizer: " + json.dumps(normalizer, indent=4))
pre_tokenizer = cfg["pre_tokenizer"] pre_tokenizer = cfg["pre_tokenizer"]
logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4)) logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))

View file

@ -575,7 +575,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) { if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16) {
return ggml_add_cast(ctx, a, b, GGML_TYPE_F32); return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
} else if (a->type == GGML_TYPE_F32) { } else if (a->type == GGML_TYPE_F32) {
return ggml_add(ctx, a, b); return ggml_add(ctx, a, b);

View file

@ -47,7 +47,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", }, { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", }, { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", }, { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", }, { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, -0.0020 ppl @ Mistral-7B", },
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
// Note: Ensure COPY comes after F32 to avoid ftype 0 from matching. // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },

View file

@ -17,6 +17,83 @@
#define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b))
/**
* Converts brain16 to float32.
*
* The bfloat16 floating point format has the following structure:
*
* sign
*
* exponent
*
* mantissa
*
*
* 0b0000000000000000 brain16
*
* Since bf16 has the same number of exponent bits as a 32bit float,
* encoding and decoding numbers becomes relatively straightforward.
*
* sign
*
* exponent
*
* mantissa
*
*
* 0b00000000000000000000000000000000 IEEE binary32
*
* For comparison, the standard fp16 format has fewer exponent bits.
*
* sign
*
* exponent
*
* mantissa
*
*
* 0b0000000000000000 IEEE binary16
*
* @see IEEE 754-2008
*/
/**
 * Converts brain16 to float32.
 *
 * bf16 shares its sign/exponent layout with IEEE binary32, so widening
 * is just placing the 16 stored bits in the high half of a 32-bit word
 * (the low 16 mantissa bits become zero).
 */
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
    union {
        uint32_t as_bits;
        float    as_value;
    } conv;
    conv.as_bits = ((uint32_t) h.bits) << 16;
    return conv.as_value;
}
/**
* Converts float32 to brain16.
*
* This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
* Subnormals shall be flushed to zero, and NANs will be quiet.
* This code should vectorize nicely if using modern compilers.
*/
/**
 * Converts float32 to brain16.
 *
 * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
 * Subnormals shall be flushed to zero, and NANs will be quiet.
 * This code should vectorize nicely if using modern compilers.
 *
 * @param s  the float32 value to narrow
 * @return   the nearest bf16 value (round-to-nearest-even), with
 *           subnormals flushed to +/-0 and NaNs quieted
 */
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
    ggml_bf16_t h;
    union {
        float f;
        uint32_t i;
    } u; /* type-pun through a union to read the float's bit pattern */
    u.f = s;
    /* exponent all-ones with nonzero mantissa => NaN; compare ignores the sign bit */
    if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
        /* keep the top mantissa bits, then set a quiet bit so the result is a qNaN */
        h.bits = (u.i >> 16) | 64; /* force to quiet */
        return h;
    }
    /* exponent field is zero => zero or subnormal; bf16 flushes these to signed zero */
    if (!(u.i & 0x7f800000)) { /* subnormal */
        h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
        return h;
    }
    /* round to nearest, ties to even: add 0x7fff plus the LSB of the
     * surviving mantissa, then drop the low 16 bits */
    h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
    return h;
}
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif

View file

@ -803,7 +803,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
case GGML_OP_DIAG_MASK_INF: case GGML_OP_DIAG_MASK_INF:
case GGML_OP_GET_ROWS: case GGML_OP_GET_ROWS:
{ {
return op->ne[3] == 1; return op->src[0]->type != GGML_TYPE_BF16 && op->ne[3] == 1;
} }
default: default:
return false; return false;

View file

@ -2175,7 +2175,7 @@ kernel void kernel_flash_attn_ext_f16(
const short D4 = D/4; const short D4 = D/4;
const short D8 = D/8; const short D8 = D/8;
const short Q8 = Q/8; //const short Q8 = Q/8;
const short NW = N_SIMDWIDTH; const short NW = N_SIMDWIDTH;
const short SH = (C + Q); // shared memory per simdgroup in (half) const short SH = (C + Q); // shared memory per simdgroup in (half)

View file

@ -12451,6 +12451,24 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
const size_t nb = nbytes/ggml_type_size(type); const size_t nb = nbytes/ggml_type_size(type);
switch (type) { switch (type) {
case GGML_TYPE_BF16:
{
int nans = 0;
int infs = 0;
const unsigned short * f = (const unsigned short *) data;
for (size_t i = 0; i < nb; ++i) {
nans += (f[i] & 0x7fff) > 0x7f80;
infs += (f[i] & 0x7fff) == 0x7f80;
}
if (nans) {
fprintf(stderr, "%s: found %d NaNs in row of %zu BF16 values\n", __func__, nans, nb);
return false;
}
if (infs) {
fprintf(stderr, "%s: found %d infinities in row of %zu BF16 values\n", __func__, infs, nb);
return false;
}
} break;
case GGML_TYPE_F16: case GGML_TYPE_F16:
{ {
const ggml_fp16_t * f = (const ggml_fp16_t *) data; const ggml_fp16_t * f = (const ggml_fp16_t *) data;

1049
ggml.c

File diff suppressed because it is too large Load diff

20
ggml.h
View file

@ -333,14 +333,20 @@ extern "C" {
// get ggml_status name string // get ggml_status name string
GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status); GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
// ieee 754-2008 half-precision float16
// todo: make this not an integral type
typedef uint16_t ggml_fp16_t; typedef uint16_t ggml_fp16_t;
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t);
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float);
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t *, float *, int64_t);
GGML_API void ggml_fp32_to_fp16_row(const float *, ggml_fp16_t *, int64_t);
// convert FP16 <-> FP32 // google brain half-precision bfloat16
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x); typedef struct { uint16_t bits; } ggml_bf16_t;
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x); GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n); GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n); GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
struct ggml_object; struct ggml_object;
struct ggml_context; struct ggml_context;
@ -377,6 +383,7 @@ extern "C" {
GGML_TYPE_I64 = 27, GGML_TYPE_I64 = 27,
GGML_TYPE_F64 = 28, GGML_TYPE_F64 = 28,
GGML_TYPE_IQ1_M = 29, GGML_TYPE_IQ1_M = 29,
GGML_TYPE_BF16 = 30,
GGML_TYPE_COUNT, GGML_TYPE_COUNT,
}; };
@ -417,6 +424,7 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
}; };
// available tensor operations: // available tensor operations:

View file

@ -817,6 +817,7 @@ class GGMLQuantizationType(IntEnum):
I64 = 27 I64 = 27
F64 = 28 F64 = 28
IQ1_M = 29 IQ1_M = 29
BF16 = 30
class GGUFEndian(IntEnum): class GGUFEndian(IntEnum):
@ -888,6 +889,7 @@ GGML_QUANT_SIZES = {
GGMLQuantizationType.I64: (1, 8), GGMLQuantizationType.I64: (1, 8),
GGMLQuantizationType.F64: (1, 8), GGMLQuantizationType.F64: (1, 8),
GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32), GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
GGMLQuantizationType.BF16: (1, 2),
} }

View file

@ -3710,12 +3710,15 @@ Current version: 138
opmode: 4, //what mode are we in? 1=story, 2=adventure, 3=chat, 4=instruct opmode: 4, //what mode are we in? 1=story, 2=adventure, 3=chat, 4=instruct
adventure_is_action: false, //in adventure mode, determine story or action adventure_is_action: false, //in adventure mode, determine story or action
adventure_context_mod: true, //extra injection for adventure mode adventure_context_mod: true, //extra injection for adventure mode
chat_context_mod: true, //extra injection for chat mode
chatname: "You", //name to use in chat chatname: "You", //name to use in chat
chatopponent: defaultchatopponent, chatopponent: defaultchatopponent,
instruct_starttag: "\\n### Instruction:\\n", instruct_starttag: "\\n### Instruction:\\n",
instruct_endtag: "\\n### Response:\\n", instruct_endtag: "\\n### Response:\\n",
instruct_sysprompt: "",
instruct_has_markdown: true, instruct_has_markdown: true,
placeholder_tags: true, placeholder_tags: true,
render_special_tags: false,
persist_session: true, persist_session: true,
speech_synth: 0, //0 is disabled, 1000 is xtts speech_synth: 0, //0 is disabled, 1000 is xtts
xtts_voice: "female_calm", xtts_voice: "female_calm",
@ -3750,6 +3753,7 @@ Current version: 138
allow_continue_chat: false, allow_continue_chat: false,
inject_timestamps_chat: false, inject_timestamps_chat: false,
inject_timestamps_instruct: false, inject_timestamps_instruct: false,
inject_chatnames_instruct: false,
inject_jailbreak_instruct: false, inject_jailbreak_instruct: false,
idle_responses: 0, idle_responses: 0,
idle_duration: 60, idle_duration: 60,
@ -8381,6 +8385,7 @@ Current version: 138
document.getElementById("trimsentences").checked = localsettings.trimsentences; document.getElementById("trimsentences").checked = localsettings.trimsentences;
document.getElementById("trimwhitespace").checked = localsettings.trimwhitespace; document.getElementById("trimwhitespace").checked = localsettings.trimwhitespace;
document.getElementById("compressnewlines").checked = localsettings.compressnewlines; document.getElementById("compressnewlines").checked = localsettings.compressnewlines;
document.getElementById("render_special_tags").checked = localsettings.render_special_tags;
document.getElementById("eos_ban_mode").value = localsettings.eos_ban_mode; document.getElementById("eos_ban_mode").value = localsettings.eos_ban_mode;
document.getElementById("persist_session").checked = localsettings.persist_session; document.getElementById("persist_session").checked = localsettings.persist_session;
document.getElementById("opmode").value = localsettings.opmode; document.getElementById("opmode").value = localsettings.opmode;
@ -8388,6 +8393,8 @@ Current version: 138
document.getElementById("chatopponent").value = replaceAll(localsettings.chatopponent,"||$||","\n"); document.getElementById("chatopponent").value = replaceAll(localsettings.chatopponent,"||$||","\n");
handle_bot_name_onchange(); handle_bot_name_onchange();
document.getElementById("instruct_starttag").value = localsettings.instruct_starttag; document.getElementById("instruct_starttag").value = localsettings.instruct_starttag;
let sp = replaceAll(localsettings.instruct_sysprompt, "\n", "\\n");
document.getElementById("instruct_sysprompt").value = sp;
document.getElementById("instruct_endtag").value = localsettings.instruct_endtag; document.getElementById("instruct_endtag").value = localsettings.instruct_endtag;
document.getElementById("min_p").value = localsettings.min_p; document.getElementById("min_p").value = localsettings.min_p;
document.getElementById("dynatemp_range").value = localsettings.dynatemp_range; document.getElementById("dynatemp_range").value = localsettings.dynatemp_range;
@ -8436,10 +8443,12 @@ Current version: 138
document.getElementById("allow_continue_chat").checked = localsettings.allow_continue_chat; document.getElementById("allow_continue_chat").checked = localsettings.allow_continue_chat;
document.getElementById("inject_timestamps_chat").checked = localsettings.inject_timestamps_chat; document.getElementById("inject_timestamps_chat").checked = localsettings.inject_timestamps_chat;
document.getElementById("inject_timestamps_instruct").checked = localsettings.inject_timestamps_instruct; document.getElementById("inject_timestamps_instruct").checked = localsettings.inject_timestamps_instruct;
document.getElementById("inject_chatnames_instruct").checked = localsettings.inject_chatnames_instruct;
document.getElementById("inject_jailbreak_instruct").checked = localsettings.inject_jailbreak_instruct; document.getElementById("inject_jailbreak_instruct").checked = localsettings.inject_jailbreak_instruct;
document.getElementById("idle_responses").value = localsettings.idle_responses; document.getElementById("idle_responses").value = localsettings.idle_responses;
document.getElementById("idle_duration").value = localsettings.idle_duration; document.getElementById("idle_duration").value = localsettings.idle_duration;
document.getElementById("adventure_context_mod").checked = localsettings.adventure_context_mod; document.getElementById("adventure_context_mod").checked = localsettings.adventure_context_mod;
document.getElementById("chat_context_mod").checked = localsettings.chat_context_mod;
document.getElementById("instruct_has_markdown").checked = localsettings.instruct_has_markdown; document.getElementById("instruct_has_markdown").checked = localsettings.instruct_has_markdown;
document.getElementById("placeholder_tags").checked = localsettings.placeholder_tags; document.getElementById("placeholder_tags").checked = localsettings.placeholder_tags;
document.getElementById("run_in_background").checked = run_in_background; document.getElementById("run_in_background").checked = run_in_background;
@ -8653,6 +8662,8 @@ Current version: 138
} }
localsettings.chatopponent = newopps; localsettings.chatopponent = newopps;
localsettings.instruct_starttag = document.getElementById("instruct_starttag").value; localsettings.instruct_starttag = document.getElementById("instruct_starttag").value;
localsettings.instruct_sysprompt = document.getElementById("instruct_sysprompt").value;
localsettings.instruct_sysprompt = replaceAll(localsettings.instruct_sysprompt, "\\n", "\n");
if (localsettings.instruct_starttag == null || localsettings.instruct_starttag == "") { if (localsettings.instruct_starttag == null || localsettings.instruct_starttag == "") {
localsettings.instruct_starttag = "\\n### Instruction:\\n"; localsettings.instruct_starttag = "\\n### Instruction:\\n";
} }
@ -8678,6 +8689,7 @@ Current version: 138
localsettings.trimsentences = (document.getElementById("trimsentences").checked ? true : false); localsettings.trimsentences = (document.getElementById("trimsentences").checked ? true : false);
localsettings.trimwhitespace = (document.getElementById("trimwhitespace").checked ? true : false); localsettings.trimwhitespace = (document.getElementById("trimwhitespace").checked ? true : false);
localsettings.compressnewlines = (document.getElementById("compressnewlines").checked ? true : false); localsettings.compressnewlines = (document.getElementById("compressnewlines").checked ? true : false);
localsettings.render_special_tags = (document.getElementById("render_special_tags").checked ? true : false);
localsettings.eos_ban_mode = document.getElementById("eos_ban_mode").value; localsettings.eos_ban_mode = document.getElementById("eos_ban_mode").value;
localsettings.persist_session = (document.getElementById("persist_session").checked ? true : false); localsettings.persist_session = (document.getElementById("persist_session").checked ? true : false);
if(document.getElementById("opmode").value==1) if(document.getElementById("opmode").value==1)
@ -8701,10 +8713,12 @@ Current version: 138
localsettings.allow_continue_chat = (document.getElementById("allow_continue_chat").checked ? true : false); localsettings.allow_continue_chat = (document.getElementById("allow_continue_chat").checked ? true : false);
localsettings.inject_timestamps_chat = (document.getElementById("inject_timestamps_chat").checked ? true : false); localsettings.inject_timestamps_chat = (document.getElementById("inject_timestamps_chat").checked ? true : false);
localsettings.inject_timestamps_instruct = (document.getElementById("inject_timestamps_instruct").checked ? true : false); localsettings.inject_timestamps_instruct = (document.getElementById("inject_timestamps_instruct").checked ? true : false);
localsettings.inject_chatnames_instruct = (document.getElementById("inject_chatnames_instruct").checked ? true : false);
localsettings.inject_jailbreak_instruct = (document.getElementById("inject_jailbreak_instruct").checked ? true : false); localsettings.inject_jailbreak_instruct = (document.getElementById("inject_jailbreak_instruct").checked ? true : false);
localsettings.idle_responses = document.getElementById("idle_responses").value; localsettings.idle_responses = document.getElementById("idle_responses").value;
localsettings.idle_duration = document.getElementById("idle_duration").value; localsettings.idle_duration = document.getElementById("idle_duration").value;
localsettings.adventure_context_mod = (document.getElementById("adventure_context_mod").checked ? true : false); localsettings.adventure_context_mod = (document.getElementById("adventure_context_mod").checked ? true : false);
localsettings.chat_context_mod = (document.getElementById("chat_context_mod").checked ? true : false);
localsettings.instruct_has_markdown = (document.getElementById("instruct_has_markdown").checked ? true : false); localsettings.instruct_has_markdown = (document.getElementById("instruct_has_markdown").checked ? true : false);
localsettings.placeholder_tags = (document.getElementById("placeholder_tags").checked ? true : false); localsettings.placeholder_tags = (document.getElementById("placeholder_tags").checked ? true : false);
run_in_background = (document.getElementById("run_in_background").checked ? true : false); run_in_background = (document.getElementById("run_in_background").checked ? true : false);
@ -8955,12 +8969,21 @@ Current version: 138
if (document.getElementById('gui_type').value==2) { document.getElementById('btn_aesthetics').classList.remove('hidden'); } if (document.getElementById('gui_type').value==2) { document.getElementById('btn_aesthetics').classList.remove('hidden'); }
else { document.getElementById('btn_aesthetics').classList.add('hidden'); } else { document.getElementById('btn_aesthetics').classList.add('hidden'); }
} }
// Shows the shared chat/instruct name section only while
// "inject chatnames" is enabled in instruct mode.
function toggle_include_chatnames()
{
    const inject_names = document.getElementById("inject_chatnames_instruct").checked;
    // toggle(cls, force): adds 'hidden' when force is true, removes it otherwise
    document.getElementById('chatinstructsharedsection2').classList.toggle('hidden', !inject_names);
}
function toggle_opmode() { function toggle_opmode() {
document.getElementById('chatnamesection1').classList.add('hidden'); document.getElementById('chatnamesection1').classList.add('hidden');
document.getElementById('adventuresection1').classList.add('hidden'); document.getElementById('adventuresection1').classList.add('hidden');
document.getElementById('instructsection1').classList.add('hidden'); document.getElementById('instructsection1').classList.add('hidden');
document.getElementById('chatnamesection2').classList.add('hidden'); document.getElementById('chatnamesection2').classList.add('hidden');
document.getElementById('chatinstructsharedsection2').classList.add('hidden');
document.getElementById('adventuresection2').classList.add('hidden'); document.getElementById('adventuresection2').classList.add('hidden');
document.getElementById('instructsection2').classList.add('hidden'); document.getElementById('instructsection2').classList.add('hidden');
@ -8982,6 +9005,7 @@ Current version: 138
document.getElementById('gui_type').value = localsettings.gui_type_chat; document.getElementById('gui_type').value = localsettings.gui_type_chat;
document.getElementById('chatnamesection1').classList.remove('hidden'); document.getElementById('chatnamesection1').classList.remove('hidden');
document.getElementById('chatnamesection2').classList.remove('hidden'); document.getElementById('chatnamesection2').classList.remove('hidden');
document.getElementById('chatinstructsharedsection2').classList.remove('hidden');
document.getElementById('uipicker_messenger').classList.remove('hidden'); document.getElementById('uipicker_messenger').classList.remove('hidden');
document.getElementById('uipicker_aesthetic').classList.remove('hidden'); document.getElementById('uipicker_aesthetic').classList.remove('hidden');
} }
@ -8990,6 +9014,7 @@ Current version: 138
document.getElementById('instructsection1').classList.remove('hidden'); document.getElementById('instructsection1').classList.remove('hidden');
document.getElementById('instructsection2').classList.remove('hidden'); document.getElementById('instructsection2').classList.remove('hidden');
document.getElementById('uipicker_aesthetic').classList.remove('hidden'); document.getElementById('uipicker_aesthetic').classList.remove('hidden');
toggle_include_chatnames();
} }
//deselect invalid //deselect invalid
@ -9973,10 +9998,15 @@ Current version: 138
if(newgen != "") if(newgen != "")
{ {
if(localsettings.inject_chatnames_instruct)
{
newgen = localsettings.chatname + ": " + newgen;
}
if(localsettings.inject_timestamps_instruct) if(localsettings.inject_timestamps_instruct)
{ {
newgen = "["+(new Date().toLocaleTimeString([], {year: 'numeric', month: 'numeric', day: 'numeric', hour: '2-digit', minute: '2-digit'}))+"] " + newgen; newgen = "["+(new Date().toLocaleTimeString([], {year: 'numeric', month: 'numeric', day: 'numeric', hour: '2-digit', minute: '2-digit'}))+"] " + newgen;
} }
//append instruction for instruct mode //append instruction for instruct mode
newgen = ist + newgen + iet; newgen = ist + newgen + iet;
@ -10175,7 +10205,7 @@ Current version: 138
if(trimmed!=""){ co = trimmed; } if(trimmed!=""){ co = trimmed; }
} }
if (current_anote.length == 0 && current_memory.length == 0) { if (localsettings.chat_context_mod && current_anote.length == 0 && current_memory.length == 0 && current_wi.length == 0) {
if (gametext_arr.length > 0 && gametext_arr[0].startsWith("\n" + me + ": ")) { if (gametext_arr.length > 0 && gametext_arr[0].startsWith("\n" + me + ": ")) {
let injected = "[The following is an interesting chat message log between " + me + " and " + co + ".]\n\n" + localsettings.chatname + ": Hi.\n" + co + ": Hello."; let injected = "[The following is an interesting chat message log between " + me + " and " + co + ".]\n\n" + localsettings.chatname + ": Hi.\n" + co + ": Hello.";
if(co=="") if(co=="")
@ -10227,16 +10257,23 @@ Current version: 138
} }
if (localsettings.opmode == 4) { if (localsettings.opmode == 4)
{
if(localsettings.inject_timestamps_instruct && pending_context_preinjection=="" && truncated_context!="") if (pending_context_preinjection == "" && truncated_context != "")
{ {
let endmatcher = (localsettings.placeholder_tags?instructendplaceholder:get_instruct_endtag(false)); let endmatcher = (localsettings.placeholder_tags ? instructendplaceholder : get_instruct_endtag(false));
if (truncated_context.toLowerCase().trim().endsWith(endmatcher.toLowerCase().trim())) {
if(truncated_context.toLowerCase().trim().endsWith(endmatcher.toLowerCase().trim())) if (localsettings.inject_timestamps_instruct) {
{ pending_context_preinjection += "[" + (new Date().toLocaleTimeString([], { year: 'numeric', month: 'numeric', day: 'numeric', hour: '2-digit', minute: '2-digit' })) + "]";
pending_context_preinjection += "["+(new Date().toLocaleTimeString([], {year: 'numeric', month: 'numeric', day: 'numeric', hour: '2-digit', minute: '2-digit'}))+"]"; }
if (localsettings.inject_chatnames_instruct && localsettings.chatopponent!="") {
if (localsettings.inject_timestamps_instruct) {
pending_context_preinjection += " ";
}
pending_context_preinjection += localsettings.chatopponent + ":";
}
} }
} }
truncated_context += pending_context_preinjection; truncated_context += pending_context_preinjection;
} }
@ -10258,7 +10295,13 @@ Current version: 138
//memory is allowed to be up to 0.8 times of ctx allowance, anote up to 0.6 times //memory is allowed to be up to 0.8 times of ctx allowance, anote up to 0.6 times
let max_mem_len = Math.floor(max_allowed_characters*0.8); let max_mem_len = Math.floor(max_allowed_characters*0.8);
let max_anote_len = Math.floor(max_allowed_characters*0.6); let max_anote_len = Math.floor(max_allowed_characters*0.6);
let truncated_memory = substring_to_boundary(current_memory, max_mem_len); let appendedsysprompt = "";
if(localsettings.opmode==4 && localsettings.instruct_sysprompt!="")
{
max_mem_len = Math.floor(max_allowed_characters*0.7);
appendedsysprompt = get_instruct_starttag(false)+" "+localsettings.instruct_sysprompt + "\n";
}
let truncated_memory = appendedsysprompt + substring_to_boundary(current_memory, max_mem_len);
if (truncated_memory != null && truncated_memory != "") { if (truncated_memory != null && truncated_memory != "") {
if(newlineaftermemory) if(newlineaftermemory)
{ {
@ -10603,6 +10646,7 @@ Current version: 138
submit_payload.params.dynatemp_exponent = localsettings.dynatemp_exponent; submit_payload.params.dynatemp_exponent = localsettings.dynatemp_exponent;
submit_payload.params.smoothing_factor = localsettings.smoothing_factor; submit_payload.params.smoothing_factor = localsettings.smoothing_factor;
submit_payload.params.banned_tokens = get_token_bans(); submit_payload.params.banned_tokens = get_token_bans();
submit_payload.params.render_special = localsettings.render_special_tags;
} }
//presence pen and logit bias for OAI and newer kcpp //presence pen and logit bias for OAI and newer kcpp
if((custom_kobold_endpoint != "" && is_using_kcpp_with_mirostat()) || custom_oai_endpoint!="") if((custom_kobold_endpoint != "" && is_using_kcpp_with_mirostat()) || custom_oai_endpoint!="")
@ -12190,7 +12234,7 @@ Current version: 138
if (idle_timer > idle_timer_max) { if (idle_timer > idle_timer_max) {
idle_timer = 0; idle_timer = 0;
let nextcounter = ++idle_triggered_counter; let nextcounter = ++idle_triggered_counter;
if(localsettings.opmode == 4) if(localsettings.opmode == 4) //handle idle messages
{ {
if (!localsettings.placeholder_tags) { if (!localsettings.placeholder_tags) {
pending_context_preinjection = get_instruct_endtag(false); pending_context_preinjection = get_instruct_endtag(false);
@ -12873,6 +12917,18 @@ Current version: 138
fulltxt = replaceAll(fulltxt, `%SpcStg%`, `<hr class="hr_instruct"><span class="color_cyan"><img src="`+human_square+`" style="height:38px;width:auto;padding:3px 6px 3px 3px;border-radius: 8%;"/>`); fulltxt = replaceAll(fulltxt, `%SpcStg%`, `<hr class="hr_instruct"><span class="color_cyan"><img src="`+human_square+`" style="height:38px;width:auto;padding:3px 6px 3px 3px;border-radius: 8%;"/>`);
fulltxt = replaceAll(fulltxt, `%SpcEtg%`, `</span><hr class="hr_instruct"><img src="`+niko_square+`" style="height:38px;width:auto;padding:3px 6px 3px 3px;border-radius: 8%;"/>`); fulltxt = replaceAll(fulltxt, `%SpcEtg%`, `</span><hr class="hr_instruct"><img src="`+niko_square+`" style="height:38px;width:auto;padding:3px 6px 3px 3px;border-radius: 8%;"/>`);
//apply stylization to time tags
if(localsettings.inject_timestamps_instruct && localsettings.instruct_has_markdown)
{
fulltxt = fulltxt.replace(/(\[\d{1,2}\/\d{1,2}\/\d{4}, \d{1,2}:\d{2} [AP]M\])/g, "$1\n");
}
if(localsettings.inject_chatnames_instruct && localsettings.instruct_has_markdown)
{
let m_name = localsettings.chatname + ": ";
let m_opp = localsettings.chatopponent + ": ";
fulltxt = replaceAll(fulltxt, m_name, `<b>` + escapeHtml(m_name) + `</b>`);
fulltxt = replaceAll(fulltxt, m_opp, `<b>` + escapeHtml(m_opp) + `</b>`);
}
}else{ }else{
fulltxt = replaceAll(fulltxt, get_instruct_starttag(true), `%SclStg%`+escapeHtml(get_instruct_starttag(true))+`%SpnEtg%`); fulltxt = replaceAll(fulltxt, get_instruct_starttag(true), `%SclStg%`+escapeHtml(get_instruct_starttag(true))+`%SpnEtg%`);
fulltxt = replaceAll(fulltxt, get_instruct_endtag(true), `%SclStg%`+escapeHtml(get_instruct_endtag(true))+`%SpnEtg%`); fulltxt = replaceAll(fulltxt, get_instruct_endtag(true), `%SclStg%`+escapeHtml(get_instruct_endtag(true))+`%SpnEtg%`);
@ -15149,50 +15205,51 @@ Current version: 138
</div> </div>
<div id="chatnamesection1" class="settinglabel hidden" style="padding-top: 3px;"> <div id="chatnamesection1" class="settinglabel hidden" style="padding-top: 3px;">
<div class="settinglabel">
<div class="justifyleft settingsmall">Chat PrePrompt <span class="helpicon">?<span
class="helptext">Modifies the context, injecting tokens to improve chat quality for new chats.</span></span> </div>
<input type="checkbox" id="adventure_context_mod" style="margin:0px 0 0;">
</div>
<div class="settinglabel">
<div class="justifyleft settingsmall">Inject Timestamps <span class="helpicon">?<span
class="helptext">Injects timestamps into context, allowing the AI to have a sense of time.</span></span></div>
<input type="checkbox" id="inject_timestamps_chat" style="margin:0px 0 0;">
</div>
</div> </div>
<div id="adventuresection1" class="settinglabel hidden" style="padding-top: 3px;"> <div id="adventuresection1" class="settinglabel hidden" style="padding-top: 3px;">
<div class="settinglabel"> <div class="settinglabel">
<div class="justifyleft settingsmall">Adventure Prompt <span class="helpicon">?<span <div class="justifyleft settingsmall">Adventure PrePrompt <span class="helpicon">?<span
class="helptext">Modifies the context, injecting tokens to improve adventure quality for adventure mode.</span></span> </div> class="helptext">Modifies the context, injecting tokens to improve adventure quality for new adventures.</span></span> </div>
<input type="checkbox" id="adventure_context_mod" style="margin:0px 0 0;"> <input type="checkbox" id="chat_context_mod" style="margin:0px 0 0;">
</div> </div>
</div> </div>
<div id="instructsection1" class="settinglabel hidden" style="padding-top: 3px;"> <div id="instructsection1" class="settinglabel hidden" style="padding-top: 3px;">
<div class="justifyleft settingsmall">Enable Markdown <span class="helpicon">?<span
class="helptext">Allows the UI to use markdown formatting such as quotes and code blocks.</span></span>
<input type="checkbox" id="instruct_has_markdown" style="margin:0px 0 0;">
</div>
<div class="settinglabel">
<div class="justifyleft settingsmall">Inject Timestamps <span class="helpicon">?<span
class="helptext">Injects timestamps into context, allowing the AI to have a sense of time.</span></span></div>
<input type="checkbox" id="inject_timestamps_instruct" style="margin:0px 0 0;">
</div>
<div class="settinglabel">
<div class="justifyleft settingsmall">Inject ChatNames <span class="helpicon">?<span
class="helptext">Appends chat names after every instruct tag, a hybrid chat mode.</span></span></div>
<input type="checkbox" id="inject_chatnames_instruct" style="margin:0px 0 0;" onchange="toggle_include_chatnames()">
</div>
<div class="settinglabel">
<div class="justifyleft settingsmall">Assistant Jailbreak <span class="helpicon">?<span
class="helptext">Automatically injects a jailbreak message after every query to make the AI more likely to obey you.</span></span></div>
<input type="checkbox" id="inject_jailbreak_instruct" style="margin:0px 0 0;">
</div>
</div> </div>
</div> </div>
</div> </div>
<div class="settingitem"> <div class="settingitem">
<div class="settinglabel"> <div class="settinglabel">
<div id="chatnamesection2" class="settinglabel hidden" style="padding-top: 3px;">
<table class="settingsmall text-center" style="border-spacing: 4px 2px; border-collapse: separate;">
<tr>
<th>Your Name</th>
<th>AI Name <span class="helpicon">?<span class="helptext">The name of the person you want to chat with. Multiple opponents can be specified, creating a group chat, separate their names using multiple lines.</span></span></th>
</tr>
<tr>
<td style="vertical-align: top;"><input class="settinglabel miniinput" type="text" placeholder="(Enter Name)" value="" id="chatname" title="The name that you will be chatting as"></td>
<td style="vertical-align: top;"><textarea class="settinglabel miniinput" style="resize: none;overflow:hidden;" id="chatopponent" placeholder="(Auto)" rows="1" wrap="off" title="The name of the person you want to chat with" oninput="handle_bot_name_input()" onchange="handle_bot_name_onchange()"></textarea></td>
</tr>
</table>
<div class="settinglabel">
<div class="justifyleft settingsmall">Multiline Replies <span class="helpicon">?<span
class="helptext">Whether to allow multiple lines in AI responses. Disable this if the AI starts generating rubbish.</span></span> </div>
<input type="checkbox" id="multiline_replies" style="margin:0px 0 0;">
</div>
<div class="settinglabel">
<div class="justifyleft settingsmall">Continue Bot Replies <span class="helpicon">?<span
class="helptext">Allow incomplete AI chat replies, which can be continued by pressing submit again. Not recommended for newbies.</span></span></div>
<input type="checkbox" id="allow_continue_chat" style="margin:0px 0 0;">
</div>
<div class="settinglabel">
<div class="justifyleft settingsmall">Inject Timestamps <span class="helpicon">?<span
class="helptext">Injects timestamps into context, allowing the AI to have a sense of time.</span></span></div>
<input type="checkbox" id="inject_timestamps_chat" style="margin:0px 0 0;">
</div>
</div>
<div id="adventuresection2" class="settinglabel hidden" style="padding-top: 3px;"> <div id="adventuresection2" class="settinglabel hidden" style="padding-top: 3px;">
<div class="settinglabel"> <div class="settinglabel">
<div class="justifyleft settingsmall">Multiline Replies <span class="helpicon">?<span <div class="justifyleft settingsmall">Multiline Replies <span class="helpicon">?<span
@ -15214,6 +15271,10 @@ Current version: 138
<option value="8">CommandR</option> <option value="8">CommandR</option>
<option value="9">Llama 3 Chat</option> <option value="9">Llama 3 Chat</option>
</select> </select>
<div class="settingsmall miniinput" style="width:100%;padding:2px">
<div class="justifyleft settingsmall">Sys. Prompt <span class="helpicon">?<span class="helptext">A system pre-prompt sent at the very start to guide the AI behavior. Usually NOT needed.</span></span></div>
<input class="settinglabel miniinput" type="text" placeholder="(Optional)" value="" id="instruct_sysprompt">
</div>
<table class="settingsmall text-center" style="border-spacing: 3px 2px; border-collapse: separate;"> <table class="settingsmall text-center" style="border-spacing: 3px 2px; border-collapse: separate;">
<tr> <tr>
<th>Start Seq.<span class="helpicon">?<span class="helptext">The sequence to start an instruction prompt</span></span></th> <th>Start Seq.<span class="helpicon">?<span class="helptext">The sequence to start an instruction prompt</span></span></th>
@ -15224,19 +15285,29 @@ Current version: 138
<td><input class="settinglabel miniinput" type="text" placeholder="\\n### Response:\\n" value="" id="instruct_endtag" onchange="edit_instruct_tag_format()" title="The sequence to end an instruction prompt"></td> <td><input class="settinglabel miniinput" type="text" placeholder="\\n### Response:\\n" value="" id="instruct_endtag" onchange="edit_instruct_tag_format()" title="The sequence to end an instruction prompt"></td>
</tr> </tr>
</table> </table>
</div>
<div class="justifyleft settingsmall">Enable Markdown <span class="helpicon">?<span <div id="chatinstructsharedsection2" class="settinglabel hidden" style="padding-top: 3px;">
class="helptext">Allows the UI to use markdown formatting such as quotes and code blocks.</span></span> </div> <table class="settingsmall text-center" style="border-spacing: 4px 2px; border-collapse: separate;">
<input type="checkbox" id="instruct_has_markdown" style="margin:0px 0 0;"> <tr>
<th>Your Name</th>
<th>AI Name <span class="helpicon">?<span class="helptext">Name of the person(s) you want to chat with. Multiple opponents can be specified, creating a group chat, separate their names using multiple lines.</span></span></th>
</tr>
<tr>
<td style="vertical-align: top;"><input class="settinglabel miniinput" style="height:18px;" type="text" placeholder="(Enter Name)" value="" id="chatname" title="The name that you will be chatting as"></td>
<td style="vertical-align: top;"><textarea class="settinglabel miniinput" style="resize: none;overflow:hidden;" id="chatopponent" placeholder="(Auto)" rows="1" wrap="off" title="The name of the person you want to chat with" oninput="handle_bot_name_input()" onchange="handle_bot_name_onchange()"></textarea></td>
</tr>
</table>
</div>
<div id="chatnamesection2" class="settinglabel hidden" style="padding-top: 3px;">
<div class="settinglabel"> <div class="settinglabel">
<div class="justifyleft settingsmall">Inject Timestamps <span class="helpicon">?<span <div class="justifyleft settingsmall">Multiline Replies <span class="helpicon">?<span
class="helptext">Injects timestamps into context, allowing the AI to have a sense of time.</span></span></div> class="helptext">Whether to allow multiple lines in AI responses. Disable this if the AI starts generating rubbish.</span></span> </div>
<input type="checkbox" id="inject_timestamps_instruct" style="margin:0px 0 0;"> <input type="checkbox" id="multiline_replies" style="margin:0px 0 0;">
</div> </div>
<div class="settinglabel"> <div class="settinglabel">
<div class="justifyleft settingsmall">Assistant Jailbreak <span class="helpicon">?<span <div class="justifyleft settingsmall">Continue Bot Replies <span class="helpicon">?<span
class="helptext">Automatically injects a jailbreak message after every query to make the AI more likely to obey you.</span></span></div> class="helptext">Allow incomplete AI chat replies, which can be continued by pressing submit again. Not recommended for newbies.</span></span></div>
<input type="checkbox" id="inject_jailbreak_instruct" style="margin:0px 0 0;"> <input type="checkbox" id="allow_continue_chat" style="margin:0px 0 0;">
</div> </div>
</div> </div>
</div> </div>
@ -15513,6 +15584,11 @@ Current version: 138
class="helptext">If enabled, uses universal {{user}} and {{[INPUT]}} placeholders that get swapped on submit. If disabled, uses plaintext chat or instruct tags verbatim.</span></span></div> class="helptext">If enabled, uses universal {{user}} and {{[INPUT]}} placeholders that get swapped on submit. If disabled, uses plaintext chat or instruct tags verbatim.</span></span></div>
<input type="checkbox" id="placeholder_tags" style="margin:0px 0px 0px auto;"> <input type="checkbox" id="placeholder_tags" style="margin:0px 0px 0px auto;">
</div> </div>
<div class="settinglabel">
<div class="justifyleft settingsmall">Render Sp.Tags <span class="helpicon">?<span
class="helptext">If enabled, renders special tags like EOS and padding tokens. Not recommended.</span></span></div>
<input type="checkbox" id="render_special_tags" style="margin:0px 0px 0px auto;">
</div>
<div class="settinglabel"> <div class="settinglabel">
<div class="justifyleft settingsmall">Run In Background <span class="helpicon">?<span <div class="justifyleft settingsmall">Run In Background <span class="helpicon">?<span
class="helptext">Prevents the browser from suspending Kobold Lite by playing a silent audio track. This setting cannot be saved.</span></span></div> class="helptext">Prevents the browser from suspending Kobold Lite by playing a silent audio track. This setting cannot be saved.</span></span></div>

View file

@ -3206,6 +3206,7 @@ struct llama_model_loader {
switch (type_max) { switch (type_max) {
case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break; case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break; case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break;
case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break; case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break; case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break; case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
@ -3710,6 +3711,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
switch (ftype) { switch (ftype) {
case LLAMA_FTYPE_ALL_F32: return "all F32"; case LLAMA_FTYPE_ALL_F32: return "all F32";
case LLAMA_FTYPE_MOSTLY_F16: return "F16"; case LLAMA_FTYPE_MOSTLY_F16: return "F16";
case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0"; case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1"; case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
@ -6199,6 +6201,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|| !( || !(
model.ftype == LLAMA_FTYPE_ALL_F32 || model.ftype == LLAMA_FTYPE_ALL_F32 ||
model.ftype == LLAMA_FTYPE_MOSTLY_F16 || model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
) )
@ -14473,13 +14476,16 @@ static void llama_tensor_dequantize_internal(
if (qtype.to_float == NULL) { if (qtype.to_float == NULL) {
throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type))); throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
} }
} else if (tensor->type != GGML_TYPE_F16) { } else if (tensor->type != GGML_TYPE_F16 &&
tensor->type != GGML_TYPE_BF16) {
throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type))); throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
} }
if (nthread < 2) { if (nthread < 2) {
if (tensor->type == GGML_TYPE_F16) { if (tensor->type == GGML_TYPE_F16) {
ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements); ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
} else if (tensor->type == GGML_TYPE_BF16) {
ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
} else if (ggml_is_quantized(tensor->type)) { } else if (ggml_is_quantized(tensor->type)) {
qtype.to_float(tensor->data, f32_output, nelements); qtype.to_float(tensor->data, f32_output, nelements);
} else { } else {
@ -14488,7 +14494,14 @@ static void llama_tensor_dequantize_internal(
return; return;
} }
size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type); size_t block_size;
if (tensor->type == GGML_TYPE_F16 ||
tensor->type == GGML_TYPE_BF16) {
block_size = 1;
} else {
block_size = (size_t)ggml_blck_size(tensor->type);
}
size_t block_size_bytes = ggml_type_size(tensor->type); size_t block_size_bytes = ggml_type_size(tensor->type);
GGML_ASSERT(nelements % block_size == 0); GGML_ASSERT(nelements % block_size == 0);
@ -14507,6 +14520,8 @@ static void llama_tensor_dequantize_internal(
auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) { auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
if (typ == GGML_TYPE_F16) { if (typ == GGML_TYPE_F16) {
ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels); ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
} else if (typ == GGML_TYPE_BF16) {
ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
} else { } else {
qtype.to_float(inbuf, outbuf, nels); qtype.to_float(inbuf, outbuf, nels);
} }
@ -14867,6 +14882,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break; case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break; case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break; case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break;
case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break; case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;
// K-quants // K-quants

View file

@ -137,6 +137,7 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
}; };