Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	examples/parallel/parallel.cpp
2026-05-15 01:15:34 +00:00 · 2025-07-18 23:49:45 +08:00 · 2025-07-18 23:49:45 +08:00 · b0b7a07b34
commit b0b7a07b34
parent b8e3280432 2adf8d83ac
18 changed files with 1070 additions and 600 deletions
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@ -843,6 +843,9 @@ class TextModel(ModelBase):
        if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
            # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
            res = "lfm2"
+        if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
+            # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
+            res = "exaone4"

        if res is None:
            logger.warning("\n")
@ -6780,6 +6783,75 @@ class ExaoneModel(TextModel):
                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))


+@ModelBase.register("Exaone4ForCausalLM")  # presumably binds the HF architecture name "Exaone4ForCausalLM" to this converter - confirm against ModelBase.register
+class Exaone4Model(TextModel):  # converter that writes vocab, hyper-parameters and optional rope factors for gguf.MODEL_ARCH.EXAONE4
+    model_arch = gguf.MODEL_ARCH.EXAONE4
+
+    def set_vocab(self):  # writes tokenizer model "gpt2", the pre-tokenizer id, token list/types and the special vocab (merges loaded)
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):  # base parameters plus vocab size, sliding-window metadata and linear rope scaling
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if hparams.get("sliding_window") is not None:  # sliding-window metadata is skipped when the key is missing or None
+            self.gguf_writer.add_sliding_window(hparams["sliding_window"])
+            if "layer_types" in hparams:  # preferred source: one flag per entry, True where the type is "sliding_attention"
+                self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in hparams["layer_types"]])
+            elif "sliding_window_pattern" in hparams:  # fallback: expand a compact pattern to one flag per hidden layer
+                sliding_window_pattern = []
+                if isinstance(hparams["sliding_window_pattern"], str):  # e.g. LLLG
+                    for i in range(hparams["num_hidden_layers"]):  # the string is cycled over the layers; "L" yields True
+                        sliding_window_pattern.append(hparams["sliding_window_pattern"][i % len(hparams["sliding_window_pattern"])] == "L")
+                if isinstance(hparams["sliding_window_pattern"], int):  # e.g. 4
+                    for i in range(hparams["num_hidden_layers"]):  # every n-th layer (1-based) yields False, all others True
+                        sliding_window_pattern.append((i + 1) % hparams["sliding_window_pattern"] != 0)
+                if len(sliding_window_pattern) == hparams["num_hidden_layers"]:  # written only if one of the branches above filled the list
+                    self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
+
+        rope_scaling = self.hparams.get("rope_scaling") or {}  # a missing or None entry is treated as an empty dict
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:  # "rope_type" with fallback to the "type" key
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:  # yields a single ROPE_FREQS tensor, and only for rope type "llama3"
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):  # nothing is yielded when rope_scaling is absent or falsy
+            if rope_scaling.get("rope_type", '').lower() == "llama3":  # note: only "rope_type" is read here, with no fallback to "type"
+                base = self.hparams.get("rope_theta", 10_000.0)
+                if (dim := self.hparams.get("head_dim")) is None:  # head_dim falls back to hidden_size // num_attention_heads
+                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))  # inverse frequencies for even indices 0, 2, ..., < dim
+
+                factor = rope_scaling.get("factor", 16.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)  # read from the top-level hparams, not from rope_scaling
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+
+                rope_factors = []
+                for freq in freqs:  # one factor per frequency, chosen by comparing its wavelength with the two thresholds
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:  # below the high-frequency threshold: factor 1
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:  # above the low-frequency threshold: the full factor
+                        rope_factors.append(factor)
+                    else:  # in between: blend 1/factor and 1 by `smooth`, then invert
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+
+
@ModelBase.register("GraniteForCausalLM")
 class GraniteModel(LlamaModel):
    """Conversion for IBM's GraniteForCausalLM"""
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@ -129,6 +129,7 @@ models = [
    {"name": "a.x-4.0",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
    {"name": "midm-2.0",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
    {"name": "lfm2",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
+    {"name": "exaone4",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
--- a/ggml/src/ggml-cuda/cpy-utils.cuh
+++ b/ggml/src/ggml-cuda/cpy-utils.cuh
@ -0,0 +1,251 @@
+#pragma once
+
+#include "ggml-common.h"
+
+static __device__ __forceinline__ void convert_f32_f32(const float * src, float * dst) { // copies one float from *src to *dst unchanged
+    *dst = *src;
+}
+
+static __device__ __forceinline__ void convert_f32_f16(const float * src, half * dst) { // stores one float as half, converted with __float2half
+    *dst = __float2half(*src);
+}
+
+static __device__ __forceinline__ void convert_f32_bf16(const float * src, nv_bfloat16 * dst) { // stores one float as nv_bfloat16
+    *dst = *src; // no explicit intrinsic: the conversion is the implicit one performed by this assignment
+}
+
+static __device__ __forceinline__ void convert_f16_f16(const half * src, half * dst) { // copies one half from *src to *dst unchanged
+    *dst = *src;
+}
+
+static __device__ __forceinline__ void convert_f16_f32(const half * src, float * dst) { // stores one half as float
+    *dst = *src; // no explicit intrinsic: the conversion is the implicit one performed by this assignment
+}
+
+static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) { // returns the index in val[0..n-1] chosen as closest to x
+    if (x <= val[0]) return 0; // at or below the first entry: clamp to index 0
+    if (x >= val[n-1]) return n-1; // at or above the last entry: clamp to index n-1
+    int ml = 0, mu = n-1;
+    while (mu-ml > 1) { // bisection until ml and mu are adjacent; assumes val is sorted ascending - TODO confirm at the definition of the table
+        int mav = (ml+mu)/2;
+        if (x < val[mav]) mu = mav; else ml = mav;
+    }
+    return x - val[mu-1] < val[mu] - x ? mu-1 : mu; // nearer of the two neighbours; an exact tie selects mu
+}
+
+static __device__ void quantize_f32_q4_0_block(const float * __restrict__ x, block_q4_0 * __restrict__ y) { // quantizes QK4_0 floats from x into one block_q4_0 at y
+    float amax = 0.0f;
+    float vmax = 0.0f;
+
+    for (int j = 0; j < QK4_0; ++j) { // find the element with the largest magnitude and keep its signed value in vmax
+        const float v = x[j];
+        if (amax < fabsf(v)) {
+            amax = fabsf(v);
+            vmax = v;
+        }
+    }
+
+    const float d  = vmax / -8; // block scale
+    const float id = d ? 1.0f/d : 0.0f; // inverse scale; 0 when d == 0 so the division is never by zero
+
+    y->d = d;
+
+    for (int j = 0; j < QK4_0/2; ++j) { // each output byte packs element j and element j + QK4_0/2
+        const float x0 = x[0       + j]*id;
+        const float x1 = x[QK4_0/2 + j]*id;
+
+        const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f)); // offset by 8.5, truncate via the int8_t cast, cap at 15
+        const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));
+
+        y->qs[j]  = xi0; // low nibble
+        y->qs[j] |= xi1 << 4; // high nibble
+    }
+}
+
+static __device__ void quantize_f32_q4_1_block(const float * __restrict__ x, block_q4_1 * __restrict__ y) { // quantizes QK4_1 floats from x into one block_q4_1 at y
+    float vmin = FLT_MAX;
+    float vmax = -FLT_MAX;
+
+    for (int j = 0; j < QK4_1; ++j) { // minimum and maximum of the input values
+        const float v = x[j];
+        if (v < vmin) vmin = v;
+        if (v > vmax) vmax = v;
+    }
+
+    const float d  = (vmax - vmin) / ((1 << 4) - 1); // value range divided by 15
+    const float id = d ? 1.0f/d : 0.0f; // inverse scale; 0 when d == 0 so the division is never by zero
+
+    y->dm.x = d; // scale is stored in dm.x
+    y->dm.y = vmin; // minimum is stored in dm.y
+
+    for (int j = 0; j < QK4_1/2; ++j) { // each output byte packs element j and element j + QK4_1/2
+        const float x0 = (x[0       + j] - vmin)*id;
+        const float x1 = (x[QK4_1/2 + j] - vmin)*id;
+
+        const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f)); // add 0.5, truncate via the int8_t cast, cap at 15
+        const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
+
+        y->qs[j]  = xi0; // low nibble
+        y->qs[j] |= xi1 << 4; // high nibble
+    }
+}
+
+static __device__ void quantize_f32_q5_0_block(const float * __restrict__ x, block_q5_0 * __restrict__ y) { // quantizes QK5_0 floats from x into one block_q5_0 at y
+    float amax = 0.0f;
+    float vmax = 0.0f;
+
+    for (int j = 0; j < QK5_0; ++j) { // find the element with the largest magnitude and keep its signed value in vmax
+        const float v = x[j];
+        if (amax < fabsf(v)) {
+            amax = fabsf(v);
+            vmax = v;
+        }
+    }
+
+    const float d  = vmax / -16; // block scale
+    const float id = d ? 1.0f/d : 0.0f; // inverse scale; 0 when d == 0 so the division is never by zero
+
+    y->d = d;
+
+    uint32_t qh = 0; // collects bit 4 of every quantized value
+    for (int j = 0; j < QK5_0/2; ++j) {
+        const float x0 = x[0       + j]*id;
+        const float x1 = x[QK5_0/2 + j]*id;
+
+        const uint8_t xi0 = min(31, (int8_t)(x0 + 16.5f)); // offset by 16.5, truncate via the int8_t cast, cap at 31
+        const uint8_t xi1 = min(31, (int8_t)(x1 + 16.5f));
+
+        y->qs[j]  = (xi0 & 0xf) | ((xi1 & 0xf) << 4); // low four bits of both values share one byte
+        qh |= ((xi0 & 0x10u) >> 4) << (j + 0); // bit 4 of the first-half value goes to qh bit j
+        qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2); // bit 4 of the second-half value goes to qh bit j + QK5_0/2
+    }
+    memcpy(y->qh, &qh, sizeof(qh)); // qh is written into y->qh with memcpy rather than a typed store
+}
+
+static __device__ void quantize_f32_q5_1_block(const float * __restrict__ x, block_q5_1 * __restrict__ y) { // quantizes QK5_1 floats from x into one block_q5_1 at y
+    float min = x[0]; // note: these locals are named min/max, so min()/max() are not called in this function
+    float max = x[0];
+
+    for (int j = 1; j < QK5_1; ++j) { // minimum and maximum of the input values (element 0 seeds both)
+        const float v = x[j];
+        min = v < min ? v : min;
+        max = v > max ? v : max;
+    }
+
+    const float d  = (max - min) / 31; // value range divided by 31
+    const float id = d ? 1.0f/d : 0.0f; // inverse scale; 0 when d == 0 so the division is never by zero
+
+    y->dm.x = d; // scale is stored in dm.x
+    y->dm.y = min; // minimum is stored in dm.y
+
+    uint32_t qh = 0; // collects bit 4 of every quantized value
+    for (int j = 0; j < QK5_1/2; ++j) {
+        const float x0 = (x[0       + j] - min)*id;
+        const float x1 = (x[QK5_1/2 + j] - min)*id;
+
+        const uint8_t xi0 = (uint8_t)(x0 + 0.5f); // add 0.5 and truncate; no upper cap is applied here
+        const uint8_t xi1 = (uint8_t)(x1 + 0.5f);
+
+        y->qs[j]  = (xi0 & 0xf) | ((xi1 & 0xf) << 4); // low four bits of both values share one byte
+        qh |= ((xi0 & 0x10u) >> 4) << (j + 0); // bit 4 of the first-half value goes to qh bit j
+        qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2); // bit 4 of the second-half value goes to qh bit j + QK5_1/2
+    }
+    memcpy(y->qh, &qh, sizeof(qh)); // qh is written into y->qh with memcpy rather than a typed store
+}
+
+static __device__ void quantize_f32_q8_0_block(const float * __restrict__ x, block_q8_0 * __restrict__ y) { // quantizes QK8_0 floats from x into one block_q8_0 at y
+    float amax = 0.0f; // absolute max
+
+    for (int j = 0; j < QK8_0; j++) {
+        const float v = x[j];
+        amax = fmaxf(amax, fabsf(v));
+    }
+
+    const float d = amax / ((1 << 7) - 1); // largest magnitude divided by 127
+    const float id = d ? 1.0f/d : 0.0f; // inverse scale; 0 when d == 0 so the division is never by zero
+
+    y->d = d;
+
+    for (int j = 0; j < QK8_0; ++j) { // one quantized value per input element
+        const float x0 = x[j]*id;
+        y->qs[j] = roundf(x0); // rounded with roundf, then converted implicitly by the assignment
+    }
+}
+
+static __device__ void quantize_f32_iq4_nl_block(const float * __restrict__ x, block_iq4_nl * __restrict__ y) { // quantizes QK4_NL floats from x into one block_iq4_nl at y
+    float amax = 0.0f;
+    float vmax = 0.0f;
+
+    for (int j = 0; j < QK4_NL; ++j) { // find the element with the largest magnitude and keep its signed value in vmax
+        const float v = x[j];
+        if (amax < fabsf(v)) {
+            amax = fabsf(v);
+            vmax = v;
+        }
+    }
+
+    float d = vmax / kvalues_iq4nl[0]; // initial scale from the first table entry; kvalues_iq4nl is defined outside this file
+    const float id = d ? 1.0f/d : 0.0f; // inverse scale; 0 when d == 0 so the division is never by zero
+
+    float sumqx = 0, sumq2 = 0; // accumulators used to recompute the stored scale below
+    for (int j = 0; j < QK4_NL/2; ++j) { // each output byte packs element j and element j + QK4_NL/2
+        const float x0 = x[0        + j]*id;
+        const float x1 = x[QK4_NL/2 + j]*id;
+        const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0); // table index selected among the first 16 entries
+        const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1);
+        y->qs[j] = xi0 | (xi1 << 4); // low nibble: first-half index, high nibble: second-half index
+        const float v0 = kvalues_iq4nl[xi0];
+        const float v1 = kvalues_iq4nl[xi1];
+        const float w0 = x[0        + j]*x[0        + j]; // weight of each element is its own square
+        const float w1 = x[QK4_NL/2 + j]*x[QK4_NL/2 + j];
+        sumqx += w0*v0*x[j] + w1*v1*x[QK4_NL/2 + j];
+        sumq2 += w0*v0*v0 + w1*v1*v1;
+    }
+
+    y->d = sumq2 > 0 ? sumqx/sumq2 : d; // stored scale is sumqx/sumq2; the initial d is kept only when sumq2 is not positive
+}
+
+// Wrapper functions for cpy.cu compatibility
+static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) { // byte-pointer adapter: casts the arguments and forwards to quantize_f32_q4_0_block
+    quantize_f32_q4_0_block((const float *)cxi, (block_q4_0 *)cdsti);
+}
+
+static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) { // byte-pointer adapter: casts the arguments and forwards to quantize_f32_q4_1_block
+    quantize_f32_q4_1_block((const float *)cxi, (block_q4_1 *)cdsti);
+}
+
+static __device__ void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) { // byte-pointer adapter: casts the arguments and forwards to quantize_f32_q5_0_block
+    quantize_f32_q5_0_block((const float *)cxi, (block_q5_0 *)cdsti);
+}
+
+static __device__ void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) { // byte-pointer adapter: casts the arguments and forwards to quantize_f32_q5_1_block
+    quantize_f32_q5_1_block((const float *)cxi, (block_q5_1 *)cdsti);
+}
+
+static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { // byte-pointer adapter: casts the arguments and forwards to quantize_f32_q8_0_block
+    quantize_f32_q8_0_block((const float *)cxi, (block_q8_0 *)cdsti);
+}
+
+static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) { // byte-pointer adapter: casts the arguments and forwards to quantize_f32_iq4_nl_block
+    quantize_f32_iq4_nl_block((const float *)cxi, (block_iq4_nl *)cdsti);
+}
+
+static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) { // byte-pointer adapter: casts the arguments and forwards to convert_f32_f32
+    convert_f32_f32((const float *)cxi, (float *)cdsti);
+}
+
+static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) { // byte-pointer adapter: casts the arguments and forwards to convert_f32_f16
+    convert_f32_f16((const float *)cxi, (half *)cdsti);
+}
+
+static __device__ void cpy_1_f32_bf16(const char * cxi, char * cdsti) { // byte-pointer adapter: casts the arguments and forwards to convert_f32_bf16
+    convert_f32_bf16((const float *)cxi, (nv_bfloat16 *)cdsti);
+}
+
+static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) { // byte-pointer adapter: casts the arguments and forwards to convert_f16_f16
+    convert_f16_f16((const half *)cxi, (half *)cdsti);
+}
+
+static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) { // byte-pointer adapter: casts the arguments and forwards to convert_f16_f32
+    convert_f16_f32((const half *)cxi, (float *)cdsti);
+}
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@ -1,46 +1,12 @@
 #include "cpy.cuh"
 #include "dequantize.cuh"
+#include "cpy-utils.cuh"
 #ifdef GGML_USE_MUSA
 #include "ggml-musa/mudnn.cuh"
 #endif // GGML_USE_MUSA

 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);

-static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    float * dsti = (float *) cdsti;
-
-    *dsti = *xi;
-}
-
-static __device__ void cpy_1_f32_bf16(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    nv_bfloat16 * dsti = (nv_bfloat16 *) cdsti;
-
-    *dsti = *xi;
-}
-
-static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    half * dsti = (half *) cdsti;
-
-    *dsti = __float2half(*xi);
-}
-
-static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) {
-    const half * xi = (const half *) cxi;
-    half * dsti = (half *) cdsti;
-
-    *dsti = *xi;
-}
-
-static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
-    const half * xi = (const half *) cxi;
-    float * dsti = (float *) cdsti;
-
-    *dsti = *xi;
-}
-
 template <cpy_kernel_t cpy_1>
 static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne,
                                   const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@ -71,29 +37,6 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const in
    cpy_1(cx + x_offset, cdst + dst_offset);
 }

-static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    block_q8_0 * dsti = (block_q8_0 *) cdsti;
-
-    float amax = 0.0f; // absolute max
-
-    for (int j = 0; j < QK8_0; j++) {
-        const float v = xi[j];
-        amax = fmaxf(amax, fabsf(v));
-    }
-
-    const float d = amax / ((1 << 7) - 1);
-    const float id = d ? 1.0f/d : 0.0f;
-
-    dsti->d = d;
-
-    for (int j = 0; j < QK8_0; ++j) {
-        const float x0 = xi[j]*id;
-
-        dsti->qs[j] = roundf(x0);
-    }
-}
-
 static __device__ void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
    float * cdstf = (float *)(cdsti);

@ -106,139 +49,6 @@ static __device__ void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
    }
 }

-static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    block_q4_0 * dsti = (block_q4_0 *) cdsti;
-
-    float amax = 0.0f;
-    float vmax = 0.0f;
-
-    for (int j = 0; j < QK4_0; ++j) {
-        const float v = xi[j];
-        if (amax < fabsf(v)) {
-            amax = fabsf(v);
-            vmax = v;
-        }
-    }
-
-    const float d  = vmax / -8;
-    const float id = d ? 1.0f/d : 0.0f;
-
-    dsti->d = d;
-
-    for (int j = 0; j < QK4_0/2; ++j) {
-        const float x0 = xi[0       + j]*id;
-        const float x1 = xi[QK4_0/2 + j]*id;
-
-        const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
-        const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));
-
-        dsti->qs[j]  = xi0;
-        dsti->qs[j] |= xi1 << 4;
-    }
-}
-
-static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    block_q4_1 * dsti = (block_q4_1 *) cdsti;
-
-    float vmin = FLT_MAX;
-    float vmax = -FLT_MAX;
-
-    for (int j = 0; j < QK4_1; ++j) {
-        const float v = xi[j];
-
-        if (v < vmin) vmin = v;
-        if (v > vmax) vmax = v;
-    }
-
-    const float d  = (vmax - vmin) / ((1 << 4) - 1);
-    const float id = d ? 1.0f/d : 0.0f;
-
-    dsti->dm.x = d;
-    dsti->dm.y = vmin;
-
-    for (int j = 0; j < QK4_1/2; ++j) {
-        const float x0 = (xi[0       + j] - vmin)*id;
-        const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
-
-        const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
-        const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
-
-        dsti->qs[j]  = xi0;
-        dsti->qs[j] |= xi1 << 4;
-    }
-}
-
-static __device__ void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    block_q5_0 * dsti = (block_q5_0 *) cdsti;
-
-    float amax = 0.0f;
-    float vmax = 0.0f;
-
-    for (int j = 0; j < QK5_0; ++j) {
-        const float v = xi[j];
-        if (amax < fabsf(v)) {
-            amax = fabsf(v);
-            vmax = v;
-        }
-    }
-
-    const float d  = vmax / -16;
-    const float id = d ? 1.0f/d : 0.0f;
-
-    dsti->d = d;
-
-    uint32_t qh = 0;
-    for (int j = 0; j < QK5_0/2; ++j) {
-        const float x0 = xi[0       + j]*id;
-        const float x1 = xi[QK5_0/2 + j]*id;
-
-        const uint8_t xi0 = min(31, (int8_t)(x0 + 16.5f));
-        const uint8_t xi1 = min(31, (int8_t)(x1 + 16.5f));
-
-        dsti->qs[j]  = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
-        qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
-        qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
-    }
-    memcpy(dsti->qh, &qh, sizeof(qh));
-}
-
-static __device__ void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    block_q5_1 * dsti = (block_q5_1 *) cdsti;
-
-    float min = xi[0];
-    float max = xi[0];
-
-    for (int j = 1; j < QK5_1; ++j) {
-        const float v = xi[j];
-        min = v < min ? v : min;
-        max = v > max ? v : max;
-    }
-
-    const float d  = (max - min) / 31;
-    const float id = d ? 1.0f/d : 0.0f;
-
-    dsti->dm.x = d;
-    dsti->dm.y = min;
-
-    uint32_t qh = 0;
-    for (int j = 0; j < QK5_1/2; ++j) {
-        const float x0 = (xi[0       + j] - min)*id;
-        const float x1 = (xi[QK5_1/2 + j] - min)*id;
-
-        const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
-        const uint8_t xi1 = (uint8_t)(x1 + 0.5f);
-
-        dsti->qs[j]  = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
-        qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
-        qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2);
-    }
-    memcpy(dsti->qh, &qh, sizeof(qh));
-}
-
 template<dequantize_kernel_t dequant, int qk>
 static __device__ void cpy_blck_q_f32(const char * cxi, char * cdsti) {
    float * cdstf = (float *)(cdsti);
@ -252,53 +62,6 @@ static __device__ void cpy_blck_q_f32(const char * cxi, char * cdsti) {
    }
 }

-static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) {
-    if (x <= val[0]) return 0;
-    if (x >= val[n-1]) return n-1;
-    int ml = 0, mu = n-1;
-    while (mu-ml > 1) {
-        int mav = (ml+mu)/2;
-        if (x < val[mav]) mu = mav; else ml = mav;
-    }
-    return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
-}
-
-static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    block_iq4_nl * dsti = (block_iq4_nl *) cdsti;
-
-    float amax = 0.0f;
-    float vmax = 0.0f;
-
-    for (int j = 0; j < QK4_NL; ++j) {
-        const float v = xi[j];
-        if (amax < fabsf(v)) {
-            amax = fabsf(v);
-            vmax = v;
-        }
-    }
-
-    float d = vmax / kvalues_iq4nl[0];
-    const float id = d ? 1.0f/d : 0.0f;
-
-    float sumqx = 0, sumq2 = 0;
-    for (int j = 0; j < QK4_NL/2; ++j) {
-        const float x0 = xi[0        + j]*id;
-        const float x1 = xi[QK4_NL/2 + j]*id;
-        const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0);
-        const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1);
-        dsti->qs[j] = xi0 | (xi1 << 4);
-        const float v0 = kvalues_iq4nl[xi0];
-        const float v1 = kvalues_iq4nl[xi1];
-        const float w0 = xi[0        + j]*xi[0        + j];
-        const float w1 = xi[QK4_NL/2 + j]*xi[QK4_NL/2 + j];
-        sumqx += w0*v0*xi[j] + w1*v1*xi[QK4_NL/2 + j];
-        sumq2 += w0*v0*v0 + w1*v1*v1;
-    }
-
-    dsti->d = sumq2 > 0 ? sumqx/sumq2 : d;
-}
-
 template <cpy_kernel_t cpy_blck, int qk>
 static __global__ void cpy_f32_q(const char * cx, char * cdst_direct, const int ne,
                                 const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@ -2595,6 +2595,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
    // Loop over nodes in GGML graph to obtain info needed for CUDA graph
    cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();

+    const std::string gemma3n_per_layer_proj_src0_name = "inp_per_layer_selected";
+    const std::string gemma3n_per_layer_proj_src1_name = "per_layer_proj";
+
    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_tensor * node = cgraph->nodes[i];

@ -2616,9 +2619,12 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 #endif
        }

-        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
-            // disable CUDA graphs for batch size > 1 for now.
-            // Changes in batch size or context size can cause changes to the grid size of some kernels.
+        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && (node->src[0] ? node->src[0]->name != gemma3n_per_layer_proj_src0_name : true) && (node->src[1] ? node->src[1]->name != gemma3n_per_layer_proj_src1_name : true)) {
+            // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
+            // by means of matching node names. See
+            // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
+            // https://github.com/huggingface/transformers/blob/bda75b4011239d065de84aa3e744b67ebfa7b245/src/transformers/models/gemma3n/modeling_gemma3n.py#L1773,
+            // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
            use_cuda_graph = false;
 #ifndef NDEBUG
            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
@ -3231,8 +3237,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
            } break;
        case GGML_OP_SET_ROWS:
            {
-#pragma message("TODO: implement Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
-                return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16) &&
+                return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 ||
+                       op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q5_0 ||
+                       op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_IQ4_NL) &&
                       op->src[0]->type == GGML_TYPE_F32 &&
                       op->src[1]->type == GGML_TYPE_I64;
            } break;
--- a/ggml/src/ggml-cuda/set-rows.cu
+++ b/ggml/src/ggml-cuda/set-rows.cu
@ -1,4 +1,5 @@
 #include "set-rows.cuh"
+#include "cpy-utils.cuh"

 typedef void (*set_rows_kernel_t)(const char * src, char * dst);

@ -10,17 +11,93 @@ __device__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {

 template<>
 __device__ __forceinline__ void set_rows_1<float, half>(const float * src_f, half * dst_h) {
-    *dst_h = __float2half(*src_f);
+    convert_f32_f16(src_f, dst_h);
 }

 template<>
 __device__ __forceinline__ void set_rows_1<float, nv_bfloat16>(const float * src_f, nv_bfloat16 * dst_b) {
-    *dst_b = *src_f;
+    convert_f32_bf16(src_f, dst_b);
 }

 template<>
 __device__ __forceinline__ void set_rows_1<float, float>(const float * src_f, float * dst_f) {
-    *dst_f = *src_f;
+    convert_f32_f32(src_f, dst_f);
+}
+
+// Generic quantized set_rows kernel template
+template<typename block_type, int qk, void (*quantize_func)(const float*, block_type*)> // block_type: destination block struct, qk: floats per block, quantize_func: per-block quantizer
+static __global__ void k_set_rows_quant( // each thread quantizes one qk-element block of src0 into the dst row selected by src1
+        const float * __restrict__ src0, const int64_t * __restrict__ src1, block_type * __restrict__ dst,
+        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+        const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13, // ne10 and ne13 are not referenced in the body
+        const int64_t s01, const int64_t s02, const int64_t s03,
+        const int64_t s10, const int64_t s11, const int64_t s12,
+        const int64_t s1, const int64_t s2, const int64_t s3) {
+
+    const int64_t i = int64_t(blockDim.x) * blockIdx.x + threadIdx.x; // global thread index = index of the block to quantize
+    const int64_t ne_total = (ne00 * ne01 * ne02 * ne03) / qk; // number of qk-sized blocks in src0
+
+    if (i >= ne_total) { // surplus threads of the last CUDA block do nothing
+        return;
+    }
+
+    const int64_t i_base = i * qk; // flat index of the first float handled by this thread
+    const int64_t i03 = i_base / (ne00 * ne01 * ne02); // decompose the flat index into (i03, i02, i01, i00)
+    const int64_t i02 = (i_base - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
+    const int64_t i01 = (i_base - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01) / ne00;
+    const int64_t i00 = i_base - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01 - i01 * ne00;
+
+    const int64_t i12 = i03 % ne12; // coordinates into src1: dims 3 and 2 of src0 wrap modulo ne12 and ne11
+    const int64_t i11 = i02 % ne11;
+    const int64_t i10 = i01;
+
+    const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12); // destination row index read from src1
+
+    const float * src0_row = src0 + i01*s01 + i02*s02 + i03*s03; // s01..s03 are applied as float-element offsets
+    block_type * dst_row_ptr = dst + (dst_row*s1 + i02*s2 + i03*s3) / sizeof(block_type); // offset is divided by sizeof(block_type), i.e. s1..s3 are treated as byte strides
+
+    const float * src_block = src0_row + i00;
+    block_type * dst_block = dst_row_ptr + i00 / qk; // block position within the destination row
+
+    quantize_func(src_block, dst_block);
+}
+
+// Template dispatch function for quantized set_rows
+template<typename block_type, int qk, void (*quantize_func)(const float*, block_type*)> // same template parameters as k_set_rows_quant, forwarded unchanged
+static void set_rows_cuda_quant( // host-side launcher: derives the launch geometry and strides, then starts k_set_rows_quant on `stream`
+        const float * src0_d, const int64_t * src1_d, block_type * dst_d,
+        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+        const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
+        const size_t nb01, const size_t nb02, const size_t nb03,
+        const size_t nb10, const size_t nb11, const size_t nb12,
+        const size_t nb1, const size_t nb2, const size_t nb3,
+        cudaStream_t stream) {
+
+    GGML_ASSERT(ne00 % qk == 0); // each source row must consist of whole qk-sized blocks
+    const int64_t ne_total = (ne00 * ne01 * ne02 * ne03) / qk; // number of blocks to quantize = number of threads needed
+    const int num_blocks = (ne_total + CUDA_SET_ROWS_BLOCK_SIZE - 1) / CUDA_SET_ROWS_BLOCK_SIZE; // ceiling division
+    const dim3 block_size(CUDA_SET_ROWS_BLOCK_SIZE);
+    const dim3 grid_size(num_blocks);
+
+    const int64_t s01 = nb01/sizeof(float); // src0 strides: nb values divided by sizeof(float)
+    const int64_t s02 = nb02/sizeof(float);
+    const int64_t s03 = nb03/sizeof(float);
+    const int64_t s10 = nb10/sizeof(int64_t); // src1 strides: nb values divided by sizeof(int64_t)
+    const int64_t s11 = nb11/sizeof(int64_t);
+    const int64_t s12 = nb12/sizeof(int64_t);
+    const int64_t s1  = nb1; // dst strides are passed through undivided; the kernel divides by sizeof(block_type)
+    const int64_t s2  = nb2;
+    const int64_t s3  = nb3;
+
+    if (ne_total > 0) { // the launch is skipped entirely when there is nothing to quantize
+        k_set_rows_quant<block_type, qk, quantize_func><<<grid_size, block_size, 0, stream>>>(
+            src0_d, src1_d, dst_d,
+            ne00, ne01, ne02, ne03,
+            ne10, ne11, ne12, ne13,
+            s01, s02, s03,
+            s10, s11, s12,
+            s1, s2, s3);
+    }
 }

 template<typename src_t, typename dst_t>
@ -145,7 +222,67 @@ void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
            nb1, nb2, nb3,
            stream
        );
+    } else if (dst->type == GGML_TYPE_Q4_0) {
+        set_rows_cuda_quant<block_q4_0, QK4_0, quantize_f32_q4_0_block>(
+            src0_d, src1_d, (block_q4_0*)dst->data,
+            ne00, ne01, ne02, ne03,
+            ne10, ne11, ne12, ne13,
+            nb01, nb02, nb03,
+            nb10, nb11, nb12,
+            nb1, nb2, nb3,
+            stream
+        );
+    } else if (dst->type == GGML_TYPE_Q4_1) {
+        set_rows_cuda_quant<block_q4_1, QK4_1, quantize_f32_q4_1_block>(
+            src0_d, src1_d, (block_q4_1*)dst->data,
+            ne00, ne01, ne02, ne03,
+            ne10, ne11, ne12, ne13,
+            nb01, nb02, nb03,
+            nb10, nb11, nb12,
+            nb1, nb2, nb3,
+            stream
+        );
+    } else if (dst->type == GGML_TYPE_Q5_0) {
+        set_rows_cuda_quant<block_q5_0, QK5_0, quantize_f32_q5_0_block>(
+            src0_d, src1_d, (block_q5_0*)dst->data,
+            ne00, ne01, ne02, ne03,
+            ne10, ne11, ne12, ne13,
+            nb01, nb02, nb03,
+            nb10, nb11, nb12,
+            nb1, nb2, nb3,
+            stream
+        );
+    } else if (dst->type == GGML_TYPE_Q5_1) {
+        set_rows_cuda_quant<block_q5_1, QK5_1, quantize_f32_q5_1_block>(
+            src0_d, src1_d, (block_q5_1*)dst->data,
+            ne00, ne01, ne02, ne03,
+            ne10, ne11, ne12, ne13,
+            nb01, nb02, nb03,
+            nb10, nb11, nb12,
+            nb1, nb2, nb3,
+            stream
+        );
+    } else if (dst->type == GGML_TYPE_Q8_0) {
+        set_rows_cuda_quant<block_q8_0, QK8_0, quantize_f32_q8_0_block>(
+            src0_d, src1_d, (block_q8_0*)dst->data,
+            ne00, ne01, ne02, ne03,
+            ne10, ne11, ne12, ne13,
+            nb01, nb02, nb03,
+            nb10, nb11, nb12,
+            nb1, nb2, nb3,
+            stream
+        );
+    } else if (dst->type == GGML_TYPE_IQ4_NL) {
+        set_rows_cuda_quant<block_iq4_nl, QK4_NL, quantize_f32_iq4_nl_block>(
+            src0_d, src1_d, (block_iq4_nl*)dst->data,
+            ne00, ne01, ne02, ne03,
+            ne10, ne11, ne12, ne13,
+            nb01, nb02, nb03,
+            nb10, nb11, nb12,
+            nb1, nb2, nb3,
+            stream
+        );
    } else {
-        GGML_ABORT("unsupported type");
+        GGML_ABORT("unsupported type %s", ggml_type_name(dst->type));
    }
 }
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@ -354,6 +354,7 @@ class MODEL_ARCH(IntEnum):
    JAIS             = auto()
    NEMOTRON         = auto()
    EXAONE           = auto()
+    EXAONE4          = auto()
    GRANITE          = auto()
    GRANITE_MOE      = auto()
    GRANITE_HYBRID   = auto()
@ -671,6 +672,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.JAIS:             "jais",
    MODEL_ARCH.NEMOTRON:         "nemotron",
    MODEL_ARCH.EXAONE:           "exaone",
+    MODEL_ARCH.EXAONE4:          "exaone4",
    MODEL_ARCH.GRANITE:          "granite",
    MODEL_ARCH.GRANITE_MOE:      "granitemoe",
    MODEL_ARCH.GRANITE_HYBRID:   "granitehybrid",
@ -2197,6 +2199,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
+    MODEL_ARCH.EXAONE4: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_POST_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_POST_NORM,
+    ],
    MODEL_ARCH.GRANITE: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
--- a/klite.embd
+++ b/klite.embd
@ -12,7 +12,7 @@ Current version indicated by LITEVER below.
 -->

 <script id="init-config">
-	const LITEVER = 263;
+	const LITEVER = 264;
 	const urlParams = new URLSearchParams(window.location.search);
 	var localflag = urlParams.get('local'); //this will be replaced automatically in embedded kcpp
 	const STORAGE_PREFIX = (localflag?"e_":"")+"kaihordewebui_";
@ -20830,7 +20830,9 @@ Current version indicated by LITEVER below.
 					chatunits.push({
 					msg:curr,
 					myturn:myturnchat,
-					unlabelled:unlabelled_turns});
+					unlabelled_name:unlabelled_turns,
+					unlabelled_img:unlabelled_turns
+				});
 				}
 				unlabelled_turns = false;
 			}
@ -21512,7 +21514,9 @@ Current version indicated by LITEVER below.
 						name:localsettings.chatopponent,
 						msg:tempfullsearchable.split(localsettings.chatopponent+": ")[1],
 						myturn:myturnchat,
-						unlabelled: false});
+						unlabelled_name: false,
+						unlabelled_img: false
+					});
 				}
 				else
 				{
@ -21521,7 +21525,9 @@ Current version indicated by LITEVER below.
 						name:foundself[0].substring(0,foundself[0].length-2),
 						msg:tempfullsearchable.split(foundself[0])[1],
 						myturn:myturnchat,
-						unlabelled: false});
+						unlabelled_name: false,
+						unlabelled_img: false
+					});
 				}
 			}
 			else if(foundopponent != null && foundopponent.length > 0)
@ -21531,7 +21537,9 @@ Current version indicated by LITEVER below.
 					name:foundopponent[0].substring(0,foundopponent[0].length-2),
 					msg:tempfullsearchable.split(foundopponent[0])[1],
 					myturn:myturnchat,
-					unlabelled: false});
+					unlabelled_name: false,
+					unlabelled_img: false
+					});
 			}else{ //unknown sender, just use existing turn
 				if(chatunits.length==0)
 				{
@ -21541,7 +21549,9 @@ Current version indicated by LITEVER below.
 						name:"",
 						msg:tempfullsearchable,
 						myturn:myturnchat,
-						unlabelled: true});
+						unlabelled_name: true,
+						unlabelled_img: true
+					});
 					}
 				}
 				else
@ -23070,7 +23080,7 @@ Current version indicated by LITEVER below.
 		else if(localsettings.opmode==1)
 		{
 			//aesthetic mode repacks story as one big chunk
-			chatunits = [{"msg":input, "myturn":false, "unlabelled":true}];
+			chatunits = [{"msg":input, "myturn":false, "unlabelled_name":true, "unlabelled_img":true}];
 		}
 		else
 		{
@ -23082,7 +23092,16 @@ Current version indicated by LITEVER below.
 		let pendstream = "";
 		if (synchro_pending_stream != "" && !isPreview) {
 			pendstream = escape_html(pending_context_preinjection) + format_streaming_text(escape_html(synchro_pending_stream));
-			chatunits.push({"msg":`<span class='pending_text'>${pendstream}</span>`,"myturn":false});
+			let allow_cont_prev_turn = (localsettings.opmode==4 || (localsettings.opmode==3 && localsettings.allow_continue_chat));
+			if(chatunits.length>0 && chatunits[chatunits.length-1].myturn==false && chatunits[chatunits.length-1].msg && allow_cont_prev_turn)
+			{
+				//inject into previous turn, only for instruct OR continuechat
+				chatunits[chatunits.length-1].msg += `<span class='pending_text'>${pendstream}</span>`;
+			}
+			else
+			{
+				chatunits.push({"msg":`<span class='pending_text'>${pendstream}</span>`,"myturn":false,"unlabelled_name":true, "unlabelled_img":false});
+			}
 		}
 		for(var i=0;i<chatunits.length;++i)
 		{
@ -23122,9 +23141,12 @@ Current version indicated by LITEVER below.
 			let showavatar = true;

 			//adventure and story has no names or avatars, also handle unlabelled first turns for chat/instruct
-			if((i == 0 && !curr.myturn && curr.unlabelled) || (localsettings.opmode==2 || localsettings.opmode==1))
+			if((!curr.myturn && curr.unlabelled_name) || (localsettings.opmode==2 || localsettings.opmode==1))
 			{
 				namepart = "";
+			}
+			if((!curr.myturn && curr.unlabelled_img) || (localsettings.opmode==2 || localsettings.opmode==1))
+			{
 				showavatar = false;
 			}

--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@ -68,6 +68,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_JAIS,             "jais"             },
    { LLM_ARCH_NEMOTRON,         "nemotron"         },
    { LLM_ARCH_EXAONE,           "exaone"           },
+    { LLM_ARCH_EXAONE4,          "exaone4"          },
    { LLM_ARCH_RWKV6,            "rwkv6"            },
    { LLM_ARCH_RWKV6QWEN2,       "rwkv6qwen2"       },
    { LLM_ARCH_RWKV7,            "rwkv7"            },
@ -1510,6 +1511,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
        },
    },
+    {
+        LLM_ARCH_EXAONE4,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_POST_NORM,  "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_POST_NORM,   "blk.%d.post_ffw_norm" },
+        }
+    },
    {
        LLM_ARCH_RWKV6,
        {
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@ -72,6 +72,7 @@ enum llm_arch {
    LLM_ARCH_JAIS,
    LLM_ARCH_NEMOTRON,
    LLM_ARCH_EXAONE,
+    LLM_ARCH_EXAONE4,
    LLM_ARCH_RWKV6,
    LLM_ARCH_RWKV6QWEN2,
    LLM_ARCH_RWKV7,
--- a/src/llama-chat.cpp
+++ b/src/llama-chat.cpp
@ -56,6 +56,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
    { "glmedge",           LLM_CHAT_TEMPLATE_GLMEDGE           },
    { "minicpm",           LLM_CHAT_TEMPLATE_MINICPM           },
    { "exaone3",           LLM_CHAT_TEMPLATE_EXAONE_3          },
+    { "exaone4",           LLM_CHAT_TEMPLATE_EXAONE_4          },
    { "rwkv-world",        LLM_CHAT_TEMPLATE_RWKV_WORLD        },
    { "granite",           LLM_CHAT_TEMPLATE_GRANITE           },
    { "gigachat",          LLM_CHAT_TEMPLATE_GIGACHAT          },
@ -168,6 +169,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
    } else if (tmpl_contains(LU8("<｜Assistant｜>")) && tmpl_contains(LU8("<｜User｜>")) && tmpl_contains(LU8("<｜end▁of▁sentence｜>"))) {
        return LLM_CHAT_TEMPLATE_DEEPSEEK_3;
    } else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
+        if (tmpl_contains("[|tool|]")) {
+            return LLM_CHAT_TEMPLATE_EXAONE_4;
+        }
        // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
        // EXAONE-3.0-7.8B-Instruct
        return LLM_CHAT_TEMPLATE_EXAONE_3;
@ -532,6 +536,22 @@ int32_t llm_chat_apply_template(
        if (add_ass) {
            ss << "[|assistant|]";
        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_4) {
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << "[|system|]" << trim(message->content) << "[|endofturn|]\n";
+            } else if (role == "user") {
+                ss << "[|user|]" << trim(message->content) << "\n";
+            } else if (role == "assistant") {
+                ss << "[|assistant|]" << trim(message->content) << "[|endofturn|]\n";
+            } else if (role == "tool") {
+                ss << "[|tool|]" << trim(message->content) << "[|endofturn|]\n";
+            }
+        }
+        if (add_ass) {
+            ss << "[|assistant|]";
+        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
        // this template requires the model to have "\n\n" as EOT token
        for (size_t i = 0; i < chat.size(); i++) {
--- a/src/llama-chat.h
+++ b/src/llama-chat.h
@ -35,6 +35,7 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_GLMEDGE,
    LLM_CHAT_TEMPLATE_MINICPM,
    LLM_CHAT_TEMPLATE_EXAONE_3,
+    LLM_CHAT_TEMPLATE_EXAONE_4,
    LLM_CHAT_TEMPLATE_RWKV_WORLD,
    LLM_CHAT_TEMPLATE_GRANITE,
    LLM_CHAT_TEMPLATE_GIGACHAT,
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@ -694,7 +694,7 @@ bool llama_context::apply_adapter_cvec(
    return cvec.apply(model, data, len, n_embd, il_start, il_end);
 }

-llm_graph_result_i * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
+llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
    if (mctx && !mctx->apply()) {
        LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__);
        ret = GGML_STATUS_FAILED;
@ -1312,7 +1312,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 //

 uint32_t llama_context::graph_max_nodes() const {
-    return std::max<uint32_t>(65536u, 5u*model.n_tensors());
+    return std::max<uint32_t>(1024u, 8u*model.n_tensors()); // node budget: 8 per model tensor, but never fewer than 1024
 }

 llm_graph_result * llama_context::get_gf_res_reserve() const {
@ -1363,7 +1363,7 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
 }

 llm_graph_params llama_context::graph_params(
-                      llm_graph_result_i * res,
+                        llm_graph_result * res,
                      const llama_ubatch & ubatch,
            const llama_memory_context_i * mctx,
            llm_graph_type   gtype) const {
--- a/src/llama-context.h
+++ b/src/llama-context.h
@ -94,7 +94,7 @@ struct llama_context {
    // if memory_context is provided, it will be applied first to the context's memory
    // ret contains the status of the graph computation
    // returns nullptr only if ret != GGML_STATUS_SUCCESS
-    llm_graph_result_i * process_ubatch(
+    llm_graph_result * process_ubatch(
                const llama_ubatch & ubatch,
                    llm_graph_type   gtype,
            llama_memory_context_i * mctx,
@ -199,7 +199,7 @@ public:

 private:
    llm_graph_params graph_params(
-                      llm_graph_result_i * res,
+                        llm_graph_result * res,
                      const llama_ubatch & ubatch,
            const llama_memory_context_i * mctx,
                          llm_graph_type   gtype) const;
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@ -486,6 +486,10 @@ llm_graph_input_i * llm_graph_result::add_input(llm_graph_input_ptr input) {
    return inputs.back().get();
 }

+void llm_graph_result::set_params(const llm_graph_params & params) {
+    this->params = params; // store a copy of the given graph parameters in this result; `this->` is required because the argument shadows the member
+}
+
 //
 // llm_graph_context
 //
@ -527,9 +531,10 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
    mctx             (params.mctx),
    cross            (params.cross),
    cb_func          (params.cb),
-    res              (static_cast<llm_graph_result *>(params.res)),
-    ctx0             (res->get_ctx()) {
-        res->params = params;
+    res              (params.res),
+    ctx0             (res->get_ctx()),
+    gf               (res->get_gf()) {
+        res->set_params(params);
    }

 void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const {
@ -901,8 +906,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
    }

    // aggregate experts
+    // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
+    //       to avoid potentially a large number of add nodes during warmup
+    //       ref: https://github.com/ggml-org/llama.cpp/pull/14753
    ggml_tensor * moe_out = nullptr;
-    for (int i = 0; i < n_expert_used; ++i) {
+    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
        ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens,
                experts->nb[2], i*experts->nb[1]);

@ -913,7 +921,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
        }
    }

-    if (n_expert_used == 1) {
+    if (hparams.n_expert_used == 1) {
        // avoid returning a non-contiguous tensor
        moe_out = ggml_cont(ctx0, moe_out);
    }
@ -1119,7 +1127,6 @@ ggml_tensor * llm_graph_context::build_pos_bias(ggml_tensor * pos_bucket, ggml_t
 }

 ggml_tensor * llm_graph_context::build_attn_mha(
-         ggml_cgraph * gf,
         ggml_tensor * q,
         ggml_tensor * k,
         ggml_tensor * v,
@ -1253,7 +1260,6 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con

 ggml_tensor * llm_graph_context::build_attn(
        llm_graph_input_attn_no_cache * inp,
-        ggml_cgraph * gf,
        ggml_tensor * wo,
        ggml_tensor * wo_b,
        ggml_tensor * q_cur,
@ -1281,7 +1287,7 @@ ggml_tensor * llm_graph_context::build_attn(
    ggml_tensor * k = k_cur;
    ggml_tensor * v = v_cur;

-    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
    cb(cur, "kqv_out", il);

    if (wo) {
@ -1337,7 +1343,6 @@ llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified()

 ggml_tensor * llm_graph_context::build_attn(
        llm_graph_input_attn_kv_unified * inp,
-        ggml_cgraph * gf,
        ggml_tensor * wo,
        ggml_tensor * wo_b,
        ggml_tensor * q_cur,
@ -1370,7 +1375,7 @@ ggml_tensor * llm_graph_context::build_attn(
    ggml_tensor * k = mctx_cur->get_k(ctx0, il);
    ggml_tensor * v = mctx_cur->get_v(ctx0, il);

-    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
    cb(cur, "kqv_out", il);

    if (wo) {
@ -1390,7 +1395,6 @@ ggml_tensor * llm_graph_context::build_attn(

 ggml_tensor * llm_graph_context::build_attn(
        llm_graph_input_attn_kv_unified_iswa * inp,
-        ggml_cgraph * gf,
        ggml_tensor * wo,
        ggml_tensor * wo_b,
        ggml_tensor * q_cur,
@ -1437,7 +1441,7 @@ ggml_tensor * llm_graph_context::build_attn(
    ggml_tensor * k = mctx_cur->get_k(ctx0, il);
    ggml_tensor * v = mctx_cur->get_v(ctx0, il);

-    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
    cb(cur, "kqv_out", il);

    if (wo) {
@ -1470,7 +1474,6 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {

 ggml_tensor * llm_graph_context::build_attn(
        llm_graph_input_attn_cross * inp,
-        ggml_cgraph * gf,
        ggml_tensor * wo,
        ggml_tensor * wo_b,
        ggml_tensor * q_cur,
@ -1492,7 +1495,7 @@ ggml_tensor * llm_graph_context::build_attn(
    ggml_tensor * k = k_cur;
    ggml_tensor * v = v_cur;

-    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
    cb(cur, "kqv_out", il);

    if (wo) {
@ -1550,7 +1553,6 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif
 }

 ggml_tensor * llm_graph_context::build_rs(
-        ggml_cgraph * gf,
        ggml_tensor * s,
        ggml_tensor * state_copy,
            int32_t   state_size,
@ -1608,21 +1610,19 @@ llm_graph_input_rs * llm_graph_context::build_rs_inp() const {

 ggml_tensor * llm_graph_context::build_rs(
        llm_graph_input_rs * inp,
-        ggml_cgraph * gf,
        ggml_tensor * s,
            int32_t   state_size,
            int32_t   n_seqs,
        const llm_graph_get_rows_fn & get_state_rows) const {
    const auto * kv_state = inp->mctx;

-    return build_rs(gf, s, inp->s_copy, state_size, n_seqs, kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(), get_state_rows);
+    return build_rs(s, inp->s_copy, state_size, n_seqs, kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(), get_state_rows);
 }

 ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
    llm_graph_input_rs * inp,
-           ggml_cgraph * gf,
    const llama_ubatch & ubatch,
-                 int   il) const {
+                   int   il) const {
    const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);

    const auto token_shift_count = hparams.token_shift_count;
@ -1632,7 +1632,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
    ggml_tensor * token_shift_all = mctx_cur->get_r_l(il);

    ggml_tensor * token_shift = build_rs(
-            inp, gf, token_shift_all,
+            inp, token_shift_all,
            hparams.n_embd_r(), n_seqs);

    token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs);
@ -1672,7 +1672,6 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
 }

 void llm_graph_context::build_pooling(
-        ggml_cgraph * gf,
        ggml_tensor * cls,
        ggml_tensor * cls_b,
        ggml_tensor * cls_out,
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@ -371,31 +371,11 @@ public:
 // along with the input tensors, the object also provides commonly used output tensors, such as logits, embeddings, etc.
 //   these are used by the llama_context to extract the relevant data, based on the compute parameters

-// TODO: this interface seems redundant - remove it
-class llm_graph_result_i {
-public:
-    virtual ~llm_graph_result_i() = default;
-
-    virtual ggml_tensor * get_tokens()      const = 0;
-    virtual ggml_tensor * get_logits()      const = 0;
-    virtual ggml_tensor * get_embd()        const = 0;
-    virtual ggml_tensor * get_embd_pooled() const = 0;
-
-    virtual ggml_cgraph  * get_gf()  = 0;
-    virtual ggml_context * get_ctx() = 0;
-
-    virtual void reset() = 0;
-
-    virtual void set_inputs(const llama_ubatch * ubatch) = 0;
-
-    virtual bool can_reuse(const llm_graph_params & params) = 0;
-};
-
-using llm_graph_result_ptr = std::unique_ptr<llm_graph_result_i>;
-
 // callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
 using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;

+class llm_graph_result;
+
 struct llm_graph_params {
    llm_arch arch = LLM_ARCH_UNKNOWN;

@ -418,8 +398,7 @@ struct llm_graph_params {

    llm_graph_cb cb;

-    // TODO: temporary
-    llm_graph_result_i * res;
+    llm_graph_result * res;

    // return true if the "other" params would result in a graph with the same topology as with the current params
    //   having the same topology allows us to reuse the graph in some cases
@ -464,35 +443,37 @@ struct llm_graph_params {
    }
 };

-class llm_graph_result : public llm_graph_result_i {
+class llm_graph_result {
 public:
    llm_graph_result(int64_t max_nodes);

    virtual ~llm_graph_result() = default;

-    ggml_tensor * get_tokens()      const override { return t_tokens; }
-    ggml_tensor * get_logits()      const override { return t_logits; }
-    ggml_tensor * get_embd()        const override { return t_embd; }
-    ggml_tensor * get_embd_pooled() const override { return t_embd_pooled; }
+    ggml_tensor * get_tokens()      const { return t_tokens; }
+    ggml_tensor * get_logits()      const { return t_logits; }
+    ggml_tensor * get_embd()        const { return t_embd; }
+    ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }

-    ggml_cgraph  * get_gf()  override { return gf; }
-    ggml_context * get_ctx() override { return ctx_compute.get(); }
+    ggml_cgraph  * get_gf()  const { return gf; }
+    ggml_context * get_ctx() const { return ctx_compute.get(); }

    int64_t get_max_nodes() const;

-    void reset() override;
+    void reset();

-    void set_inputs(const llama_ubatch * ubatch) override;
+    void set_inputs(const llama_ubatch * ubatch);

    // try to update the existing graph result using the new graph parameters in order to reuse it
    // this can only be done if we determine that the resulting graph using the new graph parameters
    //   would be identical to the existing graph. in that case, we simply have to update the memory
    //   contexts of the input tensors of the graph and we can reuse it for another computation
    // return true if the graph was updated and can be reused
-    bool can_reuse(const llm_graph_params & params) override;
+    bool can_reuse(const llm_graph_params & params);

    llm_graph_input_i * add_input(llm_graph_input_ptr input);

+    void set_params(const llm_graph_params & params);
+
    // important graph nodes
    ggml_tensor * t_tokens      = nullptr;
    ggml_tensor * t_logits      = nullptr;
@ -510,6 +491,7 @@ public:

    int64_t max_nodes;

+private:
    // keep a copy of the previous graph parameters
    // we will use this to determine whether the graph can be reused by comparing them with the new parameters
    // note: these are updated after constructing the new graph
@ -519,6 +501,8 @@ public:
    int debug = 0;
 };

+using llm_graph_result_ptr = std::unique_ptr<llm_graph_result>;
+
 //
 // llm_graph_context
 //
@ -576,6 +560,7 @@ struct llm_graph_context {
    llm_graph_result * res;

    ggml_context * ctx0 = nullptr;
+    ggml_cgraph  * gf   = nullptr;

    llm_graph_context(const llm_graph_params & params);
    virtual ~llm_graph_context() = default;
@ -661,7 +646,6 @@ struct llm_graph_context {
    //

    ggml_tensor * build_attn_mha(
-             ggml_cgraph * gf,
             ggml_tensor * q,       // [n_embd_head_q, n_head_q, n_tokens]
             ggml_tensor * k,       // [n_embd_head_k, n_head_k, n_tokens]
             ggml_tensor * v,       // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
@ -674,7 +658,6 @@ struct llm_graph_context {

    ggml_tensor * build_attn(
            llm_graph_input_attn_no_cache * inp,
-            ggml_cgraph * gf,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@ -689,7 +672,6 @@ struct llm_graph_context {

    ggml_tensor * build_attn(
            llm_graph_input_attn_kv_unified * inp,
-            ggml_cgraph * gf,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@ -705,7 +687,6 @@ struct llm_graph_context {
    // note: if k_cur or v_cur are not provided, they will not be stored in the memory
    ggml_tensor * build_attn(
            llm_graph_input_attn_kv_unified_iswa * inp,
-            ggml_cgraph * gf,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@ -720,7 +701,6 @@ struct llm_graph_context {

    ggml_tensor * build_attn(
            llm_graph_input_attn_cross * inp,
-            ggml_cgraph * gf,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
@ -742,7 +722,6 @@ struct llm_graph_context {
    //         implementation in 2 separate methods. the goal is to avoid calling `ggml_build_forward_expand` in
    //         `llama_memory_recurrent`
    ggml_tensor * build_rs(
-            ggml_cgraph * gf,
            ggml_tensor * s,
            ggml_tensor * state_copy,
                int32_t   state_size,
@ -757,7 +736,6 @@ struct llm_graph_context {

    ggml_tensor * build_rs(
            llm_graph_input_rs * inp,
-            ggml_cgraph * gf,
            ggml_tensor * s,
                int32_t   state_size,
                int32_t   n_seqs,
@ -765,9 +743,8 @@ struct llm_graph_context {

    ggml_tensor * build_rwkv_token_shift_load(
        llm_graph_input_rs * inp,
-               ggml_cgraph * gf,
        const llama_ubatch & ubatch,
-                     int   il) const;
+                       int   il) const;

    ggml_tensor * build_rwkv_token_shift_store(
             ggml_tensor * token_shift,
@ -784,7 +761,6 @@ struct llm_graph_context {
    //

    void build_pooling(
-            ggml_cgraph * gf,
            ggml_tensor * cls,
            ggml_tensor * cls_b,
            ggml_tensor * cls_out,
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@ -2161,6 +2161,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            } else if (
                tokenizer_pre == "exaone") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE;
+            } else if (
+                tokenizer_pre == "exaone4") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
            } else if (
                tokenizer_pre == "chameleon") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;