Merge commit '763d06edb7' into concedo_experimental

# Conflicts:
#	.github/workflows/build-linux-cross.yml
#	ggml/CMakeLists.txt
#	ggml/src/ggml-cann/CMakeLists.txt
#	ggml/src/ggml-opencl/CMakeLists.txt
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	ggml/src/ggml-vulkan/CMakeLists.txt
#	tools/mtmd/CMakeLists.txt
#	tools/mtmd/clip.cpp
#	tools/mtmd/mtmd.cpp
#	tools/server/CMakeLists.txt
Author: Concedo
Date:   2025-05-31 12:44:18 +08:00
Commit: c987abf9f5

28 changed files with 777 additions and 291 deletions


@@ -49,6 +49,6 @@ charset = unset
 trim_trailing_whitespace = unset
 insert_final_newline = unset
-[tools/mtmd/miniaudio.h]
+[tools/mtmd/vendor/miniaudio.h]
 trim_trailing_whitespace = unset
 insert_final_newline = unset


@@ -440,7 +440,7 @@ add_library(ggml
             ggml/include/gguf.h
             ggml/src/gguf.cpp
             ${GGML_SOURCES_CUDA})
-target_include_directories(ggml PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools)
+target_include_directories(ggml PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor)
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
 target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
 set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -449,7 +449,7 @@ target_compile_options(ggml PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-use_fast_math -e
 add_library(ggml_v1
             otherarch/ggml_v1.c
             otherarch/ggml_v1.h)
-target_include_directories(ggml_v1 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools)
+target_include_directories(ggml_v1 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor)
 target_compile_features(ggml_v1 PUBLIC c_std_11) # don't bump
 target_link_libraries(ggml_v1 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
 set_target_properties(ggml_v1 PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -459,7 +459,7 @@ add_library(ggml_v2
             otherarch/ggml_v2.h
             ${GGML_V2_CUDA_SOURCES}
             ${GGML_V2_LEGACY_CUDA_SOURCES})
-target_include_directories(ggml_v2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools)
+target_include_directories(ggml_v2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor)
 target_compile_features(ggml_v2 PUBLIC c_std_11) # don't bump
 target_link_libraries(ggml_v2 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
 set_target_properties(ggml_v2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -468,7 +468,7 @@ add_library(ggml_v3
             otherarch/ggml_v3.c
             otherarch/ggml_v3.h
             ${GGML_V3_CUDA_SOURCES})
-target_include_directories(ggml_v3 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools)
+target_include_directories(ggml_v3 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor)
 target_compile_features(ggml_v3 PUBLIC c_std_11) # don't bump
 target_link_libraries(ggml_v3 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
 set_target_properties(ggml_v3 PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -487,42 +487,42 @@ add_library(common2
             src/unicode-data.cpp
             otherarch/utils.cpp
             otherarch/utils.h)
-target_include_directories(common2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
+target_include_directories(common2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
 target_compile_features(common2 PUBLIC cxx_std_17) # don't bump
 target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(common2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
 add_library(sdtype_adapter
             otherarch/sdcpp/sdtype_adapter.cpp)
-target_include_directories(sdtype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
+target_include_directories(sdtype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
 target_compile_features(sdtype_adapter PUBLIC cxx_std_17) # don't bump
 target_link_libraries(sdtype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(sdtype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
 add_library(whisper_adapter
             otherarch/whispercpp/whisper_adapter.cpp)
-target_include_directories(whisper_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/whispercpp ./tools ./common)
+target_include_directories(whisper_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/whispercpp ./tools ./common)
 target_compile_features(whisper_adapter PUBLIC cxx_std_17) # don't bump
 target_link_libraries(whisper_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(whisper_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
 add_library(tts_adapter
             otherarch/tts_adapter.cpp)
-target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools ./common)
+target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./tools ./common)
 target_compile_features(tts_adapter PUBLIC cxx_std_17) # don't bump
 target_link_libraries(tts_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(tts_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
 add_library(embeddings_adapter
             otherarch/embeddings_adapter.cpp)
-target_include_directories(embeddings_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools ./common)
+target_include_directories(embeddings_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./tools ./common)
 target_compile_features(embeddings_adapter PUBLIC cxx_std_17) # don't bump
 target_link_libraries(embeddings_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
 set_target_properties(embeddings_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
 add_library(gpttype_adapter
             gpttype_adapter.cpp)
-target_include_directories(gpttype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
+target_include_directories(gpttype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
 target_compile_features(gpttype_adapter PUBLIC cxx_std_17) # don't bump
 target_link_libraries(gpttype_adapter PRIVATE common2 ggml ggml_v1 ggml_v2 ggml_v3 ${LLAMA_EXTRA_LIBS})
 set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -530,7 +530,7 @@ set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
 if (LLAMA_CUBLAS)
     set(TARGET koboldcpp_cublas)
     add_library(${TARGET} SHARED expose.cpp expose.h)
-    target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
+    target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
     target_compile_features(${TARGET} PUBLIC cxx_std_17) # don't bump
     set_target_properties(${TARGET} PROPERTIES PREFIX "")
     set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas")
@@ -550,7 +550,7 @@ endif()
 if (LLAMA_HIPBLAS)
     set(TARGET koboldcpp_hipblas)
     add_library(${TARGET} SHARED expose.cpp expose.h)
-    target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
+    target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
     target_compile_features(${TARGET} PUBLIC cxx_std_17) # don't bump
     set_target_properties(${TARGET} PROPERTIES PREFIX "")
     set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_hipblas")


@@ -51,8 +51,8 @@ ifdef KCPP_DEBUG
 CFLAGS = -g -O0
 CXXFLAGS = -g -O0
 endif
-CFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_AARCH64
-CXXFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c++17 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_AARCH64
+CFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./tools/mtmd/vendor -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_AARCH64
+CXXFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./tools/mtmd/vendor -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c++17 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_AARCH64
 ifndef KCPP_DEBUG
 CFLAGS += -DNDEBUG -s
 CXXFLAGS += -DNDEBUG -s


@@ -911,13 +911,16 @@ struct common_init_result common_init_from_params(common_params & params) {
         ok = false;
     }
 
-    if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
-        LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
-        ok = false;
-    }
-    if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
-        LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
+    bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
+    bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
+
+    if (!has_eos && !has_sep) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+        ok = false;
+    } else if (!has_eos) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
+    } else if (!has_sep) {
+        LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
         ok = false;
     }
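Note: the new check boils down to a small truth table: EOS and SEP both present is fine, SEP alone works with SEP doubling as EOS, and anything without SEP disables reranking. A minimal C++ sketch of that decision (hypothetical helper, not part of the patch):

    #include <cstdio>

    static bool rerank_vocab_ok(bool has_eos, bool has_sep) {
        if (!has_eos && !has_sep) {
            fprintf(stderr, "no EOS or SEP token, reranking will not work\n");
            return false;
        }
        if (!has_eos) {
            fprintf(stderr, "no EOS token, using SEP token as fallback\n");
        }
        if (!has_sep) {
            fprintf(stderr, "no SEP token, reranking will not work\n");
            return false;
        }
        return true;
    }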


@@ -423,19 +423,19 @@ class ModelBase:
         try:
             # for security reason, we don't allow loading remote code by default
             # if a model need remote code, we will fallback to config.json
-            return AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
+            config = AutoConfig.from_pretrained(dir_model, trust_remote_code=False).to_dict()
         except Exception as e:
             logger.warning(f"Failed to load model config from {dir_model}: {e}")
             logger.warning("Trying to load config.json instead")
             with open(dir_model / "config.json", "r", encoding="utf-8") as f:
                 config = json.load(f)
-                if "llm_config" in config:
-                    # rename for InternVL
-                    config["text_config"] = config["llm_config"]
-                if "thinker_config" in config:
-                    # rename for Qwen2.5-Omni
-                    config["text_config"] = config["thinker_config"]["text_config"]
-                return config
+        if "llm_config" in config:
+            # rename for InternVL
+            config["text_config"] = config["llm_config"]
+        if "thinker_config" in config:
+            # rename for Qwen2.5-Omni
+            config["text_config"] = config["thinker_config"]["text_config"]
+        return config
 
     @classmethod
     def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
@@ -1207,7 +1207,7 @@ class MmprojModel(ModelBase):
             self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys))
             self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"]))
 
-        else:
+        if not self.has_vision_encoder and not self.has_audio_encoder:
             raise ValueError("MmprojModel must have either vision or audio encoder")
 
     def write_vocab(self):
@@ -1841,7 +1841,8 @@ class StableLMModel(TextModel):
     "MistralForCausalLM",
    "MixtralForCausalLM",
     "VLlama3ForCausalLM",
-    "LlavaForConditionalGeneration")
+    "LlavaForConditionalGeneration",
+    "LlamaModel")
 class LlamaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
     undo_permute = True
@@ -1921,6 +1922,8 @@ class LlamaModel(TextModel):
         if is_vision_tensor:
             return []  # skip vision tensors
+        elif self.hf_arch == "LlamaModel":
+            name = "model." + name
         elif name.startswith("model.text_model"):
             name = name.replace("text_model.", "")  # for SmolVLM
         elif name.startswith("language_model."):
@@ -2169,6 +2172,9 @@ class Llama4VisionModel(MmprojModel):
             # process vision tensors
             if "positional_embedding_vlm" in name and ".weight" not in name:
                 name += ".weight"
+            if "multi_modal_projector.linear_1" in name:
+                # despite the name with number postfix, this is a single fully connected layer
+                return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC], data_torch)]
             return [(self.map_tensor_name(name), data_torch)]
 
         return []
@@ -3676,7 +3682,7 @@ class InternLM3Model(TextModel):
         return [(self.map_tensor_name(name), data_torch)]
 
-@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel")
+@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel", "BertForSequenceClassification")
 class BertModel(TextModel):
     model_arch = gguf.MODEL_ARCH.BERT
@@ -3739,6 +3745,13 @@ class BertModel(TextModel):
         if name.startswith("cls.seq_relationship"):
             return []
 
+        # For BertForSequenceClassification (direct projection layer)
+        if name == "classifier.weight":
+            name = "classifier.out_proj.weight"
+
+        if name == "classifier.bias":
+            name = "classifier.out_proj.bias"
+
         return [(self.map_tensor_name(name), data_torch)]
 
     def _xlmroberta_tokenizer_init(self) -> None:


@@ -288,7 +288,7 @@ logger.info("+++ convert_hf_to_gguf.py was updated")
 tests = [
     "ied 4 ½ months",
-    "Führer",
+    "Äpfel",
     "",
     " ",
     "  ",


@@ -623,8 +623,8 @@ static __global__ void flash_attn_combine_results(
     __builtin_assume(tid < D);
 
     extern __shared__ float2 meta[];
-    if (tid < 2*parallel_blocks) {
-        ((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + tid];
+    for (int i = tid; i < 2*parallel_blocks; i += D) {
+        ((float *) meta)[i] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + i];
    }
 
     __syncthreads();
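Note: the old guard copied at most D meta values per block, so entries past index D were silently dropped once 2*parallel_blocks exceeded the block size D; the replacement is the usual block-stride idiom. A plain C++ rendering of the same access pattern (names are illustrative):

    #include <cstddef>

    // Each of D "threads" copies elements tid, tid + D, tid + 2*D, ...
    // so the copy completes for any count, not just count <= D.
    void block_stride_copy(const float * src, float * dst, size_t count, size_t tid, size_t D) {
        for (size_t i = tid; i < count; i += D) {
            dst[i] = src[i];
        }
    }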


@@ -0,0 +1,86 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#ifdef cl_intel_subgroups
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+#define SWAP(x, y, T) { T tmp = (x); (x) = (y); (y) = tmp; }
+
+enum ggml_sort_order {
+    GGML_SORT_ORDER_ASC,
+    GGML_SORT_ORDER_DESC,
+};
+
+kernel void kernel_argsort_f32_i32(
+        global float * src0,
+        ulong offset0,
+        global int * dst,
+        ulong offsetd,
+        const int ne00,
+        const int ne00_pad,
+        const int order,
+        local int * dst_row
+) {
+    // bitonic sort
+    int col = get_local_id(0);
+    int row = get_group_id(1);
+
+    if (col >= ne00_pad) {
+        return;
+    }
+
+    // note: casts adjusted here so the pointer types match the declarations
+    src0 = (global float *)((global char *)src0 + offset0);
+    dst  = (global int   *)((global char *)dst  + offsetd);
+
+    global float * x_row = src0 + row * ne00;
+
+    // initialize indices
+    dst_row[col] = col;
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    for (int k = 2; k <= ne00_pad; k *= 2) {
+        for (int j = k / 2; j > 0; j /= 2) {
+            int ixj = col ^ j;
+            if (ixj > col) {
+                if ((col & k) == 0) {
+                    if (dst_row[col] >= ne00 ||
+                        (dst_row[ixj] < ne00 && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
+                    ) {
+                        SWAP(dst_row[col], dst_row[ixj], int);
+                    }
+                } else {
+                    if (dst_row[ixj] >= ne00 ||
+                        (dst_row[col] < ne00 && (order == GGML_SORT_ORDER_ASC ?
+                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
+                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
+                    ) {
+                        SWAP(dst_row[col], dst_row[ixj], int);
+                    }
+                }
+            }
+            barrier(CLK_LOCAL_MEM_FENCE);
+        }
+    }
+
+    // copy the result to dst without the padding
+    if (col < ne00) {
+        dst[row * ne00 + col] = dst_row[col];
+    }
+}
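Note: this is a textbook bitonic sorting network over the padded row width, with out-of-range indices ordered last so the padding falls off the end. A sequential C++ sketch of the same compare-exchange schedule (assumes n is a power of two and skips the padding handling):

    #include <utility>
    #include <vector>

    std::vector<int> bitonic_argsort_asc(const std::vector<float> & x) {
        const int n = (int) x.size();                // must be a power of two, like ne00_pad
        std::vector<int> idx(n);
        for (int i = 0; i < n; i++) idx[i] = i;
        for (int k = 2; k <= n; k *= 2) {            // bitonic subsequence size
            for (int j = k / 2; j > 0; j /= 2) {     // compare-exchange distance
                for (int col = 0; col < n; col++) {  // each col runs in parallel on the GPU
                    int ixj = col ^ j;
                    if (ixj <= col) continue;
                    bool up = (col & k) == 0;        // sort direction of this subsequence
                    bool out_of_order = up ? x[idx[col]] > x[idx[ixj]]
                                           : x[idx[col]] < x[idx[ixj]];
                    if (out_of_order) std::swap(idx[col], idx[ixj]);
                }
            }
        }
        return idx;                                  // indices of x in ascending order
    }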


@@ -0,0 +1,72 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+//------------------------------------------------------------------------------
+// div
+//------------------------------------------------------------------------------
+kernel void kernel_div(
+        global char * src0,
+        ulong offset0,
+        global char * src1,
+        ulong offset1,
+        global char * dst,
+        ulong offsetd,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne10,
+        int ne11,
+        int ne12,
+        int ne13,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13,
+        int ne0,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    src0 = src0 + offset0;
+    src1 = src1 + offset1;
+    dst  = dst  + offsetd;
+
+    int i03 = get_group_id(2);
+    int i02 = get_group_id(1);
+    int i01 = get_group_id(0);
+
+    int i13 = i03 % ne13;
+    int i12 = i02 % ne12;
+    int i11 = i01 % ne11;
+
+    global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
+    global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
+    global char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;
+
+    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
+        const int i10 = i0 % ne10;
+        *((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) / *((global float *)(src1_ptr + i10*nb10));
+    }
+}
+
+// assumption: src1 is a row
+// broadcast src1 into src0
+kernel void kernel_div_row(
+        global float4 * src0,
+        ulong offset0,
+        global float4 * src1,
+        ulong offset1,
+        global float4 * dst,
+        ulong offsetd,
+        int ne
+) {
+    src0 = (global float4*)((global char*)src0 + offset0);
+    src1 = (global float4*)((global char*)src1 + offset1);
+    dst  = (global float4*)((global char*)dst  + offsetd);
+
+    // This performs better than using %.
+    uint gid = get_global_id(0);
+    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
+    dst[gid] = src0[gid] / src1[idx1];
+}
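Note: the "performs better than %" trick in kernel_div_row is unsigned remainder spelled out as a divide and multiply, which some GPU compilers schedule better than the % operator. A quick C++ equivalence check:

    #include <cassert>

    // gid - (gid/ne)*ne is the definition of the unsigned remainder gid % ne
    unsigned mod_free(unsigned gid, unsigned ne) {
        return gid - (gid / ne) * ne;
    }

    int main() {
        for (unsigned ne = 1; ne < 64; ne++) {
            for (unsigned gid = 0; gid < 4096; gid++) {
                assert(mod_free(gid, ne) == gid % ne);
            }
        }
    }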


@@ -0,0 +1,72 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#ifdef cl_intel_subgroups
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+// Workgroup must be a subgroup
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_32
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_group_norm(
+        global float * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd,
+        int ne,
+        int group_size,
+        float eps
+) {
+    src0 = (global float *)((global char *)src0 + offset0);
+    dst  = (global float *)((global char *)dst  + offsetd);
+
+    int start = get_group_id(0) * group_size;
+    int end   = start + group_size;
+
+    start += get_local_id(0);
+
+    if (end >= ne) {
+        end = ne;
+    }
+
+    float tmp = 0.0f;
+
+    for (int j = start; j < end; j += get_local_size(0)) {
+        tmp += src0[j];
+    }
+
+    tmp = sub_group_reduce_add(tmp);
+
+    const float mean = tmp / group_size;
+    tmp = 0.0f;
+
+    for (int j = start; j < end; j += get_local_size(0)) {
+        float xi = src0[j] - mean;
+        dst[j] = xi;
+        tmp += xi * xi;
+    }
+
+    tmp = sub_group_reduce_add(tmp);
+
+    const float variance = tmp / group_size;
+    const float scale = 1.0f/sqrt(variance + eps);
+
+    for (int j = start; j < end; j += get_local_size(0)) {
+        dst[j] *= scale;
+    }
+}
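Note: the arithmetic the kernel spreads across a subgroup is ordinary group normalization: per group, subtract the mean, then scale by the inverse standard deviation. A sequential C++ reference of the same math (a sketch, assuming ne divides evenly into groups):

    #include <cmath>
    #include <cstddef>

    void group_norm_ref(const float * src, float * dst, size_t ne, size_t group_size, float eps) {
        for (size_t start = 0; start < ne; start += group_size) {
            float mean = 0.0f;
            for (size_t j = start; j < start + group_size; j++) mean += src[j];
            mean /= (float) group_size;

            float var = 0.0f;
            for (size_t j = start; j < start + group_size; j++) {
                float xi = src[j] - mean;
                dst[j] = xi;                 // stash the centered value, as the kernel does
                var += xi * xi;
            }
            var /= (float) group_size;

            float scale = 1.0f / std::sqrt(var + eps);
            for (size_t j = start; j < start + group_size; j++) dst[j] *= scale;
        }
    }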


@@ -0,0 +1,29 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+//------------------------------------------------------------------------------
+// sigmoid
+//------------------------------------------------------------------------------
+
+kernel void kernel_sigmoid_f32(
+        global float * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd
+) {
+    src0 = (global float*)((global char*)src0 + offset0);
+    dst  = (global float*)((global char*)dst  + offsetd);
+
+    dst[get_global_id(0)] = 1.0f / (1.0f + exp(-src0[get_global_id(0)]));
+}
+
+kernel void kernel_sigmoid_f16(
+        global half * src0,
+        ulong offset0,
+        global half * dst,
+        ulong offsetd
+) {
+    src0 = (global half*)((global char*)src0 + offset0);
+    dst  = (global half*)((global char*)dst  + offsetd);
+
+    dst[get_global_id(0)] = 1.0f / (1.0f + exp(-src0[get_global_id(0)]));
+}


@@ -0,0 +1,72 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+//------------------------------------------------------------------------------
+// sub (comment fixed; the original header said "div", a copy-paste slip)
+//------------------------------------------------------------------------------
+kernel void kernel_sub(
+        global char * src0,
+        ulong offset0,
+        global char * src1,
+        ulong offset1,
+        global char * dst,
+        ulong offsetd,
+        ulong nb00,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        int ne10,
+        int ne11,
+        int ne12,
+        int ne13,
+        ulong nb10,
+        ulong nb11,
+        ulong nb12,
+        ulong nb13,
+        int ne0,
+        ulong nb0,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    src0 = src0 + offset0;
+    src1 = src1 + offset1;
+    dst  = dst  + offsetd;
+
+    int i03 = get_group_id(2);
+    int i02 = get_group_id(1);
+    int i01 = get_group_id(0);
+
+    int i13 = i03 % ne13;
+    int i12 = i02 % ne12;
+    int i11 = i01 % ne11;
+
+    global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
+    global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
+    global char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1;
+
+    for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
+        const int i10 = i0 % ne10;
+        *((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) - *((global float *)(src1_ptr + i10*nb10));
+    }
+}
+
+// assumption: src1 is a row
+// broadcast src1 into src0
+kernel void kernel_sub_row(
+        global float4 * src0,
+        ulong offset0,
+        global float4 * src1,
+        ulong offset1,
+        global float4 * dst,
+        ulong offsetd,
+        int ne
+) {
+    src0 = (global float4*)((global char*)src0 + offset0);
+    src1 = (global float4*)((global char*)src1 + offset1);
+    dst  = (global float4*)((global char*)dst  + offsetd);
+
+    // This performs better than using %.
+    uint gid = get_global_id(0);
+    uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
+    dst[gid] = src0[gid] - src1[idx1];
+}


@@ -0,0 +1,39 @@
+kernel void kernel_sum_rows_f32(
+        global float * src0,
+        ulong offset0,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne03,
+        ulong nb01,
+        ulong nb02,
+        ulong nb03,
+        ulong nb1,
+        ulong nb2,
+        ulong nb3
+) {
+    src0 = (global float *)((global char *)src0 + offset0);
+    dst  = (global float *)((global char *)dst  + offsetd);
+
+    int i3 = get_global_id(2);
+    int i2 = get_global_id(1);
+    int i1 = get_global_id(0);
+
+    if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
+        return;
+    }
+
+    global float * src_row = (global float *) ((global char *) src0 + i1*nb01 + i2*nb02 + i3*nb03);
+    global float * dst_row = (global float *) ((global char *) dst  + i1*nb1  + i2*nb2  + i3*nb3);
+
+    float row_sum = 0;
+    for (int i0 = 0; i0 < ne00; i0++) {
+        row_sum += src_row[i0];
+    }
+
+    dst_row[0] = row_sum;
+}


@@ -12,7 +12,7 @@
 #include "ggml-vulkan.h"
 #include <vulkan/vulkan_core.h>
-#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_PERF) || defined(GGML_VULKAN_CHECK_RESULTS)
+#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS)
 #include <chrono>
 #include "ggml-cpu.h"
 #endif
@@ -200,9 +200,7 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
 #ifdef GGML_VULKAN_MEMORY_DEBUG
 class vk_memory_logger;
 #endif
-#ifdef GGML_VULKAN_PERF
 class vk_perf_logger;
-#endif
 static void ggml_vk_destroy_buffer(vk_buffer& buf);
 
 static constexpr uint32_t mul_mat_vec_max_cols = 8;
@@ -458,9 +456,11 @@ struct vk_device_struct {
 #ifdef GGML_VULKAN_MEMORY_DEBUG
     std::unique_ptr<vk_memory_logger> memory_logger;
 #endif
-#ifdef GGML_VULKAN_PERF
+
+    // for GGML_VK_PERF_LOGGER
     std::unique_ptr<vk_perf_logger> perf_logger;
-#endif
+    vk::QueryPool query_pool;
+    uint32_t num_queries;
 
     ~vk_device_struct() {
         VK_LOG_DEBUG("destroy device " << name);
@@ -844,8 +844,6 @@ private:
 #define VK_LOG_MEMORY(msg) ((void) 0)
 #endif // GGML_VULKAN_MEMORY_DEBUG
 
-#if defined(GGML_VULKAN_PERF)
-
 class vk_perf_logger {
 public:
     void print_timings() {
@@ -855,7 +853,7 @@ public:
             for (const auto& time : t.second) {
                 total += time;
             }
-            std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " ms" << std::endl;
+            std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us" << std::endl;
         }
 
         timings.clear();
@@ -884,7 +882,6 @@ public:
 private:
     std::map<std::string, std::vector<uint64_t>> timings;
 };
-#endif // GGML_VULKAN_PERF
 
 struct ggml_backend_vk_context {
     std::string name;
@@ -974,6 +971,8 @@ struct vk_instance_t {
 static bool vk_instance_initialized = false;
 static vk_instance_t vk_instance;
 
+static bool vk_perf_logger_enabled = false;
+
 #ifdef GGML_VULKAN_CHECK_RESULTS
 static size_t vk_skip_checks;
 static size_t vk_output_tensor;
@@ -2773,9 +2772,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
 #ifdef GGML_VULKAN_MEMORY_DEBUG
         device->memory_logger = std::unique_ptr<vk_memory_logger>(new vk_memory_logger());
 #endif
-#ifdef GGML_VULKAN_PERF
-        device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
-#endif
+        if (vk_perf_logger_enabled) {
+            device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
+        }
 
         size_t dev_num = vk_instance.device_indices[idx];
@@ -3571,6 +3570,8 @@ static void ggml_vk_instance_init() {
     vk_instance.instance = vk::createInstance(instance_create_info);
     vk_instance_initialized = true;
 
+    vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
+
     size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
 
     // Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
@@ -8909,7 +8910,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     ctx->tensor_ctxs[node_idx] = compute_ctx;
 
-#if defined(GGML_VULKAN_CHECK_RESULTS) || defined(GGML_VULKAN_PERF)
+#if defined(GGML_VULKAN_CHECK_RESULTS)
     // Force context reset on each node so that each tensor ends up in its own context
     // and can be run and compared to its CPU equivalent separately
     last_node = true;
@@ -9529,6 +9530,29 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
     bool first_node_in_batch = true; // true if next node will be first node in a batch
     int submit_node_idx = 0; // index to first node in a batch
 
+    vk_context compute_ctx;
+
+    if (vk_perf_logger_enabled) {
+        // allocate/resize the query pool
+        if (ctx->device->num_queries < cgraph->n_nodes + 1) {
+            if (ctx->device->query_pool) {
+                ctx->device->device.destroyQueryPool(ctx->device->query_pool);
+            }
+            VkQueryPoolCreateInfo query_create_info = { VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO };
+            query_create_info.queryType = VK_QUERY_TYPE_TIMESTAMP;
+            query_create_info.queryCount = cgraph->n_nodes + 100;
+            ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info);
+            ctx->device->num_queries = query_create_info.queryCount;
+        }
+
+        ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1);
+
+        GGML_ASSERT(ctx->compute_ctx.expired());
+        compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+        ctx->compute_ctx = compute_ctx;
+        ggml_vk_ctx_begin(ctx->device, compute_ctx);
+        compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
+    }
+
     // Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
     // Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
     // (and scaled down based on model size, so smaller models submit earlier).
@@ -9556,6 +9580,17 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
         bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit);
 
+        if (vk_perf_logger_enabled) {
+            if (ctx->compute_ctx.expired()) {
+                compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
+                ctx->compute_ctx = compute_ctx;
+                ggml_vk_ctx_begin(ctx->device, compute_ctx);
+            } else {
+                compute_ctx = ctx->compute_ctx.lock();
+            }
+            compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, i+1);
+        }
+
         if (enqueued) {
             ++submitted_nodes;
@@ -9577,9 +9612,27 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
         }
     }
 
-#ifdef GGML_VULKAN_PERF
-    ctx->device->perf_logger->print_timings();
-#endif
+    if (vk_perf_logger_enabled) {
+        // End the command buffer and submit/wait
+        GGML_ASSERT(!ctx->compute_ctx.expired());
+        compute_ctx = ctx->compute_ctx.lock();
+        ggml_vk_ctx_end(compute_ctx);
+
+        ggml_vk_submit(compute_ctx, ctx->device->fence);
+
+        VK_CHECK(ctx->device->device.waitForFences({ ctx->device->fence }, true, UINT64_MAX), "GGML_VULKAN_PERF waitForFences");
+        ctx->device->device.resetFences({ ctx->device->fence });
+
+        // Get the results and pass them to the logger
+        std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
+        ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            if (!ggml_vk_is_empty(cgraph->nodes[i])) {
+                ctx->device->perf_logger->log_timing(cgraph->nodes[i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod));
+            }
+        }
+
+        ctx->device->perf_logger->print_timings();
+    }
 
     ggml_vk_graph_cleanup(ctx);
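Note: per-op timings no longer require a special GGML_VULKAN_PERF build; setting the GGML_VK_PERF_LOGGER environment variable at runtime is enough. The conversion from query results to wall time is a subtraction scaled by the device tick length, as in this sketch (timestamp_period is VkPhysicalDeviceLimits::timestampPeriod, in nanoseconds per tick):

    #include <cstdint>

    // Convert two raw timestamp queries into elapsed nanoseconds,
    // mirroring (timestamps[i+1] - timestamps[i]) * timestampPeriod above.
    uint64_t ticks_to_ns(uint64_t t_begin, uint64_t t_end, float timestamp_period) {
        return (uint64_t) ((t_end - t_begin) * timestamp_period);
    }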


@@ -902,7 +902,6 @@ class TensorNameMap:
         MODEL_TENSOR.V_MMPROJ_FC: (
             "model.connector.modality_projection.proj", # SmolVLM
-            "multi_modal_projector.linear_1", # llama 4
         ),
 
         MODEL_TENSOR.V_MMPROJ_MLP: (


@@ -455,7 +455,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
 }
 
 int64_t llm_graph_context::n_pos_per_embd() const {
-    return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
+    return hparams.rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1;
 }
 
 void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const {
@@ -1562,20 +1562,25 @@ void llm_graph_context::build_pooling(
                 ggml_tensor * inp_cls = build_inp_cls();
                 inp = ggml_get_rows(ctx0, inp, inp_cls);
 
-                // classification head
-                // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
-                GGML_ASSERT(cls != nullptr);
-                GGML_ASSERT(cls_b != nullptr);
-
-                cur = ggml_add (ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b);
-                cur = ggml_tanh(ctx0, cur);
-
-                // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
-                // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
-                if (cls_out) {
+                if (cls != nullptr && cls_b != nullptr) {
+                    // classification head
+                    // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
+                    cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b);
+                    cur = ggml_tanh(ctx0, cur);
+
+                    // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+                    // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
+                    if (cls_out) {
+                        GGML_ASSERT(cls_out_b != nullptr);
+                        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b);
+                    }
+                } else if (cls_out) {
+                    // Single layer classification head (direct projection)
+                    // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
                     GGML_ASSERT(cls_out_b != nullptr);
-
-                    cur = ggml_add (ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b);
+                    cur = ggml_add(ctx0, ggml_mul_mat(ctx0, cls_out, inp), cls_out_b);
+                } else {
+                    GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b");
                 }
             } break;
         default:


@@ -757,11 +757,19 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift(
     const auto & yarn_beta_slow = cparams.yarn_beta_slow;
 
     const auto & n_rot     = hparams.n_rot;
-    const auto & rope_type = hparams.rope_type;
+    const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE
+                                // @ngxson : this is a workaround
+                                // for M-RoPE, we want to rotate the whole vector when doing KV shift
+                                // a normal RoPE should work, we just need to use the correct ordering
+                                // ref: https://github.com/ggml-org/llama.cpp/pull/13870
+                                ? LLAMA_ROPE_TYPE_NEOX
+                                : hparams.rope_type;
 
     // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
     // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
-    const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor;
+    const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2
+                                        ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale))
+                                        : cparams.yarn_attn_factor;
 
     ggml_tensor * tmp;


@ -1,28 +1,5 @@
// fix problem with std::min and std::max
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#endif
#include "mtmd-audio.h" #include "mtmd-audio.h"
//#define MTMD_AUDIO_DEBUG
#define MINIAUDIO_IMPLEMENTATION
#ifndef MTMD_AUDIO_DEBUG
# define MA_NO_ENCODING
#endif
#define MA_NO_DEVICE_IO
#define MA_NO_RESOURCE_MANAGER
#define MA_NO_NODE_GRAPH
#define MA_NO_ENGINE
#define MA_NO_GENERATION
#define MA_API static
#include "miniaudio.h"
#define _USE_MATH_DEFINES // for M_PI #define _USE_MATH_DEFINES // for M_PI
#include <cmath> #include <cmath>
#include <cstdint> #include <cstdint>
@ -359,69 +336,6 @@ bool preprocess_audio(
} // namespace whisper_preprocessor } // namespace whisper_preprocessor
namespace audio_helpers {
bool is_audio_file(const char * buf, size_t len) {
if (len < 12) {
return false;
}
// RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
// WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0;
bool is_mp3 = len >= 3 && (
memcmp(buf, "ID3", 3) == 0 ||
// Check for MPEG sync word (simplified check)
((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0)
);
bool is_flac = memcmp(buf, "fLaC", 4) == 0;
return is_wav || is_mp3 || is_flac;
}
// returns true if the buffer is a valid audio file
bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
ma_result result;
const int channels = 1;
ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
ma_decoder decoder;
result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
if (result != MA_SUCCESS) {
return false;
}
ma_uint64 frame_count;
ma_uint64 frames_read;
result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
if (result != MA_SUCCESS) {
ma_decoder_uninit(&decoder);
return false;
}
pcmf32_mono.resize(frame_count);
result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
if (result != MA_SUCCESS) {
ma_decoder_uninit(&decoder);
return false;
}
#ifdef MTMD_AUDIO_DEBUG
// save audio to wav file
ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
ma_encoder encoder;
ma_encoder_init_file("output.wav", &config, &encoder);
ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
ma_encoder_uninit(&encoder);
#endif
ma_decoder_uninit(&decoder);
return true;
}
} // namespace wav_utils
// precalculated mel filter banks // precalculated mel filter banks
// values are multiplied by 1000.0 to save space, and will be divided by 1000.0 in the end of the function // values are multiplied by 1000.0 to save space, and will be divided by 1000.0 in the end of the function
// //


@@ -32,7 +32,7 @@ struct whisper_filters {
     std::vector<float> data;
 };
 
-extern bool preprocess_audio(
+bool preprocess_audio(
     const float * samples,
     size_t n_samples,
     const whisper_filters & filters,
@@ -40,23 +40,8 @@ extern bool preprocess_audio(
 
 } // namespace whisper_preprocessor
 
-// TODO @ngxson : move this helper to mtmd-helpers.cpp
-namespace audio_helpers {
-
-extern bool is_audio_file(const char * buf, size_t len);
-
-extern bool decode_audio_from_buf(
-    const unsigned char * buf_in,
-    size_t len,
-    int target_sampler_rate,
-    std::vector<float> & pcmf32_mono);
-
-} // namespace audio_helpers
-
 namespace whisper_precalc_filters {
 
-extern whisper_preprocessor::whisper_filters get_128_bins();
+whisper_preprocessor::whisper_filters get_128_bins();
 
 } // namespace whisper_precalc_filters


@@ -7,6 +7,7 @@
 #include "console.h"
 #include "chat.h"
 #include "mtmd.h"
+#include "mtmd-helper.h"
 
 #include <vector>
 #include <limits.h>
@@ -143,7 +144,7 @@ struct mtmd_cli_context {
     }
 
     bool load_media(const std::string & fname) {
-        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(fname.c_str()));
+        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str()));
         if (!bmp.ptr) {
             return false;
         }


@@ -1,10 +1,37 @@
+// fix problem with std::min and std::max
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#   define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
 #include "mtmd.h"
+#include "mtmd-helper.h"
 #include "llama.h"
 
 #include <algorithm>
 #include <cinttypes>
 #include <vector>
 
+//#define MTMD_AUDIO_DEBUG
+
+#define MINIAUDIO_IMPLEMENTATION
+#ifndef MTMD_AUDIO_DEBUG
+#   define MA_NO_ENCODING
+#endif
+#define MA_NO_DEVICE_IO
+#define MA_NO_RESOURCE_MANAGER
+#define MA_NO_NODE_GRAPH
+#define MA_NO_ENGINE
+#define MA_NO_GENERATION
+#define MA_API static
+#include "vendor/miniaudio.h"
+
+#define STB_IMAGE_IMPLEMENTATION
+#include "vendor/stb_image.h"
+
 #define LOG_INF(...) fprintf(stdout, __VA_ARGS__)
 #define LOG_ERR(...) fprintf(stderr, __VA_ARGS__)
@@ -315,3 +342,118 @@ int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
 
     return 0;
 }
+
+namespace audio_helpers {
+
+static bool is_audio_file(const char * buf, size_t len) {
+    if (len < 12) {
+        return false;
+    }
+
+    // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
+    // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
+    bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0;
+    bool is_mp3 = len >= 3 && (
+        memcmp(buf, "ID3", 3) == 0 ||
+        // Check for MPEG sync word (simplified check)
+        ((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0)
+    );
+    bool is_flac = memcmp(buf, "fLaC", 4) == 0;
+
+    return is_wav || is_mp3 || is_flac;
+}
+
+// returns true if the buffer is a valid audio file
+static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
+    ma_result result;
+    const int channels = 1;
+    ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
+    ma_decoder decoder;
+
+    result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
+    if (result != MA_SUCCESS) {
+        return false;
+    }
+
+    ma_uint64 frame_count;
+    ma_uint64 frames_read;
+    result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
+    if (result != MA_SUCCESS) {
+        ma_decoder_uninit(&decoder);
+        return false;
+    }
+
+    pcmf32_mono.resize(frame_count);
+    result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
+    if (result != MA_SUCCESS) {
+        ma_decoder_uninit(&decoder);
+        return false;
+    }
+
+#ifdef MTMD_AUDIO_DEBUG
+    // save audio to wav file
+    ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
+    ma_encoder encoder;
+    ma_encoder_init_file("output.wav", &config, &encoder);
+    ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
+    ma_encoder_uninit(&encoder);
+#endif
+
+    ma_decoder_uninit(&decoder);
+    return true;
+}
+
+} // namespace audio_helpers
+
+mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len) {
+    if (audio_helpers::is_audio_file((const char *)buf, len)) {
+        std::vector<float> pcmf32;
+        int bitrate = mtmd_get_audio_bitrate(ctx);
+        if (bitrate < 0) {
+            LOG_ERR("This model does not support audio input\n");
+            return nullptr;
+        }
+        if (!audio_helpers::decode_audio_from_buf(buf, len, bitrate, pcmf32)) {
+            LOG_ERR("Unable to read WAV audio file from buffer\n");
+            return nullptr;
+        }
+        return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
+    }
+
+    // otherwise, we assume it's an image
+    mtmd_bitmap * result = nullptr;
+    {
+        int nx, ny, nc;
+        auto * data = stbi_load_from_memory(buf, len, &nx, &ny, &nc, 3);
+        if (!data) {
+            LOG_ERR("%s: failed to decode image bytes\n", __func__);
+            return nullptr;
+        }
+        result = mtmd_bitmap_init(nx, ny, data);
+        stbi_image_free(data);
+    }
+    return result;
+}
+
+mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) {
+    std::vector<unsigned char> buf;
+    FILE * f = fopen(fname, "rb");
+    if (!f) {
+        LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
+        return nullptr;
+    }
+
+    fseek(f, 0, SEEK_END);
+    long file_size = ftell(f);
+    fseek(f, 0, SEEK_SET);
+    buf.resize(file_size);
+
+    size_t n_read = fread(buf.data(), 1, file_size, f);
+    fclose(f);
+    if (n_read != (size_t)file_size) {
+        LOG_ERR("Failed to read entire file %s", fname);
+        return nullptr;
+    }
+
+    return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size());
+}

tools/mtmd/mtmd-helper.h (new file, 91 lines)

@@ -0,0 +1,91 @@
+#ifndef MTMD_HELPER_H
+#define MTMD_HELPER_H
+
+#include "ggml.h"
+#include "llama.h"
+#include "mtmd.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//
+// libmtmd helper functions
+//
+// Please note that these helpers are not guaranteed to be stable.
+// BREAKING CHANGES are expected.
+//
+
+// helper function to construct a mtmd_bitmap from a file
+// it calls mtmd_helper_bitmap_init_from_buf() internally
+// returns nullptr on failure
+// this function is thread-safe
+MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname);
+
+// helper function to construct a mtmd_bitmap from a buffer containing a file
+// supported formats:
+//     image: formats supported by stb_image: jpg, png, bmp, gif, etc.
+//     audio: formats supported by miniaudio: wav, mp3, flac
+// note: audio files will be auto-detected based on magic bytes
+// returns nullptr on failure
+// this function is thread-safe
+MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len);
+
+// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
+MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
+
+// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
+// normally, n_pos is equal to n_tokens, but for M-RoPE it is different
+MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
+
+// helper function that automatically:
+// 1. run llama_decode() on text chunks
+// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
+// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
+// otherwise, returns 0 on success
+// this function is NOT thread-safe
+MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
+                                         struct llama_context * lctx,
+                                         const mtmd_input_chunks * chunks,
+                                         llama_pos n_past,
+                                         llama_seq_id seq_id,
+                                         int32_t n_batch,
+                                         bool logits_last,
+                                         llama_pos * new_n_past);
+
+// works like mtmd_helper_eval_chunks(), but only for a single chunk
+// this function is NOT thread-safe
+MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
+                                               struct llama_context * lctx,
+                                               const mtmd_input_chunk * chunk,
+                                               llama_pos n_past,
+                                               llama_seq_id seq_id,
+                                               int32_t n_batch,
+                                               bool logits_last,
+                                               llama_pos * new_n_past);
+
+// helper function to decode an image whose embeddings have already been calculated
+// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
+// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
+MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
+                                                struct llama_context * lctx,
+                                                const mtmd_input_chunk * chunk,
+                                                float * encoded_embd,
+                                                llama_pos n_past,
+                                                llama_seq_id seq_id,
+                                                int32_t n_batch,
+                                                llama_pos * new_n_past);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+//
+// C++ wrappers
+//
+
+#endif
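A usage sketch of the new context-aware helpers, modeled on the mtmd-cli change above (assumes an initialized mtmd_context * mctx):

    #include "mtmd.h"
    #include "mtmd-helper.h"

    bool load_media_example(mtmd_context * mctx, const char * fname) {
        // the helper now takes the context so it can query mtmd_get_audio_bitrate()
        mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(mctx, fname));
        if (!bmp.ptr) {
            return false; // unreadable file, or audio input on a non-audio model
        }
        // ... hand bmp off to tokenization/eval as before ...
        return true;
    }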


@@ -819,53 +819,12 @@ bool mtmd_support_audio(mtmd_context * ctx) {
     return ctx->ctx_a != nullptr;
 }
 
-// these 2 helpers below use internal clip_image_u8_ptr,
-// so unfortunately they cannot moved to mtmd-helper.h
-// however, in theory, user can decode image file to bitmap using
-// whichever library they want, and then use mtmd_bitmap_init() to create bitmap
-
-mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len) {
-    if (audio_helpers::is_audio_file((const char *)buf, len)) {
-        std::vector<float> pcmf32;
-        if (!audio_helpers::decode_audio_from_buf(buf, len, COMMON_SAMPLE_RATE, pcmf32)) {
-            LOG_ERR("Unable to read WAV audio file from buffer\n");
-            return nullptr;
-        }
-        return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
-    }
-
-    clip_image_u8_ptr img_u8(clip_image_u8_init());
-    bool ok = clip_image_load_from_bytes(buf, len, img_u8.get(), 2048);
-    if (!ok) {
-        LOG_ERR("Unable to load image from buffer\n");
-        return nullptr;
-    }
-    uint32_t nx, ny;
-    unsigned char * data = clip_image_u8_get_data(img_u8.get(), &nx, &ny);
-    return mtmd_bitmap_init(nx, ny, data);
-}
-
-mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname) {
-    std::vector<unsigned char> buf;
-    FILE * f = fopen(fname, "rb");
-    if (!f) {
-        LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
-        return nullptr;
-    }
-
-    fseek(f, 0, SEEK_END);
-    long file_size = ftell(f);
-    fseek(f, 0, SEEK_SET);
-    buf.resize(file_size);
-
-    size_t n_read = fread(buf.data(), 1, file_size, f);
-    fclose(f);
-    if (n_read != (size_t)file_size) {
-        LOG_ERR("Failed to read entire file %s", fname);
-        return nullptr;
-    }
-
-    return mtmd_helper_bitmap_init_from_buf(buf.data(), buf.size());
-}
+int mtmd_get_audio_bitrate(mtmd_context * ctx) {
+    if (!ctx->ctx_a) {
+        return -1;
+    }
+    // for now, we assume that all audio models have the same bitrate
+    return 16000; // 16kHz
+}
 
 //
// //


@@ -109,6 +109,10 @@ MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
 // whether the current model supports audio input
 MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
 
+// get audio bitrate in Hz, for example 16000 for Whisper
+// return -1 if audio is not supported
+MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
+
 // mtmd_bitmap
 //
 // if bitmap is image:
@@ -209,75 +213,6 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 
 /////////////////////////////////////////
 
-//
-// Helper functions (can be implemented based on other functions)
-//
-// Please note that these helpers are not guaranteed to be stable.
-// BREAKING CHANGES are expected.
-//
-
-// helper function to construct a mtmd_bitmap from a file
-// it calls mtmd_helper_bitmap_init_from_buf() internally
-// returns nullptr on failure
-// this function is thread-safe
-MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname);
-
-// helper function to construct a mtmd_bitmap from a buffer containing a file
-// supported formats:
-//     image: formats supported by stb_image: jpg, png, bmp, gif, etc.
-//     audio: formats supported by miniaudio: wav, mp3, flac
-// note: audio files will be auto-detected based on magic bytes
-// returns nullptr on failure
-// this function is thread-safe
-MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len);
-
-// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
-MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
-
-// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
-// normally, n_pos is equal to n_tokens, but for M-RoPE it is different
-MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
-
-// helper function that automatically:
-// 1. run llama_decode() on text chunks
-// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
-// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
-// otherwise, returns 0 on success
-// this function is NOT thread-safe
-MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
-                                         struct llama_context * lctx,
-                                         const mtmd_input_chunks * chunks,
-                                         llama_pos n_past,
-                                         llama_seq_id seq_id,
-                                         int32_t n_batch,
-                                         bool logits_last,
-                                         llama_pos * new_n_past);
-
-// works like mtmd_helper_eval_chunks(), but only for a single chunk
-// this function is NOT thread-safe
-MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
-                                               struct llama_context * lctx,
-                                               const mtmd_input_chunk * chunk,
-                                               llama_pos n_past,
-                                               llama_seq_id seq_id,
-                                               int32_t n_batch,
-                                               bool logits_last,
-                                               llama_pos * new_n_past);
-
-// helper function to decode an image whose embeddings have already been calculated
-// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
-// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
-MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
-                                                struct llama_context * lctx,
-                                                const mtmd_input_chunk * chunk,
-                                                float * encoded_embd,
-                                                llama_pos n_past,
-                                                llama_seq_id seq_id,
-                                                int32_t n_batch,
-                                                llama_pos * new_n_past);
-
-/////////////////////////////////////////
-
 // test function, to be used in test-mtmd-c-api.c
 MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);


@@ -9,6 +9,7 @@
 #include "sampling.h"
 #include "speculative.h"
 #include "mtmd.h"
+#include "mtmd-helper.h"
 
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
@@ -4187,7 +4188,7 @@ int main(int argc, char ** argv) {
                     throw std::runtime_error("This server does not support multimodal");
                 }
                 for (auto & file : files) {
-                    mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size()));
+                    mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(ctx_server.mctx, file.data(), file.size()));
                     if (!bmp.ptr) {
                         throw std::runtime_error("Failed to load image or audio file");
                     }


@@ -6,6 +6,7 @@
 #include "arg.h" // common_remote_get_content
 #include "base64.hpp"
 #include "mtmd.h"
+#include "mtmd-helper.h"
 
 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
@@ -264,13 +265,19 @@ static size_t validate_utf8(const std::string& text) {
 static llama_tokens format_rerank(const struct llama_vocab * vocab, const llama_tokens & query, const llama_tokens & doc) {
     llama_tokens result;
 
+    // Get EOS token - use SEP token as fallback if EOS is not available
+    llama_token eos_token = llama_vocab_eos(vocab);
+    if (eos_token == LLAMA_TOKEN_NULL) {
+        eos_token = llama_vocab_sep(vocab);
+    }
+
     result.reserve(doc.size() + query.size() + 4);
     result.push_back(llama_vocab_bos(vocab));
     result.insert(result.end(), query.begin(), query.end());
-    result.push_back(llama_vocab_eos(vocab));
+    result.push_back(eos_token);
     result.push_back(llama_vocab_sep(vocab));
     result.insert(result.end(), doc.begin(), doc.end());
-    result.push_back(llama_vocab_eos(vocab));
+    result.push_back(eos_token);
 
     return result;
 }
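The resulting rerank prompt layout is [BOS] query [EOS] [SEP] doc [EOS], where EOS silently becomes SEP when the vocab lacks a true EOS. A worked example with made-up token ids (purely illustrative):

    #include <cstdint>
    #include <vector>

    typedef int32_t llama_token;

    int main() {
        // hypothetical ids: bos = 1, eos = 2, sep = 3
        std::vector<llama_token> query = {10, 11};
        std::vector<llama_token> doc   = {20, 21, 22};

        std::vector<llama_token> result;
        result.push_back(1);                                      // BOS
        result.insert(result.end(), query.begin(), query.end());
        result.push_back(2);                                      // EOS (or SEP fallback)
        result.push_back(3);                                      // SEP
        result.insert(result.end(), doc.begin(), doc.end());
        result.push_back(2);                                      // EOS (or SEP fallback)
        // result == {1, 10, 11, 2, 3, 20, 21, 22, 2}
    }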
@@ -573,7 +580,7 @@ struct oaicompat_parser_options {
 
 // used by /chat/completions endpoint
 static json oaicompat_chat_params_parse(
-    const json & body, /* openai api json semantics */
+    json & body, /* openai api json semantics */
     const oaicompat_parser_options & opt,
     std::vector<raw_buffer> & out_files)
 {
@@ -624,7 +631,7 @@ static json oaicompat_chat_params_parse(
     if (!body.contains("messages")) {
        throw std::runtime_error("'messages' is required");
     }
-    json messages = body.at("messages");
+    json & messages = body.at("messages");
     if (!messages.is_array()) {
         throw std::runtime_error("Expected 'messages' to be an array");
     }