Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	common/CMakeLists.txt
#	common/arg.cpp
#	common/chat.cpp
#	examples/parallel/README.md
#	examples/parallel/parallel.cpp
#	ggml/cmake/common.cmake
#	ggml/src/CMakeLists.txt
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	ggml/src/ggml-sycl/rope.cpp
#	models/ggml-vocab-bert-bge.gguf.inp
#	models/ggml-vocab-bert-bge.gguf.out
#	models/ggml-vocab-command-r.gguf.inp
#	models/ggml-vocab-command-r.gguf.out
#	models/ggml-vocab-deepseek-coder.gguf.inp
#	models/ggml-vocab-deepseek-coder.gguf.out
#	models/ggml-vocab-deepseek-llm.gguf.inp
#	models/ggml-vocab-deepseek-llm.gguf.out
#	models/ggml-vocab-falcon.gguf.inp
#	models/ggml-vocab-falcon.gguf.out
#	models/ggml-vocab-gpt-2.gguf.inp
#	models/ggml-vocab-gpt-2.gguf.out
#	models/ggml-vocab-llama-bpe.gguf.inp
#	models/ggml-vocab-llama-bpe.gguf.out
#	models/ggml-vocab-llama-spm.gguf.inp
#	models/ggml-vocab-llama-spm.gguf.out
#	models/ggml-vocab-mpt.gguf.inp
#	models/ggml-vocab-mpt.gguf.out
#	models/ggml-vocab-phi-3.gguf.inp
#	models/ggml-vocab-phi-3.gguf.out
#	models/ggml-vocab-qwen2.gguf.inp
#	models/ggml-vocab-qwen2.gguf.out
#	models/ggml-vocab-refact.gguf.inp
#	models/ggml-vocab-refact.gguf.out
#	models/ggml-vocab-starcoder.gguf.inp
#	models/ggml-vocab-starcoder.gguf.out
#	requirements/requirements-gguf_editor_gui.txt
#	tests/CMakeLists.txt
#	tests/test-chat.cpp
#	tests/test-grammar-integration.cpp
#	tests/test-json-schema-to-grammar.cpp
#	tools/mtmd/CMakeLists.txt
#	tools/run/run.cpp
#	tools/server/CMakeLists.txt
commit b08dca65ed
Author: Concedo
Date:   2025-05-31 13:04:21 +08:00

62 changed files with 3208 additions and 27493 deletions
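For readers unfamiliar with how a merge commit like this is produced, the sketch below shows the usual workflow; the branch names come from the commit title above, but the exact resolution steps are an assumption, not something recorded in this commit:

```sh
# merge the local 'upstream' branch into the experimental branch
git checkout concedo_experimental
git merge upstream

# each path in the conflict list above now needs a manual resolution
git status --short        # lists the conflicted files
# edit the files, keeping the fork-specific changes where they apply
git add <resolved files>

# record the merge; the conflict list is kept in the commit message
git commit
```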


@@ -49,6 +49,6 @@ charset = unset
trim_trailing_whitespace = unset
insert_final_newline = unset
-[tools/mtmd/vendor/miniaudio.h]
+[vendor/miniaudio/miniaudio.h]
trim_trailing_whitespace = unset
insert_final_newline = unset


@@ -440,7 +440,7 @@ add_library(ggml
ggml/include/gguf.h
ggml/src/gguf.cpp
${GGML_SOURCES_CUDA})
-target_include_directories(ggml PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor)
+target_include_directories(ggml PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor)
target_compile_features(ggml PUBLIC c_std_11) # don't bump
target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -449,7 +449,7 @@ target_compile_options(ggml PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-use_fast_math -e
add_library(ggml_v1
otherarch/ggml_v1.c
otherarch/ggml_v1.h)
-target_include_directories(ggml_v1 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor)
+target_include_directories(ggml_v1 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor)
target_compile_features(ggml_v1 PUBLIC c_std_11) # don't bump
target_link_libraries(ggml_v1 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
set_target_properties(ggml_v1 PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -459,7 +459,7 @@ add_library(ggml_v2
otherarch/ggml_v2.h
${GGML_V2_CUDA_SOURCES}
${GGML_V2_LEGACY_CUDA_SOURCES})
-target_include_directories(ggml_v2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor)
+target_include_directories(ggml_v2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor)
target_compile_features(ggml_v2 PUBLIC c_std_11) # don't bump
target_link_libraries(ggml_v2 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
set_target_properties(ggml_v2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -468,7 +468,7 @@ add_library(ggml_v3
otherarch/ggml_v3.c
otherarch/ggml_v3.h
${GGML_V3_CUDA_SOURCES})
-target_include_directories(ggml_v3 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor)
+target_include_directories(ggml_v3 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor)
target_compile_features(ggml_v3 PUBLIC c_std_11) # don't bump
target_link_libraries(ggml_v3 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
set_target_properties(ggml_v3 PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -487,42 +487,42 @@ add_library(common2
src/unicode-data.cpp
otherarch/utils.cpp
otherarch/utils.h)
-target_include_directories(common2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
+target_include_directories(common2 PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
target_compile_features(common2 PUBLIC cxx_std_17) # don't bump
target_link_libraries(common2 PRIVATE ggml ${LLAMA_EXTRA_LIBS})
set_target_properties(common2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
add_library(sdtype_adapter
otherarch/sdcpp/sdtype_adapter.cpp)
-target_include_directories(sdtype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
+target_include_directories(sdtype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
target_compile_features(sdtype_adapter PUBLIC cxx_std_17) # don't bump
target_link_libraries(sdtype_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
set_target_properties(sdtype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
add_library(whisper_adapter
otherarch/whispercpp/whisper_adapter.cpp)
-target_include_directories(whisper_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/whispercpp ./tools ./common)
+target_include_directories(whisper_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor ./otherarch/whispercpp ./tools ./common)
target_compile_features(whisper_adapter PUBLIC cxx_std_17) # don't bump
target_link_libraries(whisper_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
set_target_properties(whisper_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
add_library(tts_adapter
otherarch/tts_adapter.cpp)
-target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./tools ./common)
+target_include_directories(tts_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor ./tools ./common)
target_compile_features(tts_adapter PUBLIC cxx_std_17) # don't bump
target_link_libraries(tts_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
set_target_properties(tts_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
add_library(embeddings_adapter
otherarch/embeddings_adapter.cpp)
-target_include_directories(embeddings_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./tools ./common)
+target_include_directories(embeddings_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor ./tools ./common)
target_compile_features(embeddings_adapter PUBLIC cxx_std_17) # don't bump
target_link_libraries(embeddings_adapter PRIVATE common2 ggml ${LLAMA_EXTRA_LIBS})
set_target_properties(embeddings_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
add_library(gpttype_adapter
gpttype_adapter.cpp)
-target_include_directories(gpttype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
+target_include_directories(gpttype_adapter PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
target_compile_features(gpttype_adapter PUBLIC cxx_std_17) # don't bump
target_link_libraries(gpttype_adapter PRIVATE common2 ggml ggml_v1 ggml_v2 ggml_v3 ${LLAMA_EXTRA_LIBS})
set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -530,7 +530,7 @@ set_target_properties(gpttype_adapter PROPERTIES POSITION_INDEPENDENT_CODE ON)
if (LLAMA_CUBLAS)
set(TARGET koboldcpp_cublas)
add_library(${TARGET} SHARED expose.cpp expose.h)
-target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
+target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
target_compile_features(${TARGET} PUBLIC cxx_std_17) # don't bump
set_target_properties(${TARGET} PROPERTIES PREFIX "")
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_cublas")
@@ -550,7 +550,7 @@ endif()
if (LLAMA_HIPBLAS)
set(TARGET koboldcpp_hipblas)
add_library(${TARGET} SHARED expose.cpp expose.h)
-target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./tools/mtmd/vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
+target_include_directories(${TARGET} PUBLIC . ./ggml/include ./ggml/src ./ggml/src/ggml-cpu ./include ./otherarch ./otherarch/tools ./vendor/stb ./vendor ./otherarch/sdcpp ./otherarch/sdcpp/thirdparty ./tools ./common)
target_compile_features(${TARGET} PUBLIC cxx_std_17) # don't bump
set_target_properties(${TARGET} PROPERTIES PREFIX "")
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME "koboldcpp_hipblas")


@@ -51,8 +51,8 @@ ifdef KCPP_DEBUG
CFLAGS = -g -O0
CXXFLAGS = -g -O0
endif
-CFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./tools/mtmd/vendor -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_AARCH64
+CFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./vendor -I./vendor/stb -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_AARCH64
-CXXFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./tools/mtmd/vendor -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c++17 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_AARCH64
+CXXFLAGS += -I. -Iggml/include -Iggml/src -Iggml/src/ggml-cpu -Iinclude -Isrc -I./common -I./vendor -I./vendor/stb -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./otherarch/sdcpp -I./otherarch/sdcpp/thirdparty -I./include/vulkan -O3 -fno-finite-math-only -std=c++17 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE -DGGML_USE_CPU -DGGML_USE_CPU_AARCH64
ifndef KCPP_DEBUG
CFLAGS += -DNDEBUG -s
CXXFLAGS += -DNDEBUG -s


@@ -1,7 +1,9 @@
-#include "gguf.h" // for reading GGUF splits
#include "arg.h"
#include "chat.h"
#include "common.h"
+#include "gguf.h" // for reading GGUF splits
+#include "json-schema-to-grammar.h"
#include "log.h"
#include "sampling.h"
#include "chat.h"
@@ -16,6 +18,9 @@
#include <windows.h>
#endif
+#define JSON_ASSERT GGML_ASSERT
+#include <nlohmann/json.hpp>
#include <algorithm>
#include <climits>
#include <cstdarg>
@@ -35,8 +40,6 @@
#include <future>
#endif
-#include "json-schema-to-grammar.h"
using json = nlohmann::ordered_json;
std::initializer_list<enum llama_example> mmproj_examples = {


@@ -2,9 +2,10 @@
#include "chat.h"
#include "json-partial.h"
-#include "json.hpp"
#include "regex-partial.h"
+#include <nlohmann/json.hpp>
#include <optional>
#include <string>
#include <vector>


@@ -1,6 +1,7 @@
#include "chat.h"
#include "chat-parser.cpp"
#include "common.h"
+#include "json-partial.h"
#include "json-schema-to-grammar.h"
#include "log.h"
#include "json-partial.cpp"
@@ -16,7 +17,6 @@
#include <string>
#include <vector>
static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
auto time = std::chrono::system_clock::to_time_t(now);
auto local_time = *std::localtime(&time);


@@ -11,7 +11,7 @@
#include "log.cpp"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
-#include "json.hpp"
+#include <nlohmann/json.hpp>
#include "json-schema-to-grammar.cpp"
#include "llama.h"
#include "chat.cpp"


@@ -1,9 +1,10 @@
-#include <json-partial.h>
-#include "ggml.h"
-#include "log.h"
-#include <string>
-#include <json.hpp>
+#include "json-partial.h"
+#include "log.h"
+#include <nlohmann/json.hpp>
+#include <string>
using json = nlohmann::ordered_json;


@@ -1,5 +1,6 @@
#pragma once
-#include <json.hpp>
+#include <nlohmann/json.hpp>
// Healing marker (empty if the JSON was fully parsed / wasn't healed).
struct common_healing_marker {


@@ -1,8 +1,9 @@
#include "json-schema-to-grammar.h"
#include "common.h"
+#include <nlohmann/json.hpp>
#include <algorithm>
#include <fstream>
#include <map>
#include <regex>
#include <sstream>


@@ -1,9 +1,9 @@
#pragma once
-#include "ggml.h"
-// Change JSON_ASSERT from assert() to GGML_ASSERT:
-#define JSON_ASSERT GGML_ASSERT
-#include "json.hpp"
+#include <nlohmann/json_fwd.hpp>
+#include <functional>
+#include <string>
std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
bool force_gbnf = false);


@@ -523,15 +523,15 @@ class TextModel(ModelBase):
self.gguf_writer.add_context_length(n_ctx)
logger.info(f"gguf: context length = {n_ctx}")
-if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None:
+if (n_embd := self.find_hparam(["hidden_size", "n_embd", "dim"], optional=True)) is not None:
self.gguf_writer.add_embedding_length(n_embd)
logger.info(f"gguf: embedding length = {n_embd}")
-if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
+if (n_ff := self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None:
self.gguf_writer.add_feed_forward_length(n_ff)
logger.info(f"gguf: feed forward length = {n_ff}")
-if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
+if (n_head := self.find_hparam(["num_attention_heads", "n_head", "n_heads"], optional=True)) is not None:
self.gguf_writer.add_head_count(n_head)
logger.info(f"gguf: head count = {n_head}")
@@ -674,12 +674,12 @@ class TextModel(ModelBase):
if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
# ref: https://huggingface.co/tiiuae/falcon-7b
res = "falcon"
if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
# ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
res = "falcon3"
if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
# ref: https://huggingface.co/BAAI/bge-small-en-v1.5
res = "bert-bge"
if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
# ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
res = "falcon3"
if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7":
# ref: https://huggingface.co/BAAI/bge-large-zh-v1.5
res = "bert-bge-large"
@@ -731,9 +731,6 @@ class TextModel(ModelBase):
if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
res = "jina-v2-code"
-if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b" or chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
-# ref: https://huggingface.co/THUDM/glm-4-9b-chat
-res = "chatglm-bpe"
if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
# ref: https://huggingface.co/LumiOpen/Viking-7B
res = "viking"
@@ -764,9 +761,6 @@ class TextModel(ModelBase):
if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450":
# ref: https://huggingface.co/facebook/chameleon-7b
res = "chameleon"
-if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
-# ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
-res = "minerva-7b"
if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
# ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
res = "roberta-bpe"
@@ -797,15 +791,24 @@ class TextModel(ModelBase):
if chkhsh == "d353350c764d8c3b39c763113960e4fb4919bea5fbf208a0e3b22e8469dc7406":
# ref: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct
res = "llama4"
-if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
-# ref: https://huggingface.co/THUDM/glm-4-9b-hf
-res = "glm4"
if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3":
# ref: https://huggingface.co/mistral-community/pixtral-12b
res = "pixtral"
if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec":
# ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
res = "seed-coder"
+if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+# ref: https://huggingface.co/THUDM/glm-4-9b-chat
+res = "chatglm-bpe"
+if chkhsh == "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516":
+# ref: https://huggingface.co/THUDM/glm-4-9b-chat
+res = "chatglm-bpe"
+if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
+# ref: https://huggingface.co/THUDM/glm-4-9b-hf
+res = "glm4"
+if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
+# ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
+res = "minerva-7b"
if res is None:
logger.warning("\n")
@@ -1044,6 +1047,10 @@ class TextModel(ModelBase):
special_vocab.chat_template = "rwkv-world"
# hack: Add '\n\n' as the EOT token to make it chat normally
special_vocab._set_special_token("eot", 261)
+# hack: Override these as they have already been set (incorrectly)
+special_vocab.special_token_ids["bos"] = 0
+special_vocab.special_token_ids["eos"] = 0
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
@@ -3690,11 +3697,21 @@ class BertModel(TextModel):
super().__init__(*args, **kwargs)
self.vocab_size = None
+if cls_out_labels := self.hparams.get("id2label"):
+if len(cls_out_labels) == 2 and cls_out_labels[0] == "LABEL_0":
+# Remove dummy labels added by AutoConfig
+cls_out_labels = None
+self.cls_out_labels = cls_out_labels
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_causal_attention(False)
self._try_set_pooling_type()
+if self.cls_out_labels:
+key_name = gguf.Keys.Classifier.OUTPUT_LABELS.format(arch = gguf.MODEL_ARCH_NAMES[self.model_arch])
+self.gguf_writer.add_array(key_name, [v for k, v in sorted(self.cls_out_labels.items())])
def set_vocab(self):
tokens, toktypes, tokpre = self.get_vocab_base()
self.vocab_size = len(tokens)
@@ -3745,12 +3762,13 @@ class BertModel(TextModel):
if name.startswith("cls.seq_relationship"):
return []
+if self.cls_out_labels:
# For BertForSequenceClassification (direct projection layer)
if name == "classifier.weight":
name = "classifier.out_proj.weight"
if name == "classifier.bias":
name = "classifier.out_proj.bias"
return [(self.map_tensor_name(name), data_torch)]
@@ -3771,44 +3789,93 @@ class BertModel(TextModel):
from sentencepiece import sentencepiece_model_pb2 as model
tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
+tokenizer_json = {}
+tokenizer_config_json = {}
if not tokenizer_path.is_file():
-raise FileNotFoundError(f"File not found: {tokenizer_path}")
-sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
-sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
-assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
-add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
-remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
-precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
-tokenizer = SentencePieceProcessor()
-tokenizer.LoadFromFile(str(tokenizer_path))
-vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+tokenizer_path = self.dir_model / 'tokenizer.json'
+tokenizer_config_path = self.dir_model / 'tokenizer_config.json'
+if not tokenizer_path.is_file():
+raise FileNotFoundError(f"File not found: {tokenizer_path}")
+from base64 import b64decode
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+with open(tokenizer_path, "r", encoding="utf-8") as fp:
+tokenizer_json = json.load(fp)
+if tokenizer_config_path.is_file():
+with open(tokenizer_config_path, "r", encoding="utf-8") as fp:
+tokenizer_config_json = json.load(fp)
+add_prefix = tokenizer.add_prefix_space
+remove_whitespaces = tokenizer.clean_up_tokenization_spaces
+precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
+vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size)
+else:
+sentencepiece_model = model.ModelProto() # pyright: ignore[reportAttributeAccessIssue]
+sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
+add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+tokenizer = SentencePieceProcessor()
+tokenizer.LoadFromFile(str(tokenizer_path))
+vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
scores: list[float] = [-10000.0] * vocab_size
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+if isinstance(tokenizer, SentencePieceProcessor):
for token_id in range(tokenizer.vocab_size()):
piece = tokenizer.IdToPiece(token_id)
text = piece.encode("utf-8")
score = tokenizer.GetScore(token_id)
toktype = SentencePieceTokenTypes.NORMAL
if tokenizer.IsUnknown(token_id):
toktype = SentencePieceTokenTypes.UNKNOWN
elif tokenizer.IsControl(token_id):
toktype = SentencePieceTokenTypes.CONTROL
elif tokenizer.IsUnused(token_id):
toktype = SentencePieceTokenTypes.UNUSED
elif tokenizer.IsByte(token_id):
toktype = SentencePieceTokenTypes.BYTE
tokens[token_id] = text
scores[token_id] = score
toktypes[token_id] = toktype
+else:
+added_vocab = tokenizer.get_added_vocab()
+unk_token = tokenizer_config_json.get("unk_token")
+unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))
+for token_id in range(vocab_size):
+piece = tokenizer._convert_id_to_token(token_id)
+text = piece.encode("utf-8")
+score = tokenizer_json["model"]["vocab"][token_id][1]
+toktype = SentencePieceTokenTypes.NORMAL
+if token_id == unk_token_id:
+toktype = SentencePieceTokenTypes.UNKNOWN
+elif token_id in tokenizer.all_special_ids:
+toktype = SentencePieceTokenTypes.CONTROL
+elif token_id in added_vocab.values():
+toktype = SentencePieceTokenTypes.USER_DEFINED
+# No reliable way to detect this, but jina doesn't have any
+# elif tokenizer.IsByte(token_id):
+# toktype = SentencePieceTokenTypes.BYTE
+tokens[token_id] = text
+scores[token_id] = score
+toktypes[token_id] = toktype
if vocab_size > len(tokens):
pad_count = vocab_size - len(tokens)
@@ -3818,15 +3885,16 @@ class BertModel(TextModel):
scores.append(-1000.0)
toktypes.append(SentencePieceTokenTypes.UNUSED)
+if isinstance(tokenizer, SentencePieceProcessor):
# realign tokens (see HF tokenizer code)
tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1]
toktypes = [
SentencePieceTokenTypes.CONTROL,
SentencePieceTokenTypes.CONTROL,
SentencePieceTokenTypes.CONTROL,
SentencePieceTokenTypes.UNKNOWN,
] + toktypes[3:-1]
self.gguf_writer.add_tokenizer_model("t5")
self.gguf_writer.add_tokenizer_pre("default")
@@ -3846,7 +3914,27 @@ class BertModel(TextModel):
self.gguf_writer.add_add_eos_token(True)
-@ModelBase.register("RobertaModel")
+@ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification")
+class DistilBertModel(BertModel):
+model_arch = gguf.MODEL_ARCH.BERT
+def set_gguf_parameters(self):
+self.gguf_writer.add_layer_norm_eps(1e-12)
+logger.info("gguf: layer norm epsilon = 1e-12")
+super().set_gguf_parameters()
+def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+if name.startswith("distilbert."):
+name = name[11:]
+# These layers act as MLM head, so we don't need them
+if name.startswith("vocab_"):
+return []
+return super().modify_tensors(data_torch, name, bid)
+@ModelBase.register("RobertaModel", "RobertaForSequenceClassification")
class RobertaModel(BertModel):
model_arch = gguf.MODEL_ARCH.BERT


@@ -1,28 +1,6 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-# This script downloads the tokenizer models of the specified models from Huggingface and
-# generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
-#
-# This is necessary in order to analyze the type of pre-tokenizer used by the model and
-# provide the necessary information to llama.cpp via the GGUF header in order to implement
-# the same pre-tokenizer.
-#
-# ref: https://github.com/ggml-org/llama.cpp/pull/6920
-#
-# Instructions:
-#
-# - Add a new model to the "models" list
-# - Run the script with your huggingface token:
-#
-# python3 convert_hf_to_gguf_update.py <huggingface_token>
-#
-# - The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
-# - Update llama.cpp with the new pre-tokenizer if necessary
-#
-# TODO: generate tokenizer tests for llama.cpp
-#
import logging
import os
import pathlib
@@ -32,6 +10,7 @@ import requests
import sys
import json
import shutil
+import argparse
from hashlib import sha256
from enum import IntEnum, auto
@@ -41,6 +20,11 @@ logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("convert_hf_to_gguf_update")
sess = requests.Session()
+convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
+convert_py = convert_py_pth.read_text(encoding="utf-8")
+hf_token_pth = pathlib.Path.home() / ".cache" / "huggingface" / "token"
+hf_token = hf_token_pth.read_text(encoding="utf-8").strip() if hf_token_pth.exists() else None
class TOKENIZER_TYPE(IntEnum):
SPM = auto()
@@ -49,20 +33,49 @@ class TOKENIZER_TYPE(IntEnum):
UGM = auto()
+DOC_STRING = """
+This script downloads the tokenizer models of the specified models from Huggingface and
+generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
+/!\\ It is intended to be used by contributors and is not meant to be run by end users
+This is necessary in order to analyze the type of pre-tokenizer used by the model and
+provide the necessary information to llama.cpp via the GGUF header in order to implement
+the same pre-tokenizer.
+ref: https://github.com/ggml-org/llama.cpp/pull/6920
+Instructions:
+- Add a new model to the "models" list
+- Run the script with your huggingface token
+By default, token will be read from ~/.cache/huggingface/token
+- The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
+- Update llama.cpp with the new pre-tokenizer if necessary
+"""
+# TODO: generate tokenizer tests for llama.cpp
+parser = argparse.ArgumentParser(description=DOC_STRING, formatter_class=argparse.RawTextHelpFormatter)
+parser.add_argument(
+"--full", action="store_true",
+help="download full list of models - make sure you have access to all of them",
+)
+parser.add_argument(
+"hf_token",
+help="optional HF token",
+nargs="?",
+)
+args = parser.parse_args()
+hf_token = args.hf_token if args.hf_token is not None else hf_token
+if hf_token is None:
+logger.error("HF token is required. Please provide it as an argument or set it in ~/.cache/huggingface/token")
+sys.exit(1)
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
-if len(sys.argv) == 2:
-token = sys.argv[1]
-if not token.startswith("hf_"):
-logger.info("Huggingface token seems invalid")
-logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
-sys.exit(1)
-else:
-logger.info("Usage: python convert_hf_to_gguf_update.py <huggingface_token>")
-sys.exit(1)
# TODO: add models here, base models preferred
models = [
{"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
@@ -103,7 +116,6 @@ models = [
{"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
{"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
{"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
-{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
{"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
{"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
{"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
@@ -114,11 +126,19 @@ models = [
{"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
{"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
{"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
-{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
{"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
{"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
]
+# some models are known to be broken upstream, so we will skip them as exceptions
+pre_computed_hashes = [
+# chatglm-bpe has 2 hashes, why?
+{"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b"},
+{"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
+{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
+{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
+]
def download_file_with_auth(url, token, save_path):
headers = {"Authorization": f"Bearer {token}"}
@@ -169,9 +189,29 @@ def download_model(model):
if os.path.isfile(save_path):
logger.info(f"{name}: File {save_path} already exists - skipping")
continue
-download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
+download_file_with_auth(f"{repo}/resolve/main/{file}", hf_token, save_path)
+# get list of existing models and chkhsh from the convert_hf_to_gguf.py file
+# returns mapping res --> chkhsh
+def get_existing_models(convert_py):
+pattern = r'if chkhsh == "([a-f0-9]{64})":\s*\n\s*.*\s*res = "([^"]+)"'
+matches = re.findall(pattern, convert_py)
+output = {}
+for chkhsh, res in matches:
+output[res] = chkhsh
+return output
+existing_models = {}
+all_models = models.copy()
+if not args.full:
+# Filter out models that already exist in convert_hf_to_gguf.py
+existing_models = get_existing_models(convert_py)
+all_models = models.copy()
+models = [model for model in all_models if model["name"] not in existing_models]
+logging.info(f"Downloading {len(models)} models...")
for model in models:
try:
download_model(model)
@@ -182,9 +222,10 @@ for model in models:
# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
src_ifs = ""
-for model in models:
+for model in [*all_models, *pre_computed_hashes]:
name = model["name"]
tokt = model["tokt"]
+chkhsh = model.get("chkhsh")
if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
continue
@@ -195,35 +236,44 @@ for model in models:
continue
# create the tokenizer
-try:
-if name == "t5":
-tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
-else:
-tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
-except OSError as e:
-logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
-continue # Skip to the next model if the tokenizer can't be loaded
+if chkhsh is not None:
+# if the model has a pre-computed hash, use it
+logger.info(f"Using pre-computed hash for model {name}: {chkhsh}")
+elif name in existing_models:
+# if the model already exists in convert_hf_to_gguf.py, skip compute hash
+chkhsh = existing_models[name]
+else:
+# otherwise, compute the hash of the tokenizer
+try:
+logger.info(f"Loading tokenizer from {f'models/tokenizers/{name}'}...")
+if name == "t5":
+tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
+else:
+tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+except OSError as e:
+logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
+continue # Skip to the next model if the tokenizer can't be loaded
chktok = tokenizer.encode(CHK_TXT)
chkhsh = sha256(str(chktok).encode()).hexdigest()
logger.info(f"model: {name}")
logger.info(f"tokt: {tokt}")
logger.info(f"repo: {model['repo']}")
logger.info(f"chktok: {chktok}")
logger.info(f"chkhsh: {chkhsh}")
# print the "pre_tokenizer" content from the tokenizer.json
with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
cfg = json.load(f)
normalizer = cfg["normalizer"]
logger.info("normalizer: " + json.dumps(normalizer, indent=4))
pre_tokenizer = cfg["pre_tokenizer"]
logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
if "ignore_merges" in cfg["model"]:
logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
logger.info("")
src_ifs += f" if chkhsh == \"{chkhsh}\":\n"
src_ifs += f" # ref: {model['repo']}\n"
@@ -271,8 +321,6 @@ src_func = f"""
return res
"""
-convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
-convert_py = convert_py_pth.read_text(encoding="utf-8")
convert_py = re.sub(
r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
lambda m: m.group(1) + src_func + m.group(3),
@@ -367,6 +415,10 @@ for model in models:
logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
continue # Skip this model and continue with the next one in the loop
+if not os.path.exists(f"models/ggml-vocab-{name}.gguf"):
+logger.info(f"Skip vocab files for model {name}, no GGUF file found")
+continue
with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
for text in tests:
f.write(f"{text}")


@@ -1,109 +0,0 @@
# Multimodal
llama.cpp supports multimodal input via `libmtmd`. Currently, there are 2 tools support this feature:
- [llama-mtmd-cli](../tools/mtmd/README.md)
- [llama-server](../tools/server/README.md) via OpenAI-compatible `/chat/completions` API
Currently, we support **image** and **audio** input. Audio is highly experimental and may have reduced quality.
To enable it, you can use one of the 2 methods below:
- Use `-hf` option with a supported model (see a list of pre-quantized model below)
- To load a model using `-hf` while disabling multimodal, use `--no-mmproj`
- To load a model using `-hf` while using a custom mmproj file, use `--mmproj local_file.gguf`
- Use `-m model.gguf` option with `--mmproj file.gguf` to specify text and multimodal projector respectively
By default, multimodal projector will be offloaded to GPU. To disable this, add `--no-mmproj-offload`
For example:
```sh
# simple usage with CLI
llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF
# simple usage with server
llama-server -hf ggml-org/gemma-3-4b-it-GGUF
# using local file
llama-server -m gemma-3-4b-it-Q4_K_M.gguf --mmproj mmproj-gemma-3-4b-it-Q4_K_M.gguf
# no GPU offload
llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload
```
## Pre-quantized models
These are ready-to-use models, most of them come with `Q4_K_M` quantization by default. They can be found at the Hugging Face page of the ggml-org: https://huggingface.co/collections/ggml-org/multimodal-ggufs-68244e01ff1f39e5bebeeedc
Replaces the `(tool_name)` with the name of binary you want to use. For example, `llama-mtmd-cli` or `llama-server`
NOTE: some models may require large context window, for example: `-c 8192`
**Vision models**:
```sh
# Gemma 3
(tool_name) -hf ggml-org/gemma-3-4b-it-GGUF
(tool_name) -hf ggml-org/gemma-3-12b-it-GGUF
(tool_name) -hf ggml-org/gemma-3-27b-it-GGUF
# SmolVLM
(tool_name) -hf ggml-org/SmolVLM-Instruct-GGUF
(tool_name) -hf ggml-org/SmolVLM-256M-Instruct-GGUF
(tool_name) -hf ggml-org/SmolVLM-500M-Instruct-GGUF
(tool_name) -hf ggml-org/SmolVLM2-2.2B-Instruct-GGUF
(tool_name) -hf ggml-org/SmolVLM2-256M-Video-Instruct-GGUF
(tool_name) -hf ggml-org/SmolVLM2-500M-Video-Instruct-GGUF
# Pixtral 12B
(tool_name) -hf ggml-org/pixtral-12b-GGUF
# Qwen 2 VL
(tool_name) -hf ggml-org/Qwen2-VL-2B-Instruct-GGUF
(tool_name) -hf ggml-org/Qwen2-VL-7B-Instruct-GGUF
# Qwen 2.5 VL
(tool_name) -hf ggml-org/Qwen2.5-VL-3B-Instruct-GGUF
(tool_name) -hf ggml-org/Qwen2.5-VL-7B-Instruct-GGUF
(tool_name) -hf ggml-org/Qwen2.5-VL-32B-Instruct-GGUF
(tool_name) -hf ggml-org/Qwen2.5-VL-72B-Instruct-GGUF
# Mistral Small 3.1 24B (IQ2_M quantization)
(tool_name) -hf ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF
# InternVL 2.5 and 3
(tool_name) -hf ggml-org/InternVL2_5-1B-GGUF
(tool_name) -hf ggml-org/InternVL2_5-4B-GGUF
(tool_name) -hf ggml-org/InternVL3-1B-Instruct-GGUF
(tool_name) -hf ggml-org/InternVL3-2B-Instruct-GGUF
(tool_name) -hf ggml-org/InternVL3-8B-Instruct-GGUF
(tool_name) -hf ggml-org/InternVL3-14B-Instruct-GGUF
# Llama 4 Scout
(tool_name) -hf ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF
# Moondream2 20250414 version
(tool_name) -hf ggml-org/moondream2-20250414-GGUF
```
**Audio models**:
```sh
# Ultravox 0.5
(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF
(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF
# Qwen2-Audio and SeaLLM-Audio
# note: no pre-quantized GGUF this model, as they have very poor result
# ref: https://github.com/ggml-org/llama.cpp/pull/13760
```
**Mixed modalities**:
```sh
# Qwen2.5 Omni
# Capabilities: audio input, vision input
(tool_name) -hf ggml-org/Qwen2.5-Omni-3B-GGUF
(tool_name) -hf ggml-org/Qwen2.5-Omni-7B-GGUF
```


@@ -1,28 +0,0 @@
#!/bin/bash
# MIT license
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: MIT
# If you want more control, DPC++ Allows selecting a specific device through the
# following environment variable
#export ONEAPI_DEVICE_SELECTOR="level_zero:0"
source /opt/intel/oneapi/setvars.sh
#export GGML_SYCL_DEBUG=1
#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
MODEL_FILE=models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
NGL=99 # Layers offloaded to the GPU. If the device runs out of memory, reduce this value according to the model you are using.
CONTEXT=4096
if [ $# -gt 0 ]; then
GGML_SYCL_DEVICE=$1
echo "Using $GGML_SYCL_DEVICE as the main GPU"
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
else
#use multiple GPUs with same max compute units
ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -c ${CONTEXT}
fi


@@ -1,9 +0,0 @@
:: MIT license
:: Copyright (C) 2024 Intel Corporation
:: SPDX-License-Identifier: MIT
set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
.\build\bin\llama-cli.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -p %INPUT2% -n 400 -e -ngl 99


@@ -1,5 +0,0 @@
set(TARGET llama-finetune)
add_executable(${TARGET} finetune.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)


@@ -1,17 +0,0 @@
# llama.cpp/examples/training
This directory contains examples related to language model training using llama.cpp/GGML.
So far finetuning is technically functional (for FP32 models and limited hardware setups) but the code is very much WIP.
Finetuning of Stories 260K and LLaMA 3.2 1b seems to work with 24 GB of memory.
**For CPU training, compile llama.cpp without any additional backends such as CUDA.**
**For CUDA training, use the maximum number of GPU layers.**
Proof of concept:
``` sh
export model_name=llama_3.2-1b && export quantization=f32
./build/bin/llama-finetune --file wikitext-2-raw/wiki.test.raw -ngl 999 --model models/${model_name}-${quantization}.gguf -c 512 -b 512 -ub 512
./build/bin/llama-perplexity --file wikitext-2-raw/wiki.test.raw -ngl 999 --model finetuned-model.gguf
```
The perplexity value of the finetuned model should be lower after training on the test set for 2 epochs.


@@ -1,96 +0,0 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <vector>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
int main(int argc, char ** argv) {
common_params params;
params.escape = false;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
return 1;
}
if (params.use_mmap) {
LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n", __func__);
params.use_mmap = false;
}
if (params.cache_type_k != GGML_TYPE_F32) {
LOG_INF("%s: force changing k cache type to f32 due to a lack of f16 support for OUT_PROD\n", __func__);
params.cache_type_k = GGML_TYPE_F32;
}
if (params.cache_type_v != GGML_TYPE_F32) {
LOG_INF("%s: force changing v cache type to f32 due to a lack of f16 support for OUT_PROD\n", __func__);
params.cache_type_v = GGML_TYPE_F32;
}
common_init();
llama_backend_init();
llama_numa_init(params.numa);
// load the model and apply lora adapter, if any
common_init_result llama_init = common_init_from_params(params);
llama_model_ptr & model = llama_init.model;
llama_context_ptr & ctx = llama_init.context;
if (model == NULL) {
LOG_ERR("%s: unable to load model\n", __func__);
return 1;
}
// print system information
{
LOG_INF("\n");
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
}
constexpr float val_split = 0.05f;
std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get())/2);
struct ggml_opt_optimizer_params optimizer_params = ggml_opt_get_default_optimizer_params(nullptr);
optimizer_params.adamw.alpha = 1e-7f; // learning rate
struct llama_opt_params lopt_params {
/*n_ctx_train =*/ 0,
/*param_filter =*/ llama_opt_param_filter_all,
/*param_filter_ud =*/ nullptr,
/*get_opt_pars =*/ ggml_opt_get_constant_optimizer_params,
/*get_opt_pars_ud =*/ &optimizer_params,
};
llama_opt_init(ctx.get(), model.get(), lopt_params);
const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - val_split);
ggml_opt_result_t result_train = ggml_opt_result_init();
ggml_opt_result_t result_eval = ggml_opt_result_init();
for (int epoch = 0; epoch < 2; ++epoch) {
llama_opt_epoch(ctx.get(), dataset, result_train, result_eval, idata_split,
ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
fprintf(stderr, "\n");
ggml_opt_result_reset(result_train);
ggml_opt_result_reset(result_eval);
}
ggml_opt_result_free(result_train);
ggml_opt_result_free(result_eval);
llama_model_save_to_file(model.get(), "finetuned-model.gguf");
llama_backend_free();
return 0;
}

View file

@ -1346,7 +1346,10 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
// allocate graph // allocate graph
if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) { if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
// the re-allocation may cause the split inputs to be moved to a different address // the re-allocation may cause the split inputs to be moved to a different address
ggml_backend_sched_synchronize(sched); // synchronize without ggml_backend_sched_synchronize to avoid changing cur_copy
for (int i = 0; i < sched->n_backends; i++) {
ggml_backend_synchronize(sched->backends[i]);
}
#ifndef NDEBUG #ifndef NDEBUG
GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed); GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
#endif #endif
@ -1570,7 +1573,6 @@ bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgra
ggml_backend_sched_split_graph(sched, graph); ggml_backend_sched_split_graph(sched, graph);
if (!ggml_backend_sched_alloc_splits(sched)) { if (!ggml_backend_sched_alloc_splits(sched)) {
return false; return false;
} }
@ -1604,9 +1606,12 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
for (int i = 0; i < sched->n_backends; i++) { for (int i = 0; i < sched->n_backends; i++) {
ggml_backend_synchronize(sched->backends[i]); ggml_backend_synchronize(sched->backends[i]);
} }
// reset the current copy to 0 so that the graphs will be similar during generation if (!sched->is_alloc) {
// necessary for CUDA graphs // if the graph is not already allocated, always use copy 0 after a synchronization
sched->cur_copy = 0; // this ensures that during generation the same copy is used every time,
// which avoids changes in the graph that could cause CUDA or other graphs to be disabled
sched->cur_copy = 0;
}
} }
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) { void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {

View file

@ -6996,7 +6996,11 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(n % QK_K == 0); assert(n % QK_K == 0);
#ifdef __ARM_FEATURE_MATMUL_INT8
assert((nrc == 2) || (nrc == 1));
#else
assert(nrc == 1); assert(nrc == 1);
#endif
UNUSED(nrc); UNUSED(nrc);
UNUSED(bx); UNUSED(bx);
UNUSED(by); UNUSED(by);
@ -7013,6 +7017,146 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
uint32_t utmp[4]; uint32_t utmp[4];
#if defined(__ARM_FEATURE_MATMUL_INT8)
if (nrc == 2) {
const block_q4_K * GGML_RESTRICT x0 = x;
const block_q4_K * GGML_RESTRICT x1 = (const block_q4_K *) ((const uint8_t *)vx + bx);
const block_q8_K * GGML_RESTRICT y0 = y;
const block_q8_K * GGML_RESTRICT y1 = (const block_q8_K *) ((const uint8_t *)vy + by);
const uint8x16_t m4b = vdupq_n_u8(0x0f);
float32x4_t vfsum = vdupq_n_f32(0.0f);
for (int i = 0; i < nb; ++i, ++x0, ++x1, ++y0, ++y1) {
const uint8_t * GGML_RESTRICT qx0 = x0->qs;
const uint8_t * GGML_RESTRICT qx1 = x1->qs;
const int8_t * GGML_RESTRICT qy0 = y0->qs;
const int8_t * GGML_RESTRICT qy1 = y1->qs;
// decode scales and mins
int8_t x0_scales[8], x1_scales[8];
int16x8_t x0_mins, x1_mins;
{
uint32_t scales_mins[3];
memcpy(scales_mins, x0->scales, 12);
const uint32_t mins_0_3 = scales_mins[1] & kmask1;
const uint32_t mins_4_7 = ((scales_mins[2] >> 4) & kmask2) | (((scales_mins[1] >> 6) & kmask3) << 4);
const uint32x2_t mins = {mins_0_3, mins_4_7};
x0_mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins)));
uint32_t scales[2];
scales[0] = scales_mins[0] & kmask1; // scales 0~3
scales[1] = (scales_mins[2] & kmask2) | (((scales_mins[0] >> 6) & kmask3) << 4); // scales 4~7
memcpy(x0_scales, scales, 8);
}
{
uint32_t scales_mins[3];
memcpy(scales_mins, x1->scales, 12);
const uint32_t mins_0_3 = scales_mins[1] & kmask1;
const uint32_t mins_4_7 = ((scales_mins[2] >> 4) & kmask2) | (((scales_mins[1] >> 6) & kmask3) << 4);
const uint32x2_t mins = {mins_0_3, mins_4_7};
x1_mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins)));
uint32_t scales[2];
scales[0] = scales_mins[0] & kmask1; // scales 0~3
scales[1] = (scales_mins[2] & kmask2) | (((scales_mins[0] >> 6) & kmask3) << 4); // scales 4~7
memcpy(x1_scales, scales, 8);
}
int32x4_t visum = {0};
// process 64 data points per iteration, 256 data points in total
for (int j = 0; j < QK_K / 64; ++j, qx0 += 32, qx1 += 32, qy0 += 64, qy1 += 64) {
const int8x16x4_t vy0 = vld1q_s8_x4(qy0);
const int8x16x4_t vy1 = vld1q_s8_x4(qy1);
int8x16_t vx0[4], vx1[4];
{
const uint8x16x2_t vv = vld1q_u8_x2(qx0);
vx0[0] = vreinterpretq_s8_u8(vandq_u8(vv.val[0], m4b));
vx0[1] = vreinterpretq_s8_u8(vandq_u8(vv.val[1], m4b));
vx0[2] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[0], 4));
vx0[3] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[1], 4));
}
{
const uint8x16x2_t vv = vld1q_u8_x2(qx1);
vx1[0] = vreinterpretq_s8_u8(vandq_u8(vv.val[0], m4b));
vx1[1] = vreinterpretq_s8_u8(vandq_u8(vv.val[1], m4b));
vx1[2] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[0], 4));
vx1[3] = vreinterpretq_s8_u8(vshrq_n_u8(vv.val[1], 4));
}
// process 32 data points (sharing the same block scale) per iteration
for (int k = 0; k < 2; ++k) {
const int blk = j * 2 + k;
const int32x4_t block_scale = {
x0_scales[blk],
x0_scales[blk],
x1_scales[blk],
x1_scales[blk],
};
int32x4_t vr = {0};
for (int l = 0; l < 2; ++l) {
const int idx = k * 2 + l;
const int64x2_t vx0_s64 = vreinterpretq_s64_s8(vx0[idx]);
const int64x2_t vx1_s64 = vreinterpretq_s64_s8(vx1[idx]);
const int64x2_t vy0_s64 = vreinterpretq_s64_s8(vy0.val[idx]);
const int64x2_t vy1_s64 = vreinterpretq_s64_s8(vy1.val[idx]);
const int8x16_t vx_l = vreinterpretq_s8_s64(vzip1q_s64(vx0_s64, vx1_s64));
const int8x16_t vx_h = vreinterpretq_s8_s64(vzip2q_s64(vx0_s64, vx1_s64));
const int8x16_t vy_l = vreinterpretq_s8_s64(vzip1q_s64(vy0_s64, vy1_s64));
const int8x16_t vy_h = vreinterpretq_s8_s64(vzip2q_s64(vy0_s64, vy1_s64));
vr = vmmlaq_s32(vr, vx_l, vy_l);
vr = vmmlaq_s32(vr, vx_h, vy_h);
}
// apply block scale, will NOT overflow
// block_scale * sum_256(int4*int8) <= 2^(8+8+4+8) = 28 bits
visum = vmlaq_s32(visum, vr, block_scale);
}
}
// adjust bias, apply superblock scale
{
int32_t bias[4];
// no obvious uplift from sve sdot-16, just use neon mul add
const int16x8_t y0_sums = vpaddq_s16(vld1q_s16(y0->bsums), vld1q_s16(y0->bsums+8));
const int16x8_t y1_sums = vpaddq_s16(vld1q_s16(y1->bsums), vld1q_s16(y1->bsums+8));
bias[0] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y0_sums), vget_low_s16(x0_mins)),
vmull_s16(vget_high_s16(y0_sums), vget_high_s16(x0_mins))));
bias[1] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y1_sums), vget_low_s16(x0_mins)),
vmull_s16(vget_high_s16(y1_sums), vget_high_s16(x0_mins))));
bias[2] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y0_sums), vget_low_s16(x1_mins)),
vmull_s16(vget_high_s16(y0_sums), vget_high_s16(x1_mins))));
bias[3] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y1_sums), vget_low_s16(x1_mins)),
vmull_s16(vget_high_s16(y1_sums), vget_high_s16(x1_mins))));
const float32x4_t dmins = {
GGML_FP16_TO_FP32(x0->dmin) * y0->d,
GGML_FP16_TO_FP32(x0->dmin) * y1->d,
GGML_FP16_TO_FP32(x1->dmin) * y0->d,
GGML_FP16_TO_FP32(x1->dmin) * y1->d,
};
vfsum = vmlsq_f32(vfsum, vcvtq_f32_s32(vld1q_s32(bias)), dmins);
const float32x4_t superblock_scale = {
GGML_FP16_TO_FP32(x0->d) * y0->d,
GGML_FP16_TO_FP32(x0->d) * y1->d,
GGML_FP16_TO_FP32(x1->d) * y0->d,
GGML_FP16_TO_FP32(x1->d) * y1->d,
};
vfsum = vmlaq_f32(vfsum, vcvtq_f32_s32(visum), superblock_scale);
}
}
// vfsum = ABCD -> ACBD
// AC -> s, BD -> (s+bs)
vfsum = vzip1q_f32(vfsum, vextq_f32(vfsum, vfsum, 2));
vst1_f32(s, vget_low_f32 (vfsum));
vst1_f32(s + bs, vget_high_f32(vfsum));
return;
}
#endif
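As a point of reference, each lane of vfsum above accumulates the usual Q4_K·Q8_K superblock dot product; a sketch of the quantity being computed (notation is illustrative: d and dmin are the superblock scales, s_b and m_b the eight 6-bit sub-block scales and mins, q the quantized values, and bsum_b the per-sub-block sums of the Q8_K quants):
```latex
\mathrm{dot}(x,y) \mathrel{+}= d_x d_y \sum_{b=0}^{7} s_b \sum_{i=0}^{31} q^{(x)}_{b,i}\, q^{(y)}_{b,i}
                   \;-\; d^{\mathrm{min}}_x d_y \sum_{b=0}^{7} m_b\, \mathrm{bsum}^{(y)}_b
```
In the code, the first term is visum scaled by superblock_scale and the second is the bias subtracted through dmins.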
#ifdef __ARM_FEATURE_SVE #ifdef __ARM_FEATURE_SVE
float sumf = 0; float sumf = 0;
for (int i = 0; i < nb; ++i) { for (int i = 0; i < nb; ++i) {

View file

@ -274,7 +274,11 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.from_float = quantize_row_q4_K, .from_float = quantize_row_q4_K,
.vec_dot = ggml_vec_dot_q4_K_q8_K, .vec_dot = ggml_vec_dot_q4_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K, .vec_dot_type = GGML_TYPE_Q8_K,
#if defined (__ARM_FEATURE_MATMUL_INT8)
.nrows = 2,
#else
.nrows = 1, .nrows = 1,
#endif
}, },
[GGML_TYPE_Q5_K] = { [GGML_TYPE_Q5_K] = {
.from_float = quantize_row_q5_K, .from_float = quantize_row_q5_K,

View file

@ -7633,39 +7633,83 @@ static void ggml_compute_forward_ssm_scan_f32(
const int ir1 = MIN(ir0 + dr, nr); const int ir1 = MIN(ir0 + dr, nr);
const int ir = ir1 - ir0; const int ir = ir1 - ir0;
for (int i3 = 0; i3 < n_s; ++i3) { #ifdef __ARM_FEATURE_SVE
for (int i2 = 0; i2 < n_t; ++i2) { for (int i3 = 0; i3 < n_s; ++i3) {
const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s} for (int i2 = 0; i2 < n_t; ++i2) {
const float * x = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s} const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s}
const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s} const float * x = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
const float * A = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner} const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s}
const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s} const float * A = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s} const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s}
float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s} const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s}
float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s} float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s}
// use the output as the source for the next token-wise iterations // use the output as the source for the next token-wise iterations
if (i2 > 0) { s0 = s; } if (i2 > 0) { s0 = s; }
// d_inner // d_inner
for (int i1 = 0; i1 < ir; ++i1) { for (int i1 = 0; i1 < ir; ++i1) {
// ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78 float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1]; float x_dt = x[i1] * dt_soft_plus;
float x_dt = x[i1] * dt_soft_plus; svfloat32_t vx_dt = GGML_F32_VEC_SET1(x_dt);
float sumf = 0.0f; svfloat32_t vdt_soft_plus = GGML_F32_VEC_SET1(dt_soft_plus);
// d_state svfloat32_t r1_vector = GGML_F32_VEC_ZERO;
for (int i0 = 0; i0 < nc; ++i0) {
int i = i0 + i1*nc; for (int64_t k = 0; k < nc; k += svcntw()) {
// state = prev_state * dA + dB * x svfloat32_t vA = GGML_F32_VEC_LOAD(&A[i1*nc + k]);
float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt); svfloat32_t vB = GGML_F32_VEC_LOAD(&B[k]);
// y = rowwise_dotprod(state, C) svfloat32_t vC = GGML_F32_VEC_LOAD(&C[k]);
sumf += state * C[i0]; svfloat32_t vs0 = GGML_F32_VEC_LOAD(&s0[i1*nc + k]);
s[i] = state;
svfloat32_t t1 = GGML_F32_VEC_MUL(vdt_soft_plus, vA);
t1 = exp_ps_sve(svptrue_b32(), t1);
svfloat32_t t2 = GGML_F32_VEC_MUL(vx_dt, vB);
vs0 = GGML_F32_VEC_FMA(vs0, t1, t2);
r1_vector = GGML_F32_VEC_ADD(GGML_F32_VEC_MUL(vs0, vC), r1_vector);
GGML_F32_VEC_STORE(&s[i1*nc + k], vs0);
}
y[i1] = GGML_F32xt_REDUCE_ONE(r1_vector);
} }
y[i1] = sumf;
} }
} }
} #else
for (int i3 = 0; i3 < n_s; ++i3) {
for (int i2 = 0; i2 < n_t; ++i2) {
const float * s0 = (const float *) ((const char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); // {d_state, d_inner, n_s}
const float * x = (const float *) ((const char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
const float * dt = (const float *) ((const char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1]) + i3*(src2->nb[2])); // {d_inner, n_t, n_s}
const float * A = (const float *) ((const char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner}
const float * B = (const float *) ((const char *) src4->data + i2*(src4->nb[1]) + i3*(src4->nb[2])); // {d_state, n_t, n_s}
const float * C = (const float *) ((const char *) src5->data + i2*(src5->nb[1]) + i3*(src5->nb[2])); // {d_state, n_t, n_s}
float * y = ( float *) (( char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1]) + i3*(src1->nb[2])); // {d_inner, n_t, n_s}
float * s = ( float *) (( char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[3]); // {d_state, d_inner, n_s}
// use the output as the source for the next token-wise iterations
if (i2 > 0) { s0 = s; }
// d_inner
for (int i1 = 0; i1 < ir; ++i1) {
// ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78
float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1];
float x_dt = x[i1] * dt_soft_plus;
float sumf = 0.0f;
// d_state
for (int i0 = 0; i0 < nc; ++i0) {
int i = i0 + i1*nc;
// state = prev_state * dA + dB * x
float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt);
// y = rowwise_dotprod(state, C)
sumf += state * C[i0];
s[i] = state;
}
y[i1] = sumf;
}
}
}
#endif
} }
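For reference, the SVE branch and the scalar branch above evaluate the same per-channel selective-scan recurrence; a sketch in the notation of the pointers used in the loop (Δ is the softplus-transformed dt, applied only when dt ≤ 20):
```latex
\Delta_t = \log\!\left(1 + e^{dt_t}\right), \qquad
s_{t,i} = s_{t-1,i}\, e^{\Delta_t A_i} + B_{t,i}\, x_t \Delta_t, \qquad
y_t = \sum_{i=0}^{d_{\mathrm{state}}-1} s_{t,i}\, C_{t,i}
```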
void ggml_compute_forward_ssm_scan( void ggml_compute_forward_ssm_scan(
@ -8070,6 +8114,14 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
#define GGML_F32X_MUL GGML_F32x16_MUL #define GGML_F32X_MUL GGML_F32x16_MUL
#define GGML_F32X_FMA GGML_F32x16_FMA #define GGML_F32X_FMA GGML_F32x16_FMA
#define WKV_VECTOR_SIZE 16 #define WKV_VECTOR_SIZE 16
#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
#define GGML_F32X GGML_F32xt
#define GGML_F32X_SET1 GGML_F32xt_SET1
#define GGML_F32X_LOAD GGML_F32xt_LOAD
#define GGML_F32X_STORE GGML_F32xt_STORE
#define GGML_F32X_MUL GGML_F32xt_MUL
#define GGML_F32X_FMA GGML_F32xt_FMA
#define WKV_VECTOR_SIZE 8
#elif defined(__ARM_NEON) && defined(__aarch64__) #elif defined(__ARM_NEON) && defined(__aarch64__)
#define GGML_F32X GGML_F32x4 #define GGML_F32X GGML_F32x4
#define GGML_F32X_SET1 GGML_F32x4_SET1 #define GGML_F32X_SET1 GGML_F32x4_SET1
@ -8080,8 +8132,14 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
#define WKV_VECTOR_SIZE 4 #define WKV_VECTOR_SIZE 4
#endif #endif
int wkv_vector_size;
#ifdef WKV_VECTOR_SIZE #ifdef WKV_VECTOR_SIZE
const int64_t vec_count = head_size / WKV_VECTOR_SIZE; #if defined(__ARM_FEATURE_SVE)
wkv_vector_size = svcntw();
#else
wkv_vector_size = WKV_VECTOR_SIZE;
#endif
const int64_t vec_count = head_size / wkv_vector_size;
for (int64_t t = 0; t < T; t++) { for (int64_t t = 0; t < T; t++) {
size_t t_offset = t * t_stride; size_t t_offset = t * t_stride;
@ -8111,7 +8169,7 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
GGML_F32X time_decay_vec = GGML_F32X_SET1(time_decay_val); GGML_F32X time_decay_vec = GGML_F32X_SET1(time_decay_val);
for (int64_t j = 0; j < vec_count; j++) { for (int64_t j = 0; j < vec_count; j++) {
size_t base_j = j * WKV_VECTOR_SIZE; size_t base_j = j * wkv_vector_size;
size_t t_h_j_offset = t_h_offset + base_j; size_t t_h_j_offset = t_h_offset + base_j;
size_t h_2d_i_j_offset = h_2d_i_offset + base_j; size_t h_2d_i_j_offset = h_2d_i_offset + base_j;
@ -8136,7 +8194,7 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
} }
// Handle remaining elements, this will not be used. // Handle remaining elements, this will not be used.
for (int64_t j = vec_count * WKV_VECTOR_SIZE; j < head_size; j++) { for (int64_t j = vec_count * wkv_vector_size; j < head_size; j++) {
size_t t_h_j_offset = t_h_offset + j; size_t t_h_j_offset = t_h_offset + j;
size_t h_2d_i_j_offset = h_2d_i_offset + j; size_t h_2d_i_j_offset = h_2d_i_offset + j;
float v_val = v[t_h_j_offset]; float v_val = v[t_h_j_offset];
@ -8272,6 +8330,14 @@ static void ggml_compute_forward_gla_f32(
#define GGML_F32X_MUL GGML_F32x16_MUL #define GGML_F32X_MUL GGML_F32x16_MUL
#define GGML_F32X_FMA GGML_F32x16_FMA #define GGML_F32X_FMA GGML_F32x16_FMA
#define GLA_VECTOR_SIZE 16 #define GLA_VECTOR_SIZE 16
#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
#define GGML_F32X GGML_F32xt
#define GGML_F32X_SET1 GGML_F32xt_SET1
#define GGML_F32X_LOAD GGML_F32xt_LOAD
#define GGML_F32X_STORE GGML_F32xt_STORE
#define GGML_F32X_MUL GGML_F32xt_MUL
#define GGML_F32X_FMA GGML_F32xt_FMA
#define GLA_VECTOR_SIZE 8
#elif defined(__ARM_NEON) && defined(__aarch64__) #elif defined(__ARM_NEON) && defined(__aarch64__)
#define GGML_F32X GGML_F32x4 #define GGML_F32X GGML_F32x4
#define GGML_F32X_SET1 GGML_F32x4_SET1 #define GGML_F32X_SET1 GGML_F32x4_SET1
@ -8282,8 +8348,14 @@ static void ggml_compute_forward_gla_f32(
#define GLA_VECTOR_SIZE 4 #define GLA_VECTOR_SIZE 4
#endif #endif
int gla_vector_size;
#ifdef GLA_VECTOR_SIZE #ifdef GLA_VECTOR_SIZE
const int64_t vec_count = head_size / GLA_VECTOR_SIZE; #if defined(__ARM_FEATURE_SVE)
gla_vector_size = svcntw();
#else
gla_vector_size = GLA_VECTOR_SIZE;
#endif
const int64_t vec_count = head_size / gla_vector_size;
for (int64_t t = 0; t < T; t++) { for (int64_t t = 0; t < T; t++) {
size_t t_offset = t * t_stride; size_t t_offset = t * t_stride;
@ -8310,7 +8382,7 @@ static void ggml_compute_forward_gla_f32(
GGML_F32X g_vec = GGML_F32X_SET1(g_val); GGML_F32X g_vec = GGML_F32X_SET1(g_val);
for (int64_t j = 0; j < vec_count; j++) { for (int64_t j = 0; j < vec_count; j++) {
size_t base_j = j * GLA_VECTOR_SIZE; size_t base_j = j * gla_vector_size;
size_t t_h_j_offset = t_h_offset + base_j; size_t t_h_j_offset = t_h_offset + base_j;
size_t h_2d_i_j_offset = h_2d_i_offset + base_j; size_t h_2d_i_j_offset = h_2d_i_offset + base_j;
@ -8334,7 +8406,7 @@ static void ggml_compute_forward_gla_f32(
} }
// Handle remaining elements, this will not be used. // Handle remaining elements, this will not be used.
for (int64_t j = vec_count * GLA_VECTOR_SIZE; j < head_size; j++) { for (int64_t j = vec_count * gla_vector_size; j < head_size; j++) {
size_t t_h_j_offset = t_h_offset + j; size_t t_h_j_offset = t_h_offset + j;
size_t h_2d_i_j_offset = h_2d_i_offset + j; size_t h_2d_i_j_offset = h_2d_i_offset + j;
float v_val = v[t_h_j_offset]; float v_val = v[t_h_j_offset];
@ -8443,83 +8515,126 @@ static void ggml_compute_forward_rwkv_wkv7_f32(
int64_t h_stride_2d = head_size * head_size; int64_t h_stride_2d = head_size * head_size;
#if defined(GGML_SIMD) #if defined(GGML_SIMD)
for (int64_t t = 0; t < T; t++) { #if defined(__ARM_FEATURE_SVE)
int64_t t_offset = t * t_stride; // scalar: route to the scalar implementation for now (TODO: write SVE code)
int64_t state_offset = head_size * C * (t / (T / n_seqs)); for (int64_t t = 0; t < T; t++) {
float * state_cur = state + state_offset; int64_t t_offset = t * t_stride;
float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset; int64_t state_offset = head_size * C * (t / (T / n_seqs));
float * state_cur = state + state_offset;
float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset;
for (int64_t h = h_start; h < h_end; h++) { for (int64_t h = h_start; h < h_end; h++) {
int64_t h_offset = h * h_stride; int64_t h_offset = h * h_stride;
int64_t t_h_offset = t_offset + h_offset; int64_t t_h_offset = t_offset + h_offset;
int64_t h_2d_offset = h * h_stride_2d; int64_t h_2d_offset = h * h_stride_2d;
for (int64_t ii = 0; ii < head_size; ii++) { for (int64_t i = 0; i < head_size; i++) {
int64_t t_h_i_offset = t_h_offset + ii; int64_t t_h_i_offset = t_h_offset + i;
int64_t h_2d_i_offset = h_2d_offset + ii * h_stride; int64_t h_2d_i_offset = h_2d_offset + i * h_stride;
GGML_F32_VEC v_vec = GGML_F32_VEC_SET1(v[t_h_i_offset]); float v_val = v[t_h_i_offset];
float sa = 0; float sa = 0, result = 0;
{ for (int64_t j = 0; j < head_size; j++) {
GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; sa += a[t_h_offset + j] * state_prev[h_2d_i_offset + j];
GGML_F32_VEC ax[GGML_F32_ARR];
GGML_F32_VEC ay[GGML_F32_ARR];
for (int64_t j = 0; j < head_size; j += GGML_F32_STEP) {
for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
ax[kk] = GGML_F32_VEC_LOAD(&a[t_h_offset + j + kk * GGML_F32_EPR]);
ay[kk] = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_offset + j + kk * GGML_F32_EPR]);
sum[kk] = GGML_F32_VEC_FMA(sum[kk], ax[kk], ay[kk]);
}
} }
GGML_F32_VEC_REDUCE(sa, sum);
}
GGML_F32_VEC sa_vec = GGML_F32_VEC_SET1(sa); for (int64_t j = 0; j < head_size; j++) {
int64_t t_h_j_offset = t_h_offset + j;
int64_t h_2d_i_j_offset = h_2d_i_offset + j;
int64_t j = 0; float r_val = r[t_h_j_offset];
GGML_F32_VEC result_vec[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; float w_val = w[t_h_j_offset];
for (; j < head_size; j += GGML_F32_STEP) { float k_val = k[t_h_j_offset];
for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) { float b_val = b[t_h_j_offset];
int64_t t_h_j_offset = t_h_offset + j + kk * GGML_F32_EPR; float kv_val = v_val * k_val;
int64_t h_2d_i_j_offset = h_2d_i_offset + j + kk * GGML_F32_EPR; float prev_state_val = state_prev[h_2d_i_j_offset];
state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val;
GGML_F32_VEC r_vec = GGML_F32_VEC_LOAD(&r[t_h_j_offset]); result += state_cur[h_2d_i_j_offset] * r_val;
GGML_F32_VEC w_vec = GGML_F32_VEC_LOAD(&w[t_h_j_offset]);
GGML_F32_VEC k_vec = GGML_F32_VEC_LOAD(&k[t_h_j_offset]);
GGML_F32_VEC b_vec = GGML_F32_VEC_LOAD(&b[t_h_j_offset]);
k_vec = GGML_F32_VEC_MUL(v_vec, k_vec);
GGML_F32_VEC state_vec = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_j_offset]);
// kv + s * decay + sa * b
state_vec = GGML_F32_VEC_FMA(k_vec, state_vec, w_vec);
state_vec = GGML_F32_VEC_FMA(state_vec, sa_vec, b_vec);
GGML_F32_VEC_STORE(&state_cur[h_2d_i_j_offset], state_vec);
result_vec[kk] = GGML_F32_VEC_FMA(result_vec[kk], state_vec, r_vec);
} }
} dst_data[t_h_i_offset] = result;
GGML_F32_VEC_REDUCE(dst_data[t_h_i_offset], result_vec);
// There shouldn't be left-overs though.
for (; j < head_size; j++) {
int64_t t_h_j_offset = t_h_offset + j;
int64_t h_2d_i_j_offset = h_2d_i_offset + j;
float r_val = r[t_h_j_offset];
float w_val = w[t_h_j_offset];
float k_val = k[t_h_j_offset];
float b_val = b[t_h_j_offset];
float kv_val = v[t_h_i_offset] * k_val;
float prev_state_val = state_prev[h_2d_i_j_offset];
state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val;
dst_data[t_h_i_offset] += state_cur[h_2d_i_j_offset] * r_val;
} }
} }
} }
} #else
for (int64_t t = 0; t < T; t++) {
int64_t t_offset = t * t_stride;
int64_t state_offset = head_size * C * (t / (T / n_seqs));
float * state_cur = state + state_offset;
float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset;
for (int64_t h = h_start; h < h_end; h++) {
int64_t h_offset = h * h_stride;
int64_t t_h_offset = t_offset + h_offset;
int64_t h_2d_offset = h * h_stride_2d;
for (int64_t ii = 0; ii < head_size; ii++) {
int64_t t_h_i_offset = t_h_offset + ii;
int64_t h_2d_i_offset = h_2d_offset + ii * h_stride;
GGML_F32_VEC v_vec = GGML_F32_VEC_SET1(v[t_h_i_offset]);
float sa = 0;
{
GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
GGML_F32_VEC ax[GGML_F32_ARR];
GGML_F32_VEC ay[GGML_F32_ARR];
for (int64_t j = 0; j < head_size; j += GGML_F32_STEP) {
for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
ax[kk] = GGML_F32_VEC_LOAD(&a[t_h_offset + j + kk * GGML_F32_EPR]);
ay[kk] = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_offset + j + kk * GGML_F32_EPR]);
sum[kk] = GGML_F32_VEC_FMA(sum[kk], ax[kk], ay[kk]);
}
}
GGML_F32_VEC_REDUCE(sa, sum);
}
GGML_F32_VEC sa_vec = GGML_F32_VEC_SET1(sa);
int64_t j = 0;
GGML_F32_VEC result_vec[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
for (; j < head_size; j += GGML_F32_STEP) {
for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
int64_t t_h_j_offset = t_h_offset + j + kk * GGML_F32_EPR;
int64_t h_2d_i_j_offset = h_2d_i_offset + j + kk * GGML_F32_EPR;
GGML_F32_VEC r_vec = GGML_F32_VEC_LOAD(&r[t_h_j_offset]);
GGML_F32_VEC w_vec = GGML_F32_VEC_LOAD(&w[t_h_j_offset]);
GGML_F32_VEC k_vec = GGML_F32_VEC_LOAD(&k[t_h_j_offset]);
GGML_F32_VEC b_vec = GGML_F32_VEC_LOAD(&b[t_h_j_offset]);
k_vec = GGML_F32_VEC_MUL(v_vec, k_vec);
GGML_F32_VEC state_vec = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_j_offset]);
// kv + s * decay + sa * b
state_vec = GGML_F32_VEC_FMA(k_vec, state_vec, w_vec);
state_vec = GGML_F32_VEC_FMA(state_vec, sa_vec, b_vec);
GGML_F32_VEC_STORE(&state_cur[h_2d_i_j_offset], state_vec);
result_vec[kk] = GGML_F32_VEC_FMA(result_vec[kk], state_vec, r_vec);
}
}
GGML_F32_VEC_REDUCE(dst_data[t_h_i_offset], result_vec);
// There shouldn't be left-overs though.
for (; j < head_size; j++) {
int64_t t_h_j_offset = t_h_offset + j;
int64_t h_2d_i_j_offset = h_2d_i_offset + j;
float r_val = r[t_h_j_offset];
float w_val = w[t_h_j_offset];
float k_val = k[t_h_j_offset];
float b_val = b[t_h_j_offset];
float kv_val = v[t_h_i_offset] * k_val;
float prev_state_val = state_prev[h_2d_i_j_offset];
state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val;
dst_data[t_h_i_offset] += state_cur[h_2d_i_j_offset] * r_val;
}
}
}
}
#endif
#else #else
for (int64_t t = 0; t < T; t++) { for (int64_t t = 0; t < T; t++) {
int64_t t_offset = t * t_stride; int64_t t_offset = t * t_stride;
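Both the scalar route above and the vectorized path it stands in for compute the same WKV7 state update per head; a sketch in the notation of the scalar loop (S is the head_size × head_size state, and r, w, k, v, a, b are the per-token rows):
```latex
sa_i = \sum_{j} a_j\, S_{i,j}, \qquad
S_{i,j} \leftarrow S_{i,j}\, w_j + v_i\, k_j + sa_i\, b_j, \qquad
y_i = \sum_{j} S_{i,j}\, r_j
```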

View file

@ -17,7 +17,123 @@
// number of elements to fit in a single register // number of elements to fit in a single register
// //
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_FMA)
#define GGML_SIMD
// F32 SVE
#define GGML_F32_EPR 8
#define DEFAULT_PG svptrue_b32()
#define GGML_F32xt svfloat32_t
#define GGML_F32xt_ZERO svdup_n_f32(0.0f)
#define GGML_F32xt_SET1(x) svdup_n_f32(x)
#define GGML_F32xt_LOAD_IMPL(pg, a, ...) svld1_f32(pg, a)
#define GGML_F32xt_LOAD(...) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b)
#define GGML_F32xt_STORE(...) GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, a, b, c)
#define GGML_F32xt_FMA(...) GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
#define GGML_F32xt_ADD(...) GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32xt_MUL_IMPL(pg, a, b) svmul_f32_m(pg, a, b)
#define GGML_F32xt_MUL(...) GGML_F32xt_MUL_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a)
#define GGML_F32xt_REDUCE_ONE(...) GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \
{ \
sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum2); \
sum3 = svadd_f32_m(DEFAULT_PG, sum3, sum4); \
sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum6); \
sum7 = svadd_f32_m(DEFAULT_PG, sum7, sum8); \
sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum3); \
sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum7); \
sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum5); \
(res) = (ggml_float) GGML_F32xt_REDUCE_ONE(sum1); \
}
#define GGML_F32xt_REDUCE(...) GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, __VA_ARGS__)
#define GGML_F32_VEC GGML_F32xt
#define GGML_F32_VEC_ZERO GGML_F32xt_ZERO
#define GGML_F32_VEC_SET1 GGML_F32xt_SET1
#define GGML_F32_VEC_LOAD GGML_F32xt_LOAD
#define GGML_F32_VEC_STORE GGML_F32xt_STORE
#define GGML_F32_VEC_FMA GGML_F32xt_FMA
#define GGML_F32_VEC_ADD GGML_F32xt_ADD
#define GGML_F32_VEC_MUL GGML_F32xt_MUL
#define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE
// F16 NEON
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
#define GGML_F16_STEP 32
#define GGML_F16_EPR 8
#define GGML_F16x8 float16x8_t
#define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
#define GGML_F16x8_SET1(x) vdupq_n_f16(x)
#define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
#define GGML_F16x8_STORE vst1q_f16
#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
#define GGML_F16x8_ADD vaddq_f16
#define GGML_F16x8_MUL vmulq_f16
#define GGML_F16x8_REDUCE(res, x) \
do { \
int offset = GGML_F16_ARR >> 1; \
for (int i = 0; i < offset; ++i) { \
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
} \
offset >>= 1; \
for (int i = 0; i < offset; ++i) { \
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
} \
const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
(res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
} while (0)
#define GGML_F16_VEC GGML_F16x8
#define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
#define GGML_F16_VEC_SET1 GGML_F16x8_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
#define GGML_F16_VEC_FMA GGML_F16x8_FMA
#define GGML_F16_VEC_ADD GGML_F16x8_ADD
#define GGML_F16_VEC_MUL GGML_F16x8_MUL
#define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
#else
// if FP16 vector arithmetic is not supported, we use FP32 instead
// and take advantage of the vcvt_ functions to convert to/from FP16
#define GGML_F16_STEP 16
#define GGML_F16_EPR 4
#define GGML_F32Cx4 float32x4_t
#define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
#define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
#define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
#define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
#define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
#define GGML_F32Cx4_ADD vaddq_f32
#define GGML_F32Cx4_MUL vmulq_f32
#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
#define GGML_F16_VEC GGML_F32Cx4
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
#endif
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
#define GGML_SIMD #define GGML_SIMD

View file

@ -17,29 +17,98 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
#if defined(GGML_SIMD) #if defined(GGML_SIMD)
float sumf = 0.0f; float sumf = 0.0f;
const int np = (n & ~(GGML_F32_STEP - 1));
GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; #if defined(__ARM_FEATURE_SVE)
const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
const int ggml_f32_epr = sve_register_length / 32; // SVE128:4, SVE256:8, SVE512:16
const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
GGML_F32_VEC ax[GGML_F32_ARR]; const int np = (n & ~(ggml_f32_step - 1));
GGML_F32_VEC ay[GGML_F32_ARR]; svfloat32_t sum1 = svdup_n_f32(0.0f);
svfloat32_t sum2 = svdup_n_f32(0.0f);
svfloat32_t sum3 = svdup_n_f32(0.0f);
svfloat32_t sum4 = svdup_n_f32(0.0f);
svfloat32_t sum5 = svdup_n_f32(0.0f);
svfloat32_t sum6 = svdup_n_f32(0.0f);
svfloat32_t sum7 = svdup_n_f32(0.0f);
svfloat32_t sum8 = svdup_n_f32(0.0f);
svfloat32_t ax1,ax2,ax3,ax4,ax5,ax6,ax7,ax8;
svfloat32_t ay1,ay2,ay3,ay4,ay5,ay6,ay7,ay8;
for (int i = 0; i < np; i += ggml_f32_step) {
ax1 = GGML_F32_VEC_LOAD(x + i);
ay1 = GGML_F32_VEC_LOAD(y + i);
sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
for (int i = 0; i < np; i += GGML_F32_STEP) { ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
for (int j = 0; j < GGML_F32_ARR; j++) { ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); sum2 = GGML_F32_VEC_FMA(ax2, ay2, sum2);
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]); ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
sum3 = GGML_F32_VEC_FMA(ax3, ay3, sum3);
ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
sum4 = GGML_F32_VEC_FMA(ax4, ay4, sum4);
ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
sum5 = GGML_F32_VEC_FMA(ax5, ay5, sum5);
ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
sum6 = GGML_F32_VEC_FMA(ax6, ay6, sum6);
ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
sum7 = GGML_F32_VEC_FMA(ax7, ay7, sum7);
ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
sum8 = GGML_F32_VEC_FMA(ax8, ay8, sum8);
} }
} // leftovers
// Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
const int np2 = (n & ~(ggml_f32_epr - 1));
for (int i = np; i < np2; i += ggml_f32_epr) {
ax1 = GGML_F32_VEC_LOAD(x + i);
ay1 = GGML_F32_VEC_LOAD(y + i);
sum1 = GGML_F32_VEC_FMA(ax1, ay1, sum1);
}
// maximum number of leftover elements will be less than ggml_f32_epr. Apply predicated svmad on available elements only
if (np2 < n) {
svbool_t pg = svwhilelt_b32(np2, n);
ax1 = svld1_f32(pg, x + np2);
ay1 = svld1_f32(pg, y + np2);
sum1 = svmad_f32_m(pg, ax1, ay1, sum1);
}
// reduce sum1..sum8 into sumf
GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
#else
const int np = (n & ~(GGML_F32_STEP - 1));
// reduce sum0..sum3 to sum0 GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
GGML_F32_VEC_REDUCE(sumf, sum);
// leftovers GGML_F32_VEC ax[GGML_F32_ARR];
for (int i = np; i < n; ++i) { GGML_F32_VEC ay[GGML_F32_ARR];
sumf += x[i]*y[i];
} for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
}
}
// reduce sum0..sum3 to sum0
GGML_F32_VEC_REDUCE(sumf, sum);
// leftovers
for (int i = np; i < n; ++i) {
sumf += x[i]*y[i];
}
#endif
#else #else
// scalar // scalar
ggml_float sumf = 0.0; ggml_float sumf = 0.0;

View file

@ -5,6 +5,7 @@
#include "ggml-impl.h" #include "ggml-impl.h"
#include "simd-mappings.h" #include "simd-mappings.h"
#include "ggml.h" #include "ggml.h"
#include "ggml-cpu.h"
#if defined(GGML_USE_ACCELERATE) #if defined(GGML_USE_ACCELERATE)
#include <Accelerate/Accelerate.h> #include <Accelerate/Accelerate.h>
@ -148,27 +149,108 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) { inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
#if defined(GGML_SIMD) #if defined(GGML_SIMD)
const int np = (n & ~(GGML_F32_STEP - 1)); #if defined(__ARM_FEATURE_SVE)
GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
GGML_F32_VEC ax[GGML_F32_ARR]; const int np = (n & ~(ggml_f32_step - 1));
GGML_F32_VEC ay[GGML_F32_ARR]; svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
for (int i = 0; i < np; i += ggml_f32_step) {
for (int i = 0; i < np; i += GGML_F32_STEP) { ax1 = GGML_F32_VEC_LOAD(x + i);
for (int j = 0; j < GGML_F32_ARR; j++) { ay1 = GGML_F32_VEC_LOAD(y + i);
ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); GGML_F32_VEC_STORE(y + i, ay1);
ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
ay2 = GGML_F32_VEC_FMA(ax2, vx, ay2);
GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
ay3 = GGML_F32_VEC_FMA(ax3, vx, ay3);
GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);
ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
ay4 = GGML_F32_VEC_FMA(ax4, vx, ay4);
GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);
ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
ay5 = GGML_F32_VEC_FMA(ax5, vx, ay5);
GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);
ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
ay6 = GGML_F32_VEC_FMA(ax6, vx, ay6);
GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);
ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
ay7 = GGML_F32_VEC_FMA(ax7, vx, ay7);
GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);
ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
ay8 = GGML_F32_VEC_FMA(ax8, vx, ay8);
GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
} }
} // leftovers
// Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
const int np2 = (n & ~(ggml_f32_epr - 1));
for (int i = np; i < np2; i += ggml_f32_epr) {
ax1 = GGML_F32_VEC_LOAD(x + i);
ay1 = GGML_F32_VEC_LOAD(y + i);
ay1 = GGML_F32_VEC_FMA(ax1, vx, ay1);
// leftovers GGML_F32_VEC_STORE(y + i, ay1);
for (int i = np; i < n; ++i) { }
y[i] += x[i]*v; // maximum number of leftover elements will be less than ggml_f32_epr. Apply predicated svmad on available elements only
} if (np2 < n) {
svbool_t pg = svwhilelt_b32(np2, n);
ax1 = svld1_f32(pg, x + np2);
ay1 = svld1_f32(pg, y + np2);
ay1 = svmad_f32_m(pg, ax1, vx, ay1);
svst1_f32(pg, y + np2, ay1);
}
#else
const int np = (n & ~(GGML_F32_STEP - 1));
GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
GGML_F32_VEC ax[GGML_F32_ARR];
GGML_F32_VEC ay[GGML_F32_ARR];
for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
}
}
// leftovers
for (int i = np; i < n; ++i) {
y[i] += x[i]*v;
}
#endif
#else #else
// scalar // scalar
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
@ -220,36 +302,45 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
} }
#if defined(GGML_SIMD) #if defined(GGML_SIMD)
const int np = (n & ~(GGML_F32_STEP - 1)); #if defined(__ARM_FEATURE_SVE)
// scalar: route to the scalar implementation for now (TODO: write SVE code) for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL]; for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
for (int i = 0; i < n; ++i) {
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { y[i] += x[k][i]*v[k][0];
vx[k] = GGML_F32_VEC_SET1(v[k][0]);
}
GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
GGML_F32_VEC ay[GGML_F32_ARR];
for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
} }
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
} }
} #else
const int np = (n & ~(GGML_F32_STEP - 1));
// leftovers GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
for (int i = np; i < n; ++i) { for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
y[i] += x[k][i]*v[k][0]; vx[k] = GGML_F32_VEC_SET1(v[k][0]);
} }
}
GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
GGML_F32_VEC ay[GGML_F32_ARR];
for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
}
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
}
}
// leftovers
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
for (int i = np; i < n; ++i) {
y[i] += x[k][i]*v[k][0];
}
}
#endif
#else #else
// scalar // scalar
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
@ -265,25 +356,53 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
#if defined(GGML_USE_ACCELERATE) #if defined(GGML_USE_ACCELERATE)
vDSP_vsmul(y, 1, &v, y, 1, n); vDSP_vsmul(y, 1, &v, y, 1, n);
#elif defined(GGML_SIMD) #elif defined(GGML_SIMD)
const int np = (n & ~(GGML_F32_STEP - 1)); #if defined(__ARM_FEATURE_SVE)
const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
const int ggml_f32_epr = sve_register_length / 32; // SVE128:4, SVE256:8, SVE512:16
const int ggml_f32_step = 2 * ggml_f32_epr;
GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
const int np = (n & ~(ggml_f32_step - 1));
svfloat32_t ay1;
svfloat32_t ay2;
for (int i = 0; i < np; i += ggml_f32_step) {
ay1 = GGML_F32_VEC_LOAD(y + i);
ay1 = GGML_F32_VEC_MUL(ay1, vx);
GGML_F32_VEC_STORE(y + i, ay1);
GGML_F32_VEC ay[GGML_F32_ARR]; ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
ay2 = GGML_F32_VEC_MUL(ay2, vx);
for (int i = 0; i < np; i += GGML_F32_STEP) { GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
for (int j = 0; j < GGML_F32_ARR; j++) {
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
} }
} // leftovers
// maximum number of leftover elements will be less than ggml_f32_epr. Apply predicated svmul on available elements only
if (np < n) {
svbool_t pg = svwhilelt_b32(np, n);
ay1 = svld1_f32(pg, y + np);
ay1 = svmul_f32_m(pg, ay1, vx);
svst1_f32(pg, y + np, ay1);
}
#else
const int np = (n & ~(GGML_F32_STEP - 1));
// leftovers GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
for (int i = np; i < n; ++i) {
y[i] *= v; GGML_F32_VEC ay[GGML_F32_ARR];
}
for (int i = 0; i < np; i += GGML_F32_STEP) {
for (int j = 0; j < GGML_F32_ARR; j++) {
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
}
}
// leftovers
for (int i = np; i < n; ++i) {
y[i] *= v;
}
#endif
#else #else
// scalar // scalar
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
@ -528,6 +647,42 @@ inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
#error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461" #error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
#endif #endif
/* Below function was borrowed from the GitHub repository:
https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp */
#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
inline static svfloat32_t exp_ps_sve(svbool_t pg, svfloat32_t src) {
// Constants
const svfloat32_t log2_e = svdup_n_f32(1.4426950409f);
const svfloat32_t ln2 = svdup_n_f32(0.6931473921f);
const svfloat32_t half_ln2_sq = svdup_n_f32(0.2413862043f);
const svuint32_t not_mask17 = svdup_n_u32(~((1u << 17) - 1));
const svfloat32_t one = svdup_n_f32(1.0f);
const svfloat32_t inactive1 = svdup_n_f32(0.0f);
const svint32_t inactive2 = svdup_n_s32(0);
// Algorithm starts here
svfloat32_t t0 = svmul_f32_m(pg, src, log2_e); // y = x * log2(e)
svfloat32_t t1 = svrintm_f32_m(inactive1, pg, t0); // round down to int (float)
svint32_t t2 = svcvt_s32_f32_m(inactive2, pg, t1); // n
t1 = svsub_f32_m(pg, t0, t1); // a = y - floor(y)
t1 = svadd_f32_m(pg, t1, one); // b = a + 1
svuint32_t t3 = svlsr_n_u32_m(pg, svreinterpret_u32_f32(t1), 17); // v = b >> 17 (u32)
svfloat32_t t4 = svexpa_f32(t3); // c = fexpa(v)
t4 = svscale_f32_m(pg, t4, t2); // fexpa(v) * 2^(n)
// and_(t2.d, t1.d, not_mask17.d)
svfloat32_t t5 = svreinterpret_f32_u32(svand_u32_m(pg, svreinterpret_u32_f32(t1), not_mask17));
t5 = svsub_f32_m(pg, t1, t5); // z
t0 = svmla_f32_m(pg, ln2, t5, half_ln2_sq); // ln2 + half_ln2_sq * z
t0 = svmla_f32_m(pg, one, t5, t0); // 1 + (ln2 * z) + (half_ln2_sq * z * z)
t0 = svmul_f32_m(pg, t0, t4); // Final result
return t0;
}
#endif
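In rough terms (a sketch of the identity the borrowed routine exploits, not a precise error analysis): the exponential is rewritten in base 2, the integer part of the exponent is applied by scaling, the fractional part is estimated with the FEXPA table lookup, and the small remainder z discarded by the 17-bit mask is corrected with a short polynomial:
```latex
e^{x} = 2^{x \log_2 e} = 2^{n}\, 2^{a}, \quad n = \lfloor x \log_2 e \rfloor,\; a = x \log_2 e - n, \qquad
2^{a} \approx \mathrm{fexpa}(\cdot)\, \Big(1 + z \ln 2 + \tfrac{(\ln 2)^2}{2}\, z^{2}\Big)
```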
#if defined(__ARM_NEON) && defined(__aarch64__) #if defined(__ARM_NEON) && defined(__aarch64__)
// adapted from arm limited optimized routine // adapted from arm limited optimized routine

View file

@ -2999,9 +2999,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
{ {
struct ggml_tensor * a = op->src[0]; struct ggml_tensor * a = op->src[0];
struct ggml_tensor * b = op->src[1]; struct ggml_tensor * b = op->src[1];
// for small weight matrices the active device can end up without any rows, don't use row split in those cases
// this avoids some edge cases (and the performance would not be good anyways)
if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) { if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
if (a->ne[2] > 1 || a->ne[3] > 1) {
return false;
}
// for small weight matrices the active device can end up without any rows, don't use row split in those cases
// this avoids some edge cases (and the performance would not be good anyways)
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context; ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context;
int64_t row_low; int64_t row_low;
int64_t row_high; int64_t row_high;

View file

@ -177,6 +177,9 @@ class Keys:
EMBEDDING_LENGTH = "{arch}.convnext.embedding_length" EMBEDDING_LENGTH = "{arch}.convnext.embedding_length"
BLOCK_COUNT = "{arch}.convnext.block_count" BLOCK_COUNT = "{arch}.convnext.block_count"
class Classifier:
OUTPUT_LABELS = "{arch}.classifier.output_labels"
class Tokenizer: class Tokenizer:
MODEL = "tokenizer.ggml.model" MODEL = "tokenizer.ggml.model"
PRE = "tokenizer.ggml.pre" PRE = "tokenizer.ggml.pre"
@ -1033,6 +1036,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.POS_EMBD, MODEL_TENSOR.POS_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ATTN_OUT_NORM, MODEL_TENSOR.ATTN_OUT_NORM,
MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K, MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V, MODEL_TENSOR.ATTN_V,

View file

@ -49,6 +49,7 @@ class TensorInfo:
class GGUFValue: class GGUFValue:
value: Any value: Any
type: GGUFValueType type: GGUFValueType
sub_type: GGUFValueType | None = None
class WriterState(Enum): class WriterState(Enum):
@ -238,7 +239,7 @@ class GGUFWriter:
for key, val in kv_data.items(): for key, val in kv_data.items():
kv_bytes += self._pack_val(key, GGUFValueType.STRING, add_vtype=False) kv_bytes += self._pack_val(key, GGUFValueType.STRING, add_vtype=False)
kv_bytes += self._pack_val(val.value, val.type, add_vtype=True) kv_bytes += self._pack_val(val.value, val.type, add_vtype=True, sub_type=val.sub_type)
fout.write(kv_bytes) fout.write(kv_bytes)
@ -268,11 +269,11 @@ class GGUFWriter:
fout.flush() fout.flush()
self.state = WriterState.TI_DATA self.state = WriterState.TI_DATA
def add_key_value(self, key: str, val: Any, vtype: GGUFValueType) -> None: def add_key_value(self, key: str, val: Any, vtype: GGUFValueType, sub_type: GGUFValueType | None = None) -> None:
if any(key in kv_data for kv_data in self.kv_data): if any(key in kv_data for kv_data in self.kv_data):
raise ValueError(f'Duplicated key name {key!r}') raise ValueError(f'Duplicated key name {key!r}')
self.kv_data[0][key] = GGUFValue(value=val, type=vtype) self.kv_data[0][key] = GGUFValue(value=val, type=vtype, sub_type=sub_type)
def add_uint8(self, key: str, val: int) -> None: def add_uint8(self, key: str, val: int) -> None:
self.add_key_value(key,val, GGUFValueType.UINT8) self.add_key_value(key,val, GGUFValueType.UINT8)
@ -1022,7 +1023,7 @@ class GGUFWriter:
pack_prefix = '<' if self.endianess == GGUFEndian.LITTLE else '>' pack_prefix = '<' if self.endianess == GGUFEndian.LITTLE else '>'
return struct.pack(f'{pack_prefix}{fmt}', value) return struct.pack(f'{pack_prefix}{fmt}', value)
def _pack_val(self, val: Any, vtype: GGUFValueType, add_vtype: bool) -> bytes: def _pack_val(self, val: Any, vtype: GGUFValueType, add_vtype: bool, sub_type: GGUFValueType | None = None) -> bytes:
kv_data = bytearray() kv_data = bytearray()
if add_vtype: if add_vtype:
@ -1043,7 +1044,9 @@ class GGUFWriter:
if len(val) == 0: if len(val) == 0:
raise ValueError("Invalid GGUF metadata array. Empty array") raise ValueError("Invalid GGUF metadata array. Empty array")
if isinstance(val, bytes): if sub_type is not None:
ltype = sub_type
elif isinstance(val, bytes):
ltype = GGUFValueType.UINT8 ltype = GGUFValueType.UINT8
else: else:
ltype = GGUFValueType.get_type(val[0]) ltype = GGUFValueType.get_type(val[0])

View file

@ -1521,19 +1521,21 @@ class GGUFEditorWindow(QMainWindow):
continue continue
# Apply changes if any # Apply changes if any
sub_type = None
if field.name in self.metadata_changes: if field.name in self.metadata_changes:
value_type, value = self.metadata_changes[field.name] value_type, value = self.metadata_changes[field.name]
if value_type == GGUFValueType.ARRAY: if value_type == GGUFValueType.ARRAY:
# Handle array values # Handle array values
element_type, array_values = value sub_type, value = value
writer.add_array(field.name, array_values)
else:
writer.add_key_value(field.name, value, value_type)
else: else:
# Copy original value # Copy original value
value = field.contents() value = field.contents()
if value is not None and field.types: value_type = field.types[0]
writer.add_key_value(field.name, value, field.types[0]) if value_type == GGUFValueType.ARRAY:
sub_type = field.types[-1]
if value is not None:
writer.add_key_value(field.name, value, value_type, sub_type=sub_type)
# Add new metadata # Add new metadata
for key, (value_type, value) in self.metadata_changes.items(): for key, (value_type, value) in self.metadata_changes.items():
@ -1541,7 +1543,12 @@ class GGUFEditorWindow(QMainWindow):
if self.reader.get_field(key) is not None: if self.reader.get_field(key) is not None:
continue continue
writer.add_key_value(key, value, value_type) sub_type = None
if value_type == GGUFValueType.ARRAY:
# Handle array values
sub_type, value = value
writer.add_key_value(key, value, value_type, sub_type=sub_type)
# Add tensors (including data) # Add tensors (including data)
for tensor in self.reader.tensors: for tensor in self.reader.tensors:

View file

@ -24,6 +24,7 @@ class MetadataDetails(NamedTuple):
type: gguf.GGUFValueType type: gguf.GGUFValueType
value: Any value: Any
description: str = '' description: str = ''
sub_type: gguf.GGUFValueType | None = None
def get_field_data(reader: gguf.GGUFReader, key: str) -> Any: def get_field_data(reader: gguf.GGUFReader, key: str) -> Any:
@ -57,7 +58,9 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
logger.debug(f'Removing {field.name}') logger.debug(f'Removing {field.name}')
continue continue
old_val = MetadataDetails(field.types[0], field.contents()) val_type = field.types[0]
sub_type = field.types[-1] if val_type == gguf.GGUFValueType.ARRAY else None
old_val = MetadataDetails(val_type, field.contents(), sub_type=sub_type)
val = new_metadata.get(field.name, old_val) val = new_metadata.get(field.name, old_val)
if field.name in new_metadata: if field.name in new_metadata:
@ -67,7 +70,7 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
logger.debug(f'Copying {field.name}') logger.debug(f'Copying {field.name}')
if val.value is not None: if val.value is not None:
writer.add_key_value(field.name, val.value, val.type) writer.add_key_value(field.name, val.value, val.type, sub_type=sub_type if val.sub_type is None else val.sub_type)
if gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata: if gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata:
logger.debug('Adding chat template(s)') logger.debug('Adding chat template(s)')

View file

@@ -157,6 +157,7 @@ class TensorNameMap:
            "h.{bid}.attn.c_attn",                                 # gpt2
            "transformer.h.{bid}.mixer.Wqkv",                      # phi2
            "encoder.layers.{bid}.attn.Wqkv",                      # nomic-bert
+           "encoder.layers.{bid}.mixer.Wqkv",                     # jina
            "model.layers.{bid}.self_attn.qkv_proj",               # phi3
            "encoder.layers.{bid}.self_attention.query_key_value", # chatglm
            "transformer.layers.{bid}.attn.qkv_proj",              # openelm
@@ -168,6 +169,7 @@ class TensorNameMap:
            "model.layers.{bid}.self_attn.q_proj_no_perm",         # llama-custom
            "layers.{bid}.attention.wq",                           # llama-pth
            "encoder.layer.{bid}.attention.self.query",            # bert
+           "transformer.layer.{bid}.attention.q_lin",             # distillbert
            "transformer.h.{bid}.attn.q_proj",                     # gpt-j
            "model.layers.layers.{bid}.self_attn.q_proj",          # plamo
            "model.layers.{bid}.attention.wq",                     # internlm2
@@ -182,6 +184,7 @@ class TensorNameMap:
            "model.layers.{bid}.self_attn.k_proj_no_perm",         # llama-custom
            "layers.{bid}.attention.wk",                           # llama-pth
            "encoder.layer.{bid}.attention.self.key",              # bert
+           "transformer.layer.{bid}.attention.k_lin",             # distillbert
            "transformer.h.{bid}.attn.k_proj",                     # gpt-j
            "transformer.h.{bid}.attn.k",                          # refact
            "model.layers.layers.{bid}.self_attn.k_proj",          # plamo
@@ -196,6 +199,7 @@ class TensorNameMap:
            "model.layers.{bid}.self_attn.v_proj",                 # llama-hf nemotron olmoe olmo2 phimoe
            "layers.{bid}.attention.wv",                           # llama-pth
            "encoder.layer.{bid}.attention.self.value",            # bert
+           "transformer.layer.{bid}.attention.v_lin",             # distillbert
            "transformer.h.{bid}.attn.v_proj",                     # gpt-j
            "transformer.h.{bid}.attn.v",                          # refact
            "model.layers.layers.{bid}.self_attn.v_proj",          # plamo
@@ -216,6 +220,7 @@ class TensorNameMap:
            "model.layers.{bid}.self_attn.linear_attn",            # deci
            "layers.{bid}.attention.wo",                           # llama-pth
            "encoder.layer.{bid}.attention.output.dense",          # bert
+           "transformer.layer.{bid}.attention.out_lin",           # distillbert
            "transformer.h.{bid}.attn.out_proj",                   # gpt-j
            "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon
            "model.layers.{bid}.self_attn.dense",                  # persimmon
@@ -224,6 +229,7 @@ class TensorNameMap:
            "model.layers.layers.{bid}.self_attn.o_proj",          # plamo
            "model.layers.{bid}.attention.wo",                     # internlm2
            "encoder.layers.{bid}.attn.out_proj",                  # nomic-bert
+           "encoder.layers.{bid}.mixer.out_proj",                 # jina
            "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok
            "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx
            "encoder.layers.{bid}.self_attention.dense",           # chatglm
@@ -235,6 +241,7 @@ class TensorNameMap:
        # Attention output norm
        MODEL_TENSOR.ATTN_OUT_NORM: (
            "encoder.layer.{bid}.attention.output.LayerNorm",      # bert
+           "transformer.layer.{bid}.sa_layer_norm",               # distillbert
            "encoder.layers.{bid}.norm1",                          # nomic-bert
            "transformer.decoder_layer.{bid}.rms_norm_1",          # Grok
            "transformer.blocks.{bid}.norm_attn_norm.norm_2",      # dbrx
@@ -311,6 +318,7 @@ class TensorNameMap:
            "model.layers.{bid}.mlp.up_proj",                      # llama-hf refact nemotron olmo2
            "layers.{bid}.feed_forward.w3",                        # llama-pth
            "encoder.layer.{bid}.intermediate.dense",              # bert
+           "transformer.layer.{bid}.ffn.lin1",                    # distillbert
            "transformer.h.{bid}.mlp.fc_in",                       # gpt-j
            "transformer.h.{bid}.mlp.linear_3",                    # refact
            "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
@@ -394,6 +402,7 @@ class TensorNameMap:
            "model.layers.{bid}.mlp.down_proj",                    # llama-hf nemotron olmo2
            "layers.{bid}.feed_forward.w2",                        # llama-pth
            "encoder.layer.{bid}.output.dense",                    # bert
+           "transformer.layer.{bid}.ffn.lin2",                    # distillbert
            "transformer.h.{bid}.mlp.fc_out",                      # gpt-j
            "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
            "model.layers.{bid}.mlp.dense_4h_to_h",                # persimmon
@@ -455,6 +464,7 @@ class TensorNameMap:
        MODEL_TENSOR.LAYER_OUT_NORM: (
            "encoder.layer.{bid}.output.LayerNorm",                # bert
+           "transformer.layer.{bid}.output_layer_norm",           # distillbert
            "encoder.layers.{bid}.norm2",                          # nomic-bert
            "transformer.decoder_layer.{bid}.rms_norm_3",          # Grok
            "encoder.layer.{bid}.mlp.layernorm",                   # jina-bert-v2
@@ -825,6 +835,7 @@ class TensorNameMap:
        MODEL_TENSOR.CLS: (
            "classifier",       # jina
            "classifier.dense", # roberta
+           "pre_classifier",   # distillbert
        ),

        MODEL_TENSOR.CLS_OUT: (
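
To see what the new DistilBERT entries buy in practice, here is a hedged gguf-py sketch (the arch choice and block count are assumptions for illustration): the convert scripts resolve checkpoint tensor names through this map, so a DistilBERT name now lands on the standard GGUF attention tensor name.

```python
import gguf

# Assumed setup: DistilBERT checkpoints converted under the BERT arch;
# 6 is just a placeholder block count.
tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.BERT, 6)

name = "transformer.layer.0.attention.q_lin.weight"
print(tmap.get_name(name, try_suffixes=(".weight", ".bias")))
# -> "blk.0.attn_q.weight", via the new "transformer.layer.{bid}.attention.q_lin" entry
```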


@@ -231,7 +231,7 @@ class SafetensorRemote:
         response.raise_for_status()

         # Get raw byte data
-        return response.content[:size]
+        return response.content[slice(size if size > -1 else None)]

     @classmethod
     def check_file_exist(cls, url: str) -> bool:
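
Why the slice change above matters, as a self-contained Python snippet (this only mirrors the indexing behaviour, not the HTTP helper itself): with a negative size, which the new expression treats as "no limit", the old [:size] form silently dropped the last byte.

```python
data = b"abcdef"
size = -1  # sentinel meaning "return everything"

old_result = data[:size]                               # b'abcde'  -> last byte lost
new_result = data[slice(size if size > -1 else None)]  # b'abcdef' -> full payload

assert old_result == b"abcde"
assert new_result == b"abcdef"

# For a non-negative size both forms agree:
assert data[:3] == data[slice(3)] == b"abc"
```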


@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.16.3"
+version = "0.17.0"
 description = "Read and write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [


@@ -8918,17 +8918,30 @@ Current version indicated by LITEVER below.
        },
        // aicharactercards.com
        {
-           name: "aicharactercards.com",
+           name: "AICC / character-tavern.com",
            urlParam: "aicc",
            inputBox: {
-               text: "Enter aicharactercards.com prompt URL",
+               text: "Enter aicharactercards.com or character-tavern.com prompt URL",
                placeholder: "https://aicharactercards.com/character-cards/work-jobs/deffcolony/lara-lightland",
            },
            extraction: (userInput) => {
                if (userInput.match(/aicharactercards\.com\//i) && userInput.match(/sdm_process_download/i))
                {
                    return userInput;
-               } else {
+               } else if(userInput.match(/character-tavern\.com\//i))
+               {
+                   userInput = userInput.replaceAll("%20","%2520");
+                   if(userInput.match(/com\/character\//i))
+                   {
+                       userInput = userInput.split(".png")[0];
+                       userInput = userInput.split("?")[0];
+                       userInput = userInput.split(".com/character/")[1];
+                       userInput = userInput.endsWith('/') ? userInput.slice(0, -1) : userInput;
+                       return `https://cards.character-tavern.com/${userInput}.png?action=download`;
+                   }
+                   return userInput;
+               }
+               else {
                    userInput = userInput.split("#")[0].split("?")[0];
                    userInput = userInput.endsWith('/') ? userInput.slice(0, -1) : userInput;
                    if (userInput.match(/aicharactercards\.com\//i) || userInput.match(/AICC\//i)) {
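
For readers skimming the Lite change above, an equivalent sketch of the new character-tavern.com branch in Python (a hypothetical helper, not part of the codebase), showing the rewrite from a character page URL to the direct card download URL the importer now uses:

```python
def tavern_card_url(user_input: str) -> str:
    # Mirrors the JS branch added above.
    user_input = user_input.replace("%20", "%2520")
    if "com/character/" in user_input:
        user_input = user_input.split(".png")[0]
        user_input = user_input.split("?")[0]
        user_input = user_input.split(".com/character/")[1]
        if user_input.endswith("/"):
            user_input = user_input[:-1]
        return f"https://cards.character-tavern.com/{user_input}.png?action=download"
    return user_input

# Illustrative input/output:
# tavern_card_url("https://character-tavern.com/character/some-author/some-card")
# -> "https://cards.character-tavern.com/some-author/some-card.png?action=download"
```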


@@ -52,7 +52,7 @@ logit_bias_max = 512
 dry_seq_break_max = 128

 # global vars
-KcppVersion = "1.92.1"
+KcppVersion = "1.93"
 showdebug = True
 kcpp_instance = None #global running instance
 global_memory = {"tunnel_url": "", "restart_target":"", "input_to_exit":False, "load_complete":False}


@@ -1,112 +0,0 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
this is 🦙.cpp
__ggml_vocab_test__
w048 7tuijk dsdfhu
__ggml_vocab_test__
нещо на Български
__ggml_vocab_test__
កាន់តែពិសេសអាចខលចេញ
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
Hello
__ggml_vocab_test__
(
__ggml_vocab_test__
=
__ggml_vocab_test__
' era
__ggml_vocab_test__
Hello, y'all! How are you 😁 ?我想在apple工作1314151天
__ggml_vocab_test__
!!!!!!
__ggml_vocab_test__
3
__ggml_vocab_test__
33
__ggml_vocab_test__
333
__ggml_vocab_test__
3333
__ggml_vocab_test__
33333
__ggml_vocab_test__
333333
__ggml_vocab_test__
3333333
__ggml_vocab_test__
33333333
__ggml_vocab_test__
333333333
__ggml_vocab_test__
Cửa Việt
__ggml_vocab_test__
discards
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
__ggml_vocab_test__


@@ -1,46 +0,0 @@
17 297 201 78660 21775
72805 4097 56
35378 8999
35378 8999
35378 6661
35378 6661
35378 6661 38
35378 4 8999 38
35378 4 8999 38
903 83 6 3 5 238 6366
148 7709 1019 361 458 134362 104 7 71 420 1132
14271 29 117152
6 149561 78270 48967 64254 7616 81705
6 247206 15 33176 16 6 247442 6 3 15755 15 144227 8705 18255 40292 158 4460 33 27686 16 6 142325 15 191 538 28 121505 450 1556 6863 10002 47 1098 16
35378
35378
35378
35378
35378
35378 35378
15
2203
242 1615
35378 4 113 25 5584 38 11249 621 398 6 201344 705 23638 213 9007 133 1879 2681 2592 135224 1906 6087
6 90827
138
3912
6 66000
138 66000
3912 66000
6 66000 66000
138 66000 66000
3912 66000 66000
6 66000 66000 66000
199152 3763
17116 99397
6 247206 15 33176 16 6 247442 6 3 15755 15 144227 8705 18255 40292 158 4460 33 27686 16 6 142325 6 3 138 3912 6 66000 138 66000 3912 66000 6 66000 66000 138 66000 66000 3912 66000 66000 80308 1031 5 363 138 27 363 6 149561 78270 48967 201344 705 23638 213 9007 133 1879 2681 2592 135224 1906 6087 6 110405 1369 69112 69112 69112 14271 29 117152 5106 4765 4765 1135 164721 164721 164721 58 58 58 58 2551 90827 32 85908 87 25 272 2809 242 18 18345 764 25 7 2685 4 242 11766 398 9077 32 242 594 959 9077 87 25 1181 3249 442 4 242 397 398 1884 3060 26156 32 1401 25 26455 10 25 141 866


@@ -1,62 +0,0 @@
{%- if tools %}
{{- '<|im_start|>system\n' }}
{%- if messages[0]['role'] == 'system' %}
{{- messages[0]['content'] }}
{%- else %}
{{- '' }}
{%- endif %}
{{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
{%- for tool in tools %}
{{- "\n" }}
{{- tool | tojson }}
{%- endfor %}
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
{%- if messages[0]['role'] == 'system' %}
{{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- for message in messages %}
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" and not message.tool_calls %}
{%- set content = message.content %}
{%- if not loop.last %}
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
{%- endif %}
{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" %}
{%- set content = message.content %}
{%- if not loop.last %}
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
{%- endif %}
{{- '<|im_start|>' + message.role }}
{%- if message.content %}
{{- '\n' + content }}
{%- endif %}
{%- for tool_call in message.tool_calls %}
{%- if tool_call.function is defined %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- '\n<tool_call>\n{"name": "' }}
{{- tool_call.name }}
{{- '", "arguments": ' }}
{{- tool_call.arguments | tojson }}
{{- '}\n</tool_call>' }}
{%- endfor %}
{{- '<|im_end|>\n' }}
{%- elif message.role == "tool" %}
{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
{{- '<|im_start|>user' }}
{%- endif %}
{{- '\n<tool_response>\n' }}
{{- message.content }}
{{- '\n</tool_response>' }}
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n<think>\n' }}
{%- endif %}


@@ -1,85 +0,0 @@
{%- if tools %}
{{- '<|im_start|>system\n' }}
{%- if messages[0].role == 'system' %}
{{- messages[0].content + '\n\n' }}
{%- endif %}
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
{%- for tool in tools %}
{{- "\n" }}
{{- tool | tojson }}
{%- endfor %}
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
{%- if messages[0].role == 'system' %}
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{%- for message in messages[::-1] %}
{%- set index = (messages|length - 1) - loop.index0 %}
{%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
{%- set ns.multi_step_tool = false %}
{%- set ns.last_query_index = index %}
{%- endif %}
{%- endfor %}
{%- for message in messages %}
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" %}
{%- set content = message.content %}
{%- set reasoning_content = '' %}
{%- if message.reasoning_content is defined and message.reasoning_content is not none %}
{%- set reasoning_content = message.reasoning_content %}
{%- else %}
{%- if '</think>' in message.content %}
{%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
{%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
{%- endif %}
{%- endif %}
{%- if loop.index0 > ns.last_query_index %}
{%- if loop.last or (not loop.last and reasoning_content) %}
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + content }}
{%- endif %}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + content }}
{%- endif %}
{%- if message.tool_calls %}
{%- for tool_call in message.tool_calls %}
{%- if (loop.first and content) or (not loop.first) %}
{{- '\n' }}
{%- endif %}
{%- if tool_call.function %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- '<tool_call>\n{"name": "' }}
{{- tool_call.name }}
{{- '", "arguments": ' }}
{%- if tool_call.arguments is string %}
{{- tool_call.arguments }}
{%- else %}
{{- tool_call.arguments | tojson }}
{%- endif %}
{{- '}\n</tool_call>' }}
{%- endfor %}
{%- endif %}
{{- '<|im_end|>\n' }}
{%- elif message.role == "tool" %}
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
{{- '<|im_start|>user' }}
{%- endif %}
{{- '\n<tool_response>\n' }}
{{- message.content }}
{{- '\n</tool_response>' }}
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- if enable_thinking is defined and enable_thinking is false %}
{{- '<think>\n\n</think>\n\n' }}
{%- endif %}
{%- endif %}


@@ -12,7 +12,7 @@
 #include "ggml-backend.h"
 #include "ggml.h"

-#include "json.hpp"
+#include <nlohmann/json.hpp>
 #include "zip.h"
 #include "gguf.h"


@@ -12,7 +12,7 @@
 #include "darts.h"
 #include "ggml_extend.hpp"

-#include "json.hpp"
+#include <nlohmann/json.hpp>
 #include "model.h"

 // Port from: https://github.com/google/sentencepiece/blob/master/src/unigram_model.h

File diff suppressed because it is too large.


@@ -174,6 +174,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_CONVNEXT_EMBEDDING_LENGTH, "%s.convnext.embedding_length" },
     { LLM_KV_CONVNEXT_BLOCK_COUNT,      "%s.convnext.block_count"      },

+    { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
+
     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
     { LLM_KV_TOKENIZER_PRE,   "tokenizer.ggml.pre" },
     { LLM_KV_TOKENIZER_LIST,  "tokenizer.ggml.tokens" },
@@ -448,6 +450,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_TOKEN_TYPES,   "token_types" },
            { LLM_TENSOR_POS_EMBD,      "position_embd" },
            { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+           { LLM_TENSOR_ATTN_QKV,      "blk.%d.attn_qkv" },
            { LLM_TENSOR_ATTN_Q,        "blk.%d.attn_q" },
            { LLM_TENSOR_ATTN_K,        "blk.%d.attn_k" },
            { LLM_TENSOR_ATTN_V,        "blk.%d.attn_v" },


@@ -213,6 +213,8 @@ enum llm_kv {
     LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
     LLM_KV_CONVNEXT_BLOCK_COUNT,

+    LLM_KV_CLASSIFIER_OUTPUT_LABELS,
+
     // deprecated:
     LLM_KV_TOKENIZER_PREFIX_ID,
     LLM_KV_TOKENIZER_SUFFIX_ID,


@@ -131,6 +131,9 @@ struct llama_hparams {
     bool attn_soft_cap = false;
     bool use_kq_norm = true;

+    // for Classifiers
+    uint32_t n_cls_out = 1;
+
     // llama4
     uint32_t n_moe_layer_step = 0;
     uint32_t n_no_rope_layer_step = 4;
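
For context on the new n_cls_out field: it is filled from the array length of the classifier label key added in llama-arch above. A hedged gguf-py sketch of inspecting that key (file name, arch prefix, and label values are made up; "%s" in the C++ table stands for the architecture name):

```python
import gguf

reader = gguf.GGUFReader("classifier-model.gguf")  # hypothetical file

field = reader.get_field("bert.classifier.output_labels")
if field is not None:
    labels = field.contents()   # e.g. ["negative", "neutral", "positive"]
    print(len(labels), labels)  # the array length is what ends up in n_cls_out
else:
    print("no classifier labels; n_cls_out keeps its default of 1")
```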


@@ -688,6 +688,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
             ml.get_key(LLM_KV_ATTENTION_CAUSAL,        hparams.causal_attn);
             ml.get_key(LLM_KV_POOLING_TYPE,            hparams.pooling_type, false);
+            ml.get_arr_n(LLM_KV_CLASSIFIER_OUTPUT_LABELS, hparams.n_cls_out, false);

             switch (hparams.n_layer) {
                 case 3:
@@ -2209,7 +2210,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             case LLM_ARCH_NOMIC_BERT_MOE:
                 {
                     tok_embd  = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-                    type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
+                    type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);

                     if (arch == LLM_ARCH_BERT) {
                         pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
@@ -2217,8 +2218,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         cls   = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
                         cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"),   {n_embd},         TENSOR_NOT_REQUIRED);

-                        cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
-                        cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"),   {1},         TENSOR_NOT_REQUIRED);
+                        cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+                        cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"),   {hparams.n_cls_out},         TENSOR_NOT_REQUIRED);
                     }

                     tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
@@ -2227,7 +2228,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];

-                        if (arch == LLM_ARCH_BERT) {
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa},         TENSOR_NOT_REQUIRED);
+
+                        if (!layer.wqkv) {
                             layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                             layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
@@ -2236,12 +2240,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                             layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                             layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
-                        } else {
-                            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
-                        }
-
-                        if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
-                            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
                         }

                         layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
@@ -5987,8 +5985,10 @@ struct llm_build_bert : public llm_graph_context {
         inpL = build_inp_embd(model.tok_embd);

         // token types are hardcoded to zero ("Sentence A")
-        ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
-        inpL = ggml_add(ctx0, inpL, type_row0);
+        if (model.type_embd) {
+            ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
+            inpL = ggml_add(ctx0, inpL, type_row0);
+        }

         if (model.arch == LLM_ARCH_BERT) {
             inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
         }
@@ -6009,36 +6009,11 @@ struct llm_build_bert : public llm_graph_context {
             ggml_tensor * Vcur;

             // self-attention
-            if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
-                Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
-                if (model.layers[il].attn_q_norm) {
-                    Qcur = build_norm(Qcur,
-                            model.layers[il].attn_q_norm,
-                            model.layers[il].attn_q_norm_b,
-                            LLM_NORM, il);
-                }
-                Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
-                if (model.layers[il].attn_k_norm) {
-                    Kcur = build_norm(Kcur,
-                            model.layers[il].attn_k_norm,
-                            model.layers[il].attn_k_norm_b,
-                            LLM_NORM, il);
-                }
-                Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-            } else {
-                // compute Q and K and RoPE them
+            if (model.layers[il].wqkv) {
                 cur = build_lora_mm(model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);

-                if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+                if (model.layers[il].bqkv) {
                     cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
                     cb(cur, "bqkv", il);
                 }
@@ -6046,11 +6021,32 @@ struct llm_build_bert : public llm_graph_context {
                 Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
                 Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
                 Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+            } else {
+                Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, cur), model.layers[il].bq);
+                Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, cur), model.layers[il].bk);
+                Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, cur), model.layers[il].bv);
+            }

-            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+            if (model.layers[il].attn_q_norm) {
+                Qcur = build_norm(Qcur,
+                        model.layers[il].attn_q_norm,
+                        model.layers[il].attn_q_norm_b,
+                        LLM_NORM, il);
+            }
+
+            if (model.layers[il].attn_k_norm) {
+                Kcur = build_norm(Kcur,
+                        model.layers[il].attn_k_norm,
+                        model.layers[il].attn_k_norm_b,
+                        LLM_NORM, il);
+            }
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            // RoPE
+            if (model.arch == LLM_ARCH_NOMIC_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
                 Qcur = ggml_rope_ext(
                         ctx0, Qcur, inp_pos, nullptr,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -13366,7 +13362,6 @@ llm_graph_result_ptr llama_model::build_graph(
     switch (arch) {
         case LLM_ARCH_LLAMA:
-        case LLM_ARCH_MINICPM:
             {
                 llm = std::make_unique<llm_build_llama>(*this, params, gf);
             } break;
@@ -13607,6 +13602,7 @@ llm_graph_result_ptr llama_model::build_graph(
             } break;
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_MINICPM:
             {
                 llm = std::make_unique<llm_build_granite>(*this, params, gf);
             } break;


@@ -1,355 +0,0 @@
// Tests chat handling, including grammar generation and parsing for tool calling, for various templates.
//
// Also acts as a CLI to generate a Markdown summary of the formats of Jinja templates,
// e.g. given Minja (http://github.com/google/minja) checked out in parent dir:
//
// cmake -B build && cmake --build build --parallel && ./build/bin/test-chat ../minja/build/tests/*.jinja 2>/dev/null
//
#include <exception>
#include <iostream>
#include <json.hpp>
#include <string>
#include "chat-parser.h"
#include "common.h"
#include "log.h"
#include "regex-partial.h"
using json = nlohmann::ordered_json;
template <class T>
static void assert_equals(const T & expected, const T & actual) {
if (expected != actual) {
std::cerr << "Expected: " << expected << std::endl;
std::cerr << "Actual: " << actual << std::endl;
std::cerr << std::flush;
throw std::runtime_error("Test failed");
}
}
static void assert_equals(const char * expected, const std::string & actual) {
return assert_equals<std::string>(expected, actual);
}
static void assert_throws(const std::function<void()> & fn, const std::string & expected_exception_pattern = "") {
try {
fn();
} catch (const std::exception & e) {
if (expected_exception_pattern.empty()) {
return;
}
std::regex expected_exception_regex(expected_exception_pattern);
std::string actual_message = e.what();
if (std::regex_search(actual_message, expected_exception_regex)) {
return;
}
throw std::runtime_error("Exception doesn't match expected pattern: " + actual_message + " (pattern: " + expected_exception_pattern + ")");
throw std::runtime_error("Exception of unexpected type: " + std::string(e.what()));
}
throw std::runtime_error("Exception was expected but not thrown");
}
static void test_reasoning() {
{
common_chat_msg_parser builder("<tnk>Cogito</tnk>Ergo sum", /* is_partial= */ false, {
/* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
});
assert_equals(false, builder.try_parse_reasoning("<tnk>", "</tnk>"));
assert_equals("<tnk>Cogito</tnk>Ergo sum", builder.consume_rest());
}
{
common_chat_msg_parser builder("<tnk>Cogito</tnk>Ergo sum", /* is_partial= */ false, {
/* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
});
assert_equals(true, builder.try_parse_reasoning("<tnk>", "</tnk>"));
assert_equals(std::string("Cogito"), builder.result().reasoning_content);
assert_equals("Ergo sum", builder.consume_rest());
}
{
common_chat_msg_parser builder("Cogito</tnk>Ergo sum", /* is_partial= */ false, {
/* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ false,
});
assert_equals(false, builder.try_parse_reasoning("<tnk>", "</tnk>"));
assert_equals("Cogito</tnk>Ergo sum", builder.consume_rest());
}
{
common_chat_msg_parser builder("Cogito</tnk>Ergo sum", /* is_partial= */ false, {
/* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ false,
/* .thinking_forced_open = */ true,
});
assert_equals(true, builder.try_parse_reasoning("<tnk>", "</tnk>"));
assert_equals(std::string("Cogito"), builder.result().reasoning_content);
assert_equals("Ergo sum", builder.consume_rest());
}
{
common_chat_msg_parser builder("Cogito</tnk>Ergo sum", /* is_partial= */ false, {
/* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
/* .reasoning_in_content = */ true,
/* .thinking_forced_open = */ true,
});
assert_equals(true, builder.try_parse_reasoning("<tnk>", "</tnk>"));
assert_equals("<think>Cogito</think>", builder.result().content);
assert_equals("Ergo sum", builder.consume_rest());
}
}
static void test_regex() {
auto test_throws = [](const std::string & input, const std::string & regex, const std::string & expected_exception_pattern = "") {
common_chat_msg_parser builder(input, /* is_partial= */ false, {});
assert_throws([&]() { builder.consume_regex(common_regex(regex)); }, expected_exception_pattern);
};
test_throws("Hello, world!", "abc", "^abc$");
test_throws("Hello, world!", "e", "^e$");
{
common_chat_msg_parser builder("Hello, world!", /* is_partial= */ false, {});
builder.consume_regex(common_regex("Hello"));
assert_equals(", world!", builder.consume_rest());
}
{
// When in non partial mode, we can say whether the regex was consumed or not.
common_chat_msg_parser builder("Hello,", /* is_partial= */ false, {});
assert_equals(false, builder.try_consume_regex(common_regex("Hello, world!")).has_value());
}
{
common_chat_msg_parser builder("Hello,", /* is_partial= */ false, {});
auto res = builder.try_consume_regex(common_regex("H(el)l(?:o, world!)?"));
assert_equals(true, res.has_value());
// Verify captures
assert_equals<size_t>(2, res->groups.size());
assert_equals("Hell", builder.str(res->groups[0]));
assert_equals("el", builder.str(res->groups[1]));
// Verify position is after the match
assert_equals<size_t>(4, builder.pos());
assert_equals("o,", builder.consume_rest());
}
{
// But in partial mode, we have a partial final match / can't decide, so we throw a partial exception.
common_chat_msg_parser builder("Hello,", /* is_partial= */ true, {});
assert_throws([&]() {
builder.try_consume_regex(common_regex("Hello, world!"));
}, "^Hello, world!$");
}
// Now regardless of the mode, we can tell these aren't a match.
for (const auto is_partial : {false, true}) {
common_chat_msg_parser builder("Hello,", is_partial, {});
assert_equals(false, builder.try_consume_regex(common_regex("a(b|c)(d|e)f")).has_value());
}
for (const auto is_partial : {false, true}) {
common_chat_msg_parser builder("Hello,", is_partial, {});
assert_equals(false, builder.try_consume_literal("Oh"));
}
}
const std::vector<std::string> barely_healable_jsons = {
"{",
"{\"",
"{\"\\",
"{\"n",
"{\"name\"",
"{\"name\":",
"{\"name\":\"",
"{\"name\":\"\\",
"{\"name\":\"python",
"{\"name\":\"python\\",
"{\",",
"{\":",
"{\"[",
"{\"]",
"{\"{",
"{\"}",
"{\"1",
"{\"name\":\",",
"{\"name\":\":",
"{\"name\":\"[",
"{\"name\":\"]",
"{\"name\":\"{",
"{\"name\":\"}",
"{\"name\":\"1",
};
static void test(const std::string & input, bool is_partial, const std::vector<std::vector<std::string>> & args_paths, const std::vector<std::vector<std::string>> & content_paths, const std::string & expected) {
common_chat_msg_parser builder(input, is_partial, {});
auto js = builder.try_consume_json_with_dumped_args(args_paths, content_paths);
assert_equals(true, js.has_value());
assert_equals(is_partial, js->is_partial);
assert_equals(expected, args_paths.size() == 1 && args_paths[0].empty() ? js->value.get<std::string>() : js->value.dump());
}
static void test_with_args(const std::string & input, const std::string & expected, bool parse_as_partial = true, bool is_partial = true) {
common_chat_msg_parser builder(input, parse_as_partial, {});
auto js = builder.try_consume_json_with_dumped_args({{"args"}}, {});
assert_equals(true, js.has_value());
assert_equals(is_partial, js->is_partial);
assert_equals(expected, js->value.dump());
}
static void test_json_with_dumped_args_no_args() {
// Normal JSON, nothing to heal, nothing to dump
test("{\"name\": \"python\"}", false, {}, {}, "{\"name\":\"python\"}");
// Full json is args
test("{\"name\": \"python\"}", false, {{}}, {}, "{\"name\":\"python\"}");
// If the arguments are further down, don't heal partial content.
for (const auto & src : barely_healable_jsons) {
test(src, true, {{"arguments"}}, {}, "{}");
}
// But heal content that isn't partial.
test("{\"name\": \"python\"", true, {{"arguments"}}, {}, "{\"name\":\"python\"}");
}
static void test_json_with_dumped_args() {
// Partial content.
test("{\"content\": \"t", true, {}, {{"content"}}, "{\"content\":\"t\"}");
test("{\"content\": \"", true, {}, {{"content"}}, "{\"content\":\"\"}");
test("{\"content\": ", true, {}, {{"content"}}, "{}");
// If the entire JSON is the arguments, healing it them dumping it produces the same output as the input (just reformatted).
test("{\"name\": \"python", true, {{}}, {}, "{\"name\":\"python");
for (const auto & src : barely_healable_jsons) {
test(src, true, {{}}, {}, src);
}
// Full JSON w/ args
for (auto parse_as_partial : {true, false}) {
test_with_args(
R"({"name": "python", "args": {"arg1": 1}})",
R"({"name":"python","args":"{\"arg1\":1}"})",
parse_as_partial,
/* is_partial= */ false
);
}
// Partial JSON w/ partial args
test_with_args(
R"({"foo": "bar", "args": {")",
R"({"foo":"bar","args":"{\""})"
);
// Partial args broken in object key
test_with_args(
R"({"foo": "bar", "args": {"ar)",
R"({"foo":"bar","args":"{\"ar"})"
);
// Partial args broken after object key
test_with_args(
R"({"foo": "bar", "args": {"arg1")",
R"({"foo":"bar","args":"{\"arg1\""})"
);
// Partial args broken before object value
test_with_args(
R"({"foo": "bar", "args": {"arg1":)",
R"({"foo":"bar","args":"{\"arg1\":"})"
);
// Partial args broken before object value (space)
test_with_args(
R"({"foo": "bar", "args": {"arg1": )",
R"({"foo":"bar","args":"{\"arg1\":"})"
);
// Partial args broken in object value that may not be complete (int)
test_with_args(
R"({"foo": "bar", "args": {"arg1": 1)",
R"({"foo":"bar","args":"{\"arg1\":"})"
);
// Partial args broken in object value that is complete (int)
test_with_args(
R"({"foo": "bar", "args": {"arg1": 1 )",
R"({"foo":"bar","args":"{\"arg1\":1"})"
);
// Partial args broken in object value that is incomplete (string)
test_with_args(
R"({"foo": "bar", "args": {"arg1": ")",
R"({"foo":"bar","args":"{\"arg1\":\""})"
);
// Partial args broken in object value that is complete (string)
test_with_args(
R"({"foo": "bar", "args": {"arg1": "1")",
R"({"foo":"bar","args":"{\"arg1\":\"1\""})"
);
// Partial args broken on array opening
test_with_args(
R"({"foo": "bar", "args": [)",
R"({"foo":"bar","args":"["})"
);
// Partial args broken on array value that is incomplete (int)
test_with_args(
R"({"foo": "bar", "args": [1)",
R"({"foo":"bar","args":"["})"
);
// Partial args broken on array value that is complete (int)
test_with_args(
R"({"foo": "bar", "args": [1 )",
R"({"foo":"bar","args":"[1"})"
);
// Partial args broken on array value that is complete (string)
test_with_args(
R"({"foo": "bar", "args": ["1")",
R"({"foo":"bar","args":"[\"1\""})"
);
// Partial args broken after array value
test_with_args(
R"({"foo": "bar", "args": [1,)",
R"({"foo":"bar","args":"[1,"})"
);
// Partial args broken on nested array
test_with_args(
R"({"foo": "bar", "args": {"arg1": [)",
R"({"foo":"bar","args":"{\"arg1\":["})"
);
}
static void test_positions() {
{
common_chat_msg_parser builder("Hello, world!", /* is_partial= */ false, {});
assert_equals<size_t>(0, builder.pos());
assert_throws([&]() { builder.move_to(100); });
assert_equals<size_t>(0, builder.pos());
assert_throws([&]() { builder.move_back(1); });
assert_equals<size_t>(0, builder.pos());
builder.move_to(8);
assert_equals<size_t>(8, builder.pos());
builder.move_back(1);
assert_equals<size_t>(7, builder.pos());
assert_equals("world!", builder.consume_rest());
builder.move_to(0);
assert_equals<size_t>(0, builder.pos());
assert_throws([&]() { builder.finish(); });
assert_equals<size_t>(0, builder.pos());
builder.move_to(builder.input().size());
builder.finish();
}
{
common_chat_msg_parser builder("Hello, world!", /* is_partial= */ true, {});
builder.move_to(builder.input().size());
assert_equals<size_t>(builder.input().size(), builder.pos());
builder.finish();
}
}
int main() {
test_positions();
test_json_with_dumped_args_no_args();
test_json_with_dumped_args();
test_reasoning();
test_regex();
std::cout << "All tests passed!\n";
return 0;
}


@@ -1,237 +0,0 @@
#include "common.h"
#include "json-partial.h"
#include <exception>
#include <iostream>
#include <stdexcept>
template <class T> static void assert_equals(const T & expected, const T & actual) {
if (expected != actual) {
std::cerr << "Expected: " << expected << std::endl;
std::cerr << "Actual: " << actual << std::endl;
std::cerr << std::flush;
throw std::runtime_error("Test failed");
}
}
static void test_json_healing() {
auto parse = [](const std::string & str) {
std::cerr << "# Parsing: " << str << '\n';
std::string::const_iterator it = str.begin();
const auto end = str.end();
common_json out;
std::string healing_marker = "$llama.cpp.json$";
if (common_json_parse(it, end, healing_marker, out)) {
auto dump = out.json.dump();
std::cerr << "Parsed: " << dump << '\n';
std::cerr << "Magic: " << out.healing_marker.json_dump_marker << '\n';
std::string result;
if (!out.healing_marker.json_dump_marker.empty()) {
auto i = dump.find(out.healing_marker.json_dump_marker);
if (i == std::string::npos) {
throw std::runtime_error("Failed to find magic in dump " + dump + " (magic: " + out.healing_marker.json_dump_marker + ")");
}
result = dump.substr(0, i);
} else {
result = dump;
}
std::cerr << "Result: " << result << '\n';
if (string_starts_with(str, result)) {
std::cerr << "Failure!\n";
}
// return dump;
} else {
throw std::runtime_error("Failed to parse: " + str);
}
};
auto parse_all = [&](const std::string & str) {
for (size_t i = 1; i < str.size(); i++) {
parse(str.substr(0, i));
}
};
parse_all("{\"a\": \"b\"}");
parse_all("{\"hey\": 1, \"ho\\\"ha\": [1]}");
parse_all("[{\"a\": \"b\"}]");
auto test = [&](const std::vector<std::string> & inputs, const std::string & expected, const std::string & expected_marker) {
for (const auto & input : inputs) {
common_json out;
assert_equals(true, common_json_parse(input, "$foo", out));
assert_equals<std::string>(expected, out.json.dump());
assert_equals<std::string>(expected_marker, out.healing_marker.json_dump_marker);
}
};
// No healing needed:
test(
{
R"([{"a":"b"}, "y"])",
},
R"([{"a":"b"},"y"])",
""
);
// Partial literals can't be healed:
test(
{
R"([1)",
R"([tru)",
R"([n)",
R"([nul)",
R"([23.2)",
},
R"(["$foo"])",
R"("$foo)"
);
test(
{
R"({"a": 1)",
R"({"a": tru)",
R"({"a": n)",
R"({"a": nul)",
R"({"a": 23.2)",
},
R"({"a":"$foo"})",
R"("$foo)"
);
test(
{
R"({)",
},
R"({"$foo":1})",
R"("$foo)"
);
test(
{
R"([)",
},
R"(["$foo"])",
R"("$foo)"
);
// Healing right after a full literal
test(
{
R"(1 )",
},
R"(1)",
""
);
test(
{
R"(true)",
R"(true )",
},
R"(true)",
""
);
test(
{
R"(null)",
R"(null )",
},
R"(null)",
""
);
test(
{
R"([1 )",
},
R"([1,"$foo"])",
R"(,"$foo)"
);
test(
{
R"([{})",
R"([{} )",
},
R"([{},"$foo"])",
R"(,"$foo)"
);
test(
{
R"([true)",
},
// TODO: detect the true/false/null literal was complete
R"(["$foo"])",
R"("$foo)"
);
test(
{
R"([true )",
},
R"([true,"$foo"])",
R"(,"$foo)"
);
test(
{
R"([true,)",
},
R"([true,"$foo"])",
R"("$foo)"
);
// Test nesting
test(
{
R"([{"a": [{"b": [{)",
},
R"([{"a":[{"b":[{"$foo":1}]}]}])",
R"("$foo)"
);
test(
{
R"([{"a": [{"b": [)",
},
R"([{"a":[{"b":["$foo"]}]}])",
R"("$foo)"
);
test(
{
R"([{"a": "b"})",
R"([{"a": "b"} )",
},
R"([{"a":"b"},"$foo"])",
R"(,"$foo)"
);
test(
{
R"([{"a": "b"},)",
R"([{"a": "b"}, )",
},
R"([{"a":"b"},"$foo"])",
R"("$foo)"
);
test(
{
R"({ "code)",
},
R"({"code$foo":1})",
R"($foo)"
);
test(
{
R"({ "code\)",
},
R"({"code\\$foo":1})",
R"(\$foo)"
);
test(
{
R"({ "code")",
},
R"({"code":"$foo"})",
R"(:"$foo)"
);
test(
{
R"({ "key")",
},
R"({"key":"$foo"})",
R"(:"$foo)"
);
}
int main() {
test_json_healing();
std::cerr << "All tests passed.\n";
return 0;
}


@@ -1,288 +0,0 @@
// Tests common_regex (esp. its partial final matches support).
#include "common.h"
#include "regex-partial.h"
#include <sstream>
#include <iostream>
#include <optional>
template <class T> static void assert_equals(const T & expected, const T & actual) {
if (expected != actual) {
std::cerr << "Expected: " << expected << std::endl;
std::cerr << " Actual: " << actual << std::endl;
std::cerr << std::flush;
throw std::runtime_error("Test failed");
}
}
struct test_case {
std::string pattern;
struct input_output {
std::string input;
common_regex_match output;
};
std::vector<input_output> inputs_outputs;
};
static std::string common_regex_match_type_name(common_regex_match_type type) {
switch (type) {
case COMMON_REGEX_MATCH_TYPE_NONE:
return "COMMON_REGEX_MATCH_TYPE_NONE";
case COMMON_REGEX_MATCH_TYPE_PARTIAL:
return "COMMON_REGEX_MATCH_TYPE_PARTIAL";
case COMMON_REGEX_MATCH_TYPE_FULL:
return "COMMON_REGEX_MATCH_TYPE_FULL";
}
return "?";
}
static void test_regex() {
printf("[%s]\n", __func__);
auto test = [](const test_case & test_case) {
common_regex cr(test_case.pattern);
std::cout << "Testing pattern: /" << test_case.pattern << "/\n";
// std::cout << " partial rev: " << cr.reversed_partial_pattern.str() << '\n';
for (const auto & input_output : test_case.inputs_outputs) {
std::cout << " Input: " << input_output.input << '\n';
auto m = cr.search(input_output.input, 0);
if (m != input_output.output) {
auto match_to_str = [&](const std::optional<common_regex_match> & m) {
std::ostringstream ss;
if (m->type == COMMON_REGEX_MATCH_TYPE_NONE) {
ss << "<no match>";
} else {
GGML_ASSERT(!input_output.output.groups.empty());
std::vector<std::string> parts;
for (const auto & g : m->groups) {
parts.push_back("{" + std::to_string(g.begin) + ", " + std::to_string(g.end) + "}");
}
ss << "{" << common_regex_match_type_name(m->type) << ", {" << string_join(parts, ", ") << "}}";
}
return ss.str();
};
std::cout << " Expected: " << match_to_str(input_output.output) << '\n';
std::cout << " Got: " << match_to_str(m) << '\n';
std::cout << " Inverted pattern: /" << regex_to_reversed_partial_regex(test_case.pattern) << "/\n";
throw std::runtime_error("Test failed");
}
}
};
test({
"a",
{
{"a", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 1}}}},
{"b", {COMMON_REGEX_MATCH_TYPE_NONE, {}}},
{"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 1}}}},
{"ba", {COMMON_REGEX_MATCH_TYPE_FULL, {{1, 2}}}},
}
});
test({
"abcd",
{
{"abcd", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 4}}}},
{"abcde", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 4}}}},
{"abc", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}},
{"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
{"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
{"d", {}},
{"bcd", {}},
{"cde", {}},
{"cd", {}},
{"yeah ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{5, 7}}}},
{"abbie", {}},
{"", {}},
}
});
test({
".*?ab",
{
{"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
{"abc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
{"dab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
{"dabc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
{"da", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
{"d", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
}
});
test({
"a.*?b",
{
{"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
{"abc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
{"a b", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
{"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
{"argh", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 4}}}},
{"d", {}},
{"b", {}},
}
});
test({
"ab(?:cd){2,4}ef",
{
// {"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, 0, {}}},
{"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
{"abcd", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 4}}}},
{"abcde", {}},
{"abcdef", {}},
{"abcdcd", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
{"abcdcde", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 7}}}},
{"abcdcdef", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 8}}}},
{"abcdcdcdcdef", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 12}}}},
{"abcdcdcdcdcdef", {}},
{"abcde", {}},
{"yea", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{2, 3}}}},
}
});
test({
"a(?:rte| pure )fact",
{
{"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
{"art", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}},
{"artefa", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
{"fact", {}},
{"an arte", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{3, 7}}}},
{"artefact", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 8}}}},
{"an artefact", {COMMON_REGEX_MATCH_TYPE_FULL, {{3, 11}}}},
{"a pure", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
{"a pure fact", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 11}}}},
{"it's a pure fact", {COMMON_REGEX_MATCH_TYPE_FULL, {{5, 16}}}},
{"" , {}},
{"pure", {}},
{"pure fact", {}},
}
});
test({
"abc",
{
{" abcc", {COMMON_REGEX_MATCH_TYPE_FULL, {{1, 4}}}},
{"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
{"abc", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
{" ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{1, 3}}}},
{"a", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 1}}}},
{"b", {}},
{"c", {}},
{"", {}},
}
});
test({
"(?:abc)?\\s*def",
{
{"ab", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
{"abc", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}},
{"abc ", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 4}}}},
{"abc d", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 5}}}},
{"abc de", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
{"abc def", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 7}}}},
{"abc defg", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 7}}}},
{"abc defgh", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 7}}}},
{"abcde", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 5}}}},
{"abcdefgh", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 6}}}},
{" d", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 2}}}},
{"def", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 3}}}},
}
});
test({
"a+b",
{
{"aaab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 4}}}},
{"aaa", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 3}}}},
{"ab", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 2}}}},
}
});
test({
"(?:"
"(```(?:xml|json)?\\n\\s*)?" // match 1 (block_start)
"(" // match 2 (open_tag)
"<tool_call>"
"|<function_call>"
"|<tool>"
"|<tools>"
"|<response>"
"|<json>"
"|<xml>"
"|<JSON>"
")?"
"(\\s*\\{\\s*\"name\"\\s*:)" // match 3 (named tool call)
")"
"|<function=([^>]+)>" // match 4 (function name)
"|<function name=\"([^\"]+)\">", // match 5 (function name again)
{
{"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 8}, {54, 54}, {54, 54}, {0, 8}, {54, 54}, {54, 54}}}},
{"<tool_call> {\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 18}}}},
{"<tool_call>{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 17}}}},
{"Let's call something\n<tool_call>{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{21, 38}}}},
{"Ok then<tool_call>{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{7, 24}}}},
{"{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{0, 6}}}},
{"Ok then{\"name", {COMMON_REGEX_MATCH_TYPE_PARTIAL, {{7, 13}}}},
{"<tool_call> {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 20}, {66, 66}, {0, 11}, {11, 20}, {66, 66}, {66, 66}}}},
{"<function_call> {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 24}, {70, 70}, {0, 15}, {15, 24}, {70, 70}, {70, 70}}}},
{"<function name=\"special_function\"> {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 34}, {89, 89}, {89, 89}, {89, 89}, {89, 89}, {16, 32}}}},
{"<function=all>", {COMMON_REGEX_MATCH_TYPE_FULL, {{0, 14}, {14, 14}, {14, 14}, {14, 14}, {10, 13}, {14, 14}}}},
}
});
}
static void test_regex_to_reversed_partial_regex() {
printf("[%s]\n", __func__);
assert_equals<std::string>(
"((?:(?:c)?b)?a)[\\s\\S]*",
regex_to_reversed_partial_regex("abc"));
assert_equals<std::string>(
"(a+)[\\s\\S]*",
regex_to_reversed_partial_regex("a+"));
assert_equals<std::string>(
"(a*)[\\s\\S]*",
regex_to_reversed_partial_regex("a*"));
assert_equals<std::string>(
"(a?)[\\s\\S]*",
regex_to_reversed_partial_regex("a?"));
assert_equals<std::string>(
"([a-z])[\\s\\S]*",
regex_to_reversed_partial_regex("[a-z]"));
assert_equals<std::string>(
"((?:\\w+)?[a-z])[\\s\\S]*",
regex_to_reversed_partial_regex("[a-z]\\w+"));
assert_equals<std::string>(
"((?:a|b))[\\s\\S]*",
regex_to_reversed_partial_regex("(?:a|b)"));
assert_equals<std::string>(
"((?:(?:(?:d)?c)?b)?a)[\\s\\S]*",
regex_to_reversed_partial_regex("abcd"));
assert_equals<std::string>(
"((?:b)?a*)[\\s\\S]*", // TODO: ((?:b)?a*+).* ??
regex_to_reversed_partial_regex("a*b"));
assert_equals<std::string>(
"((?:(?:b)?a)?.*)[\\s\\S]*",
regex_to_reversed_partial_regex(".*?ab"));
assert_equals<std::string>(
"((?:(?:b)?.*)?a)[\\s\\S]*",
regex_to_reversed_partial_regex("a.*?b"));
assert_equals<std::string>(
"((?:(?:d)?(?:(?:c)?b))?a)[\\s\\S]*",
regex_to_reversed_partial_regex("a(bc)d"));
assert_equals<std::string>(
"((?:(?:(?:c)?b|(?:e)?d))?a)[\\s\\S]*",
regex_to_reversed_partial_regex("a(bc|de)"));
assert_equals<std::string>(
"((?:(?:(?:(?:(?:c)?b?)?b?)?b)?b)?a)[\\s\\S]*",
regex_to_reversed_partial_regex("ab{2,4}c"));
}
int main() {
test_regex_to_reversed_partial_regex();
test_regex();
std::cout << "All tests passed.\n";
}


@@ -27,10 +27,10 @@
 #define MA_NO_ENGINE
 #define MA_NO_GENERATION
 #define MA_API static
-#include "vendor/miniaudio.h"
+#include "miniaudio/miniaudio.h"

 #define STB_IMAGE_IMPLEMENTATION
-#include "vendor/stb_image.h"
+#include "stb/stb_image.h"

 #define LOG_INF(...) fprintf(stdout, __VA_ARGS__)
 #define LOG_ERR(...) fprintf(stderr, __VA_ARGS__)


@@ -11,9 +11,6 @@
 #include "mtmd.h"
 #include "mtmd-helper.h"

-// Change JSON_ASSERT from assert() to GGML_ASSERT:
-#define JSON_ASSERT GGML_ASSERT
-#include "json.hpp"

 // mime type for sending response
 #define MIMETYPE_JSON "application/json; charset=utf-8"


@@ -7,17 +7,16 @@
 #include "base64.hpp"
 #include "mtmd.h"
 #include "mtmd-helper.h"
+#include "chat.h"

 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
 // disable Nagle's algorithm
 #define CPPHTTPLIB_TCP_NODELAY true
-#include "httplib.h"
+#include <cpp-httplib/httplib.h>

-// Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
-#include "json.hpp"
-#include "chat.h"
+#include <nlohmann/json.hpp>

 #include <random>
 #include <sstream>


@@ -5,7 +5,9 @@
 #include "sampling.h"
 #include "log.h"
 #include "llama.h"
-#include "json.hpp"
+
+#define JSON_ASSERT GGML_ASSERT
+#include <nlohmann/json.hpp>

 #include <algorithm>
 #include <cmath>


@@ -8,7 +8,7 @@
 #ifndef CPPHTTPLIB_HTTPLIB_H
 #define CPPHTTPLIB_HTTPLIB_H
-#define CPPHTTPLIB_VERSION "0.20.0"
+#define CPPHTTPLIB_VERSION "0.20.1"
 /*
  * Configuration
@@ -145,6 +145,10 @@
 #define CPPHTTPLIB_LISTEN_BACKLOG 5
 #endif
+#ifndef CPPHTTPLIB_MAX_LINE_LENGTH
+#define CPPHTTPLIB_MAX_LINE_LENGTH 32768
+#endif
 /*
  * Headers
  */
@@ -3067,6 +3071,11 @@ inline bool stream_line_reader::getline() {
 #endif
   for (size_t i = 0;; i++) {
+    if (size() >= CPPHTTPLIB_MAX_LINE_LENGTH) {
+      // Treat exceptionally long lines as an error to
+      // prevent infinite loops/memory exhaustion
+      return false;
+    }
     char byte;
     auto n = strm_.read(&byte, 1);
@@ -6055,6 +6064,8 @@ inline void calc_actual_timeout(time_t max_timeout_msec, time_t duration_msec,
   auto actual_timeout_msec =
       (std::min)(max_timeout_msec - duration_msec, timeout_msec);
+  if (actual_timeout_msec < 0) { actual_timeout_msec = 0; }
   actual_timeout_sec = actual_timeout_msec / 1000;
   actual_timeout_usec = (actual_timeout_msec % 1000) * 1000;
 }
@@ -7327,8 +7338,9 @@ Server::process_request(Stream &strm, const std::string &remote_addr,
   }
   // Setup `is_connection_closed` method
-  req.is_connection_closed = [&]() {
-    return !detail::is_socket_alive(strm.socket());
+  auto sock = strm.socket();
+  req.is_connection_closed = [sock]() {
+    return !detail::is_socket_alive(sock);
   };
   // Routing
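The new CPPHTTPLIB_MAX_LINE_LENGTH guard above is wrapped in #ifndef, so it can be overridden at compile time before the header is included. A hedged consumer-side sketch follows; the server URL, the /health endpoint and the 65536 value are invented for illustration, and only the macro name and the new vendored include path come from this diff.

// Hypothetical consumer code, not part of this commit.
// Raise the line-length cap before pulling in the vendored header; with the
// 0.20.1 change, an over-long line now makes the read fail instead of looping.
#define CPPHTTPLIB_MAX_LINE_LENGTH 65536
#include <cpp-httplib/httplib.h>

int main() {
    httplib::Client cli("http://localhost:8080"); // hypothetical local server
    auto res = cli.Get("/health");                // hypothetical endpoint
    return (res && res->status == 200) ? 0 : 1;
}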

View file

@@ -22,7 +22,7 @@
 #include <string>
 #include <vector>
-#include <json.hpp>
+#include <nlohmann/json.hpp>
 using json = nlohmann::ordered_json;

View file

@@ -29,7 +29,7 @@
 #include <utility>
 #include <vector>
-#include <json.hpp>
+#include <nlohmann/json.hpp>
 using json = nlohmann::ordered_json;

File diff suppressed because it is too large

vendor/nlohmann/json_fwd.hpp (new vendored file, 187 lines added)
View file

@@ -0,0 +1,187 @@
// __ _____ _____ _____
// __| | __| | | | JSON for Modern C++
// | | |__ | | | | | | version 3.12.0
// |_____|_____|_____|_|___| https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013 - 2025 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT
#ifndef INCLUDE_NLOHMANN_JSON_FWD_HPP_
#define INCLUDE_NLOHMANN_JSON_FWD_HPP_
#include <cstdint> // int64_t, uint64_t
#include <map> // map
#include <memory> // allocator
#include <string> // string
#include <vector> // vector
// #include <nlohmann/detail/abi_macros.hpp>
// __ _____ _____ _____
// __| | __| | | | JSON for Modern C++
// | | |__ | | | | | | version 3.12.0
// |_____|_____|_____|_|___| https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013 - 2025 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT
// This file contains all macro definitions affecting or depending on the ABI
#ifndef JSON_SKIP_LIBRARY_VERSION_CHECK
#if defined(NLOHMANN_JSON_VERSION_MAJOR) && defined(NLOHMANN_JSON_VERSION_MINOR) && defined(NLOHMANN_JSON_VERSION_PATCH)
#if NLOHMANN_JSON_VERSION_MAJOR != 3 || NLOHMANN_JSON_VERSION_MINOR != 12 || NLOHMANN_JSON_VERSION_PATCH != 0
#warning "Already included a different version of the library!"
#endif
#endif
#endif
#define NLOHMANN_JSON_VERSION_MAJOR 3 // NOLINT(modernize-macro-to-enum)
#define NLOHMANN_JSON_VERSION_MINOR 12 // NOLINT(modernize-macro-to-enum)
#define NLOHMANN_JSON_VERSION_PATCH 0 // NOLINT(modernize-macro-to-enum)
#ifndef JSON_DIAGNOSTICS
#define JSON_DIAGNOSTICS 0
#endif
#ifndef JSON_DIAGNOSTIC_POSITIONS
#define JSON_DIAGNOSTIC_POSITIONS 0
#endif
#ifndef JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON
#define JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON 0
#endif
#if JSON_DIAGNOSTICS
#define NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS _diag
#else
#define NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS
#endif
#if JSON_DIAGNOSTIC_POSITIONS
#define NLOHMANN_JSON_ABI_TAG_DIAGNOSTIC_POSITIONS _dp
#else
#define NLOHMANN_JSON_ABI_TAG_DIAGNOSTIC_POSITIONS
#endif
#if JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON
#define NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON _ldvcmp
#else
#define NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON
#endif
#ifndef NLOHMANN_JSON_NAMESPACE_NO_VERSION
#define NLOHMANN_JSON_NAMESPACE_NO_VERSION 0
#endif
// Construct the namespace ABI tags component
#define NLOHMANN_JSON_ABI_TAGS_CONCAT_EX(a, b, c) json_abi ## a ## b ## c
#define NLOHMANN_JSON_ABI_TAGS_CONCAT(a, b, c) \
NLOHMANN_JSON_ABI_TAGS_CONCAT_EX(a, b, c)
#define NLOHMANN_JSON_ABI_TAGS \
NLOHMANN_JSON_ABI_TAGS_CONCAT( \
NLOHMANN_JSON_ABI_TAG_DIAGNOSTICS, \
NLOHMANN_JSON_ABI_TAG_LEGACY_DISCARDED_VALUE_COMPARISON, \
NLOHMANN_JSON_ABI_TAG_DIAGNOSTIC_POSITIONS)
// Construct the namespace version component
#define NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT_EX(major, minor, patch) \
_v ## major ## _ ## minor ## _ ## patch
#define NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT(major, minor, patch) \
NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT_EX(major, minor, patch)
#if NLOHMANN_JSON_NAMESPACE_NO_VERSION
#define NLOHMANN_JSON_NAMESPACE_VERSION
#else
#define NLOHMANN_JSON_NAMESPACE_VERSION \
NLOHMANN_JSON_NAMESPACE_VERSION_CONCAT(NLOHMANN_JSON_VERSION_MAJOR, \
NLOHMANN_JSON_VERSION_MINOR, \
NLOHMANN_JSON_VERSION_PATCH)
#endif
// Combine namespace components
#define NLOHMANN_JSON_NAMESPACE_CONCAT_EX(a, b) a ## b
#define NLOHMANN_JSON_NAMESPACE_CONCAT(a, b) \
NLOHMANN_JSON_NAMESPACE_CONCAT_EX(a, b)
#ifndef NLOHMANN_JSON_NAMESPACE
#define NLOHMANN_JSON_NAMESPACE \
nlohmann::NLOHMANN_JSON_NAMESPACE_CONCAT( \
NLOHMANN_JSON_ABI_TAGS, \
NLOHMANN_JSON_NAMESPACE_VERSION)
#endif
#ifndef NLOHMANN_JSON_NAMESPACE_BEGIN
#define NLOHMANN_JSON_NAMESPACE_BEGIN \
namespace nlohmann \
{ \
inline namespace NLOHMANN_JSON_NAMESPACE_CONCAT( \
NLOHMANN_JSON_ABI_TAGS, \
NLOHMANN_JSON_NAMESPACE_VERSION) \
{
#endif
#ifndef NLOHMANN_JSON_NAMESPACE_END
#define NLOHMANN_JSON_NAMESPACE_END \
} /* namespace (inline namespace) NOLINT(readability/namespace) */ \
} // namespace nlohmann
#endif
/*!
@brief namespace for Niels Lohmann
@see https://github.com/nlohmann
@since version 1.0.0
*/
NLOHMANN_JSON_NAMESPACE_BEGIN
/*!
@brief default JSONSerializer template argument
This serializer ignores the template arguments and uses ADL
([argument-dependent lookup](https://en.cppreference.com/w/cpp/language/adl))
for serialization.
*/
template<typename T = void, typename SFINAE = void>
struct adl_serializer;
/// a class to store JSON values
/// @sa https://json.nlohmann.me/api/basic_json/
template<template<typename U, typename V, typename... Args> class ObjectType =
std::map,
template<typename U, typename... Args> class ArrayType = std::vector,
class StringType = std::string, class BooleanType = bool,
class NumberIntegerType = std::int64_t,
class NumberUnsignedType = std::uint64_t,
class NumberFloatType = double,
template<typename U> class AllocatorType = std::allocator,
template<typename T, typename SFINAE = void> class JSONSerializer =
adl_serializer,
class BinaryType = std::vector<std::uint8_t>, // cppcheck-suppress syntaxError
class CustomBaseClass = void>
class basic_json;
/// @brief JSON Pointer defines a string syntax for identifying a specific value within a JSON document
/// @sa https://json.nlohmann.me/api/json_pointer/
template<typename RefStringType>
class json_pointer;
/*!
@brief default specialization
@sa https://json.nlohmann.me/api/json/
*/
using json = basic_json<>;
/// @brief a minimal map-like container that preserves insertion order
/// @sa https://json.nlohmann.me/api/ordered_map/
template<class Key, class T, class IgnoredLess, class Allocator>
struct ordered_map;
/// @brief specialization that maintains the insertion order of object keys
/// @sa https://json.nlohmann.me/api/ordered_json/
using ordered_json = basic_json<nlohmann::ordered_map>;
NLOHMANN_JSON_NAMESPACE_END
#endif // INCLUDE_NLOHMANN_JSON_FWD_HPP_
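For context, a hedged sketch of what this forward-declaration header is for: a header can declare interfaces in terms of nlohmann::json without pulling in the full json.hpp, which only the implementation file includes. The file and function names below are invented for illustration; only the include paths and the dump() call come from the nlohmann API.

// ---- summarize.h (illustrative) ----
#include <nlohmann/json_fwd.hpp>
#include <string>

// nlohmann::json is an incomplete type here; references and pointers are fine.
std::string summarize_config(const nlohmann::json & config);

// ---- summarize.cpp (illustrative) ----
#include <nlohmann/json.hpp>

std::string summarize_config(const nlohmann::json & config) {
    // dump() needs the complete basic_json type, hence the full include here.
    return config.dump(2);
}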