Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/nix/package.nix
#	.github/labeler.yml
#	.gitignore
#	CMakeLists.txt
#	Makefile
#	Package.swift
#	README.md
#	ci/run.sh
#	docs/build.md
#	examples/CMakeLists.txt
#	flake.lock
#	ggml/CMakeLists.txt
#	ggml/src/CMakeLists.txt
#	grammars/README.md
#	requirements/requirements-convert_hf_to_gguf.txt
#	requirements/requirements-convert_hf_to_gguf_update.txt
#	scripts/check-requirements.sh
#	scripts/compare-llama-bench.py
#	scripts/gen-unicode-data.py
#	scripts/sync-ggml-am.sh
#	scripts/sync-ggml.last
#	scripts/sync-ggml.sh
#	tests/test-backend-ops.cpp
#	tests/test-chat-template.cpp
#	tests/test-tokenizer-random.py
commit 2cad736260
Concedo, 2024-07-11 16:36:16 +08:00
85 changed files with 12568 additions and 445 deletions

.github/workflows/python-type-check.yml (new file)
@@ -0,0 +1,38 @@
name: Python Type-Check

on:
  push:
    paths:
      - '.github/workflows/python-type-check.yml'
      - '**.py'
      - '**/requirements*.txt'
  pull_request:
    paths:
      - '.github/workflows/python-type-check.yml'
      - '**.py'
      - '**/requirements*.txt'

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  python-type-check:
    runs-on: ubuntu-latest
    name: pyright type-check
    steps:
      - name: Check out source repository
        uses: actions/checkout@v4
      - name: Set up Python environment
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install Python dependencies
        # TODO: use a venv
        run: pip install -r requirements/requirements-all.txt
      - name: Type-check with Pyright
        uses: jakebailey/pyright-action@v2
        with:
          version: 1.1.370
          level: warning
          warnings: true
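The workflow above only runs in CI. As a hedged local approximation (assuming the `pyright` wrapper published on PyPI and the type-check configuration already present in the repo; the workflow itself uses `jakebailey/pyright-action@v2` instead), roughly the same check could be reproduced with:

```bash
# install project requirements plus a pinned pyright, then type-check the tree
pip install -r requirements/requirements-all.txt
pip install pyright==1.1.370
pyright --level warning
```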


@@ -430,8 +430,10 @@ add_library(ggml
            ggml/include/ggml-backend.h
            ggml/src/ggml-quants.c
            ggml/src/ggml-quants.h
-           ggml/src/sgemm.cpp
-           ggml/src/sgemm.h
+           ggml/src/llamafile/sgemm.cpp
+           ggml/src/llamafile/sgemm.h
+           ggml/src/ggml-aarch64.c
+           ggml/src/ggml-aarch64.h
            ${GGML_SOURCES_CUDA})
 target_include_directories(ggml PUBLIC . ./ggml/include ./ggml/src ./include ./otherarch ./otherarch/tools)
 target_compile_features(ggml PUBLIC c_std_11) # don't bump


@@ -65,9 +65,9 @@ endif
 CUBLASLD_FLAGS =
 CUBLAS_OBJS =
-OBJS_FULL += ggml-alloc.o ggml-quants.o unicode.o unicode-data.o sgemm.o common.o sampling.o grammar-parser.o
-OBJS_SIMPLE += ggml-alloc.o ggml-quants_noavx2.o unicode.o unicode-data.o sgemm_noavx2.o common.o sampling.o grammar-parser.o
-OBJS_FAILSAFE += ggml-alloc.o ggml-quants_failsafe.o unicode.o unicode-data.o sgemm_failsafe.o common.o sampling.o grammar-parser.o
+OBJS_FULL += ggml-alloc.o ggml-aarch64.o ggml-quants.o unicode.o unicode-data.o sgemm.o common.o sampling.o grammar-parser.o
+OBJS_SIMPLE += ggml-alloc.o ggml-aarch64.o ggml-quants_noavx2.o unicode.o unicode-data.o sgemm_noavx2.o common.o sampling.o grammar-parser.o
+OBJS_FAILSAFE += ggml-alloc.o ggml-aarch64.o ggml-quants_failsafe.o unicode.o unicode-data.o sgemm_failsafe.o common.o sampling.o grammar-parser.o
 #lets try enabling everything
 CFLAGS += -pthread -s -Wno-deprecated -Wno-deprecated-declarations

@@ -421,11 +421,11 @@ ggml-quants_failsafe.o: ggml/src/ggml-quants.c ggml/include/ggml.h ggml/src/ggml
 	$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
 #sgemm
-sgemm.o: ggml/src/sgemm.cpp ggml/src/sgemm.h ggml/include/ggml.h
+sgemm.o: ggml/src/llamafile/sgemm.cpp ggml/src/llamafile/sgemm.h ggml/include/ggml.h
 	$(CXX) $(CXXFLAGS) $(FULLCFLAGS) -c $< -o $@
-sgemm_noavx2.o: ggml/src/sgemm.cpp ggml/src/sgemm.h ggml/include/ggml.h
+sgemm_noavx2.o: ggml/src/llamafile/sgemm.cpp ggml/src/llamafile/sgemm.h ggml/include/ggml.h
 	$(CXX) $(CXXFLAGS) $(SIMPLECFLAGS) -c $< -o $@
-sgemm_failsafe.o: ggml/src/sgemm.cpp ggml/src/sgemm.h ggml/include/ggml.h
+sgemm_failsafe.o: ggml/src/llamafile/sgemm.cpp ggml/src/llamafile/sgemm.h ggml/include/ggml.h
 	$(CXX) $(CXXFLAGS) $(NONECFLAGS) -c $< -o $@
 #there's no intrinsics or special gpu ops used here, so we can have a universal object

@@ -437,6 +437,8 @@ unicode.o: src/unicode.cpp src/unicode.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 unicode-data.o: src/unicode-data.cpp src/unicode-data.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
+ggml-aarch64.o: ggml/src/ggml-aarch64.c ggml/include/ggml.h ggml/src/ggml-aarch64.h ggml/src/ggml-common.h
+	$(CC) $(CFLAGS) -c $< -o $@
 #these have special gpu defines
 ggml-backend_default.o: ggml/src/ggml-backend.c ggml/include/ggml.h ggml/include/ggml-backend.h


@@ -1,3 +1,7 @@
+#if defined(_MSC_VER)
+#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+#endif
 #include "common.h"
 #include "build-info.h"
 // Change JSON_ASSERT from assert() to GGML_ASSERT:

@@ -191,6 +195,12 @@ int32_t cpu_get_num_math() {
 // CLI argument parsing
 //
+void gpt_params_handle_hf_token(gpt_params & params) {
+    if (params.hf_token.empty() && std::getenv("HF_TOKEN")) {
+        params.hf_token = std::getenv("HF_TOKEN");
+    }
+}
 void gpt_params_handle_model_default(gpt_params & params) {
     if (!params.hf_repo.empty()) {
         // short-hand to avoid specifying --hf-file -> default it to --model

@@ -238,6 +248,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
     gpt_params_handle_model_default(params);
+    gpt_params_handle_hf_token(params);
     if (params.escape) {
         string_process_escapes(params.prompt);
         string_process_escapes(params.input_prefix);

@@ -653,6 +665,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.model_url = argv[i];
         return true;
     }
+    if (arg == "-hft" || arg == "--hf-token") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.hf_token = argv[i];
+        return true;
+    }
     if (arg == "-hfr" || arg == "--hf-repo") {
         CHECK_ARG
         params.hf_repo = argv[i];

@@ -1577,6 +1597,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" });
     options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" });
     options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" });
+    options.push_back({ "*", "-hft, --hf-token TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)" });
     options.push_back({ "retrieval" });
     options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" });

@@ -2016,9 +2037,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     llama_model * model = nullptr;
     if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-        model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams);
+        model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
     } else if (!params.model_url.empty()) {
-        model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
+        model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
     } else {
         model = llama_load_model_from_file(params.model.c_str(), mparams);
     }

@@ -2206,7 +2227,7 @@ static bool starts_with(const std::string & str, const std::string & prefix) {
     return str.rfind(prefix, 0) == 0;
 }
-static bool llama_download_file(const std::string & url, const std::string & path) {
+static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
     // Initialize libcurl
     std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);

@@ -2221,6 +2242,15 @@ static bool llama_download_file(const std::string & url, const std::string & pat
     curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
     curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
+    // Check if hf-token or bearer-token was specified
+    if (!hf_token.empty()) {
+        std::string auth_header = "Authorization: Bearer ";
+        auth_header += hf_token.c_str();
+        struct curl_slist *http_headers = NULL;
+        http_headers = curl_slist_append(http_headers, auth_header.c_str());
+        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
+    }
 #if defined(_WIN32)
     // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
     // operating system. Currently implemented under MS-Windows.

@@ -2416,6 +2446,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
 struct llama_model * llama_load_model_from_url(
         const char * model_url,
         const char * path_model,
+        const char * hf_token,
         const struct llama_model_params & params) {
     // Basic validation of the model_url
     if (!model_url || strlen(model_url) == 0) {

@@ -2423,7 +2454,7 @@ struct llama_model * llama_load_model_from_url(
         return NULL;
     }
-    if (!llama_download_file(model_url, path_model)) {
+    if (!llama_download_file(model_url, path_model, hf_token)) {
         return NULL;
     }

@@ -2471,14 +2502,14 @@ struct llama_model * llama_load_model_from_url(
         // Prepare download in parallel
         std::vector<std::future<bool>> futures_download;
         for (int idx = 1; idx < n_split; idx++) {
-            futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split](int download_idx) -> bool {
+            futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
                 char split_path[PATH_MAX] = {0};
                 llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
                 char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
                 llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
-                return llama_download_file(split_url, split_path);
+                return llama_download_file(split_url, split_path, hf_token);
             }, idx));
         }

@@ -2497,6 +2528,7 @@ struct llama_model * llama_load_model_from_hf(
         const char * repo,
         const char * model,
         const char * path_model,
+        const char * hf_token,
         const struct llama_model_params & params) {
     // construct hugging face model url:
     //

@@ -2512,7 +2544,7 @@ struct llama_model * llama_load_model_from_hf(
     model_url += "/resolve/main/";
     model_url += model;
-    return llama_load_model_from_url(model_url.c_str(), path_model, params);
+    return llama_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
 }
 #else

@@ -2520,6 +2552,7 @@ struct llama_model * llama_load_model_from_hf(
 struct llama_model * llama_load_model_from_url(
         const char * /*model_url*/,
         const char * /*path_model*/,
+        const char * /*hf_token*/,
         const struct llama_model_params & /*params*/) {
     fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
     return nullptr;

@@ -2529,6 +2562,7 @@ struct llama_model * llama_load_model_from_hf(
         const char * /*repo*/,
         const char * /*model*/,
         const char * /*path_model*/,
+        const char * /*hf_token*/,
         const struct llama_model_params & /*params*/) {
     fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
     return nullptr;
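Taken together, the hunks above thread an optional Hugging Face access token through the download path: `-hft`/`--hf-token` (or the `HF_TOKEN` environment variable picked up by `gpt_params_handle_hf_token()`) ends up as an `Authorization: Bearer` header in `llama_download_file()`. A hedged usage sketch, with hypothetical repository and file names:

```bash
# pass the token explicitly on the command line ...
./llama-cli -hfr some-org/some-gated-model -hff some-gated-model-q4_k_m.gguf -hft "$MY_HF_TOKEN" -p "Hello"

# ... or rely on the HF_TOKEN environment variable instead
export HF_TOKEN="$MY_HF_TOKEN"
./llama-cli -hfr some-org/some-gated-model -hff some-gated-model-q4_k_m.gguf -p "Hello"
```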


@@ -125,6 +125,7 @@ struct gpt_params {
     std::string model_draft = ""; // draft model for speculative decoding
     std::string model_alias = "unknown"; // model alias
     std::string model_url = ""; // model url to download
+    std::string hf_token = ""; // HF token
     std::string hf_repo = ""; // HF repo
     std::string hf_file = ""; // HF file
     std::string prompt = "";

@@ -273,6 +274,7 @@ struct gpt_params {
     bool spm_infill = false; // suffix/prefix/middle pattern for infill
 };
+void gpt_params_handle_hf_token(gpt_params & params);
 void gpt_params_handle_model_default(gpt_params & params);
 bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);

@@ -328,8 +330,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
-struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const struct llama_model_params & params);
-struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const struct llama_model_params & params);
+struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 // Batch utils


@@ -282,8 +282,6 @@ static llama_token llama_sampling_sample_impl(
         GGML_ASSERT(!original_logits.empty());
     }
     llama_token id = 0;
-    // Get a pointer to the logits
-    float * logits = llama_get_logits_ith(ctx_main, idx);
     if (temp < 0.0) {
         // greedy sampling, with probs

@@ -324,6 +322,9 @@ static llama_token llama_sampling_sample_impl(
     }
     if (ctx_sampling->grammar != NULL && !is_resampling) {
+        // Get a pointer to the logits
+        float * logits = llama_get_logits_ith(ctx_main, idx);
         // Create an array with a single token data element for the sampled id
         llama_token_data single_token_data = {id, logits[id], 0.0f};
         llama_token_data_array single_token_data_array = { &single_token_data, 1, false };

@@ -377,7 +378,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
     if (ctx_sampling->grammar != NULL && !apply_grammar) {
         GGML_ASSERT(original_logits != NULL);
         // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
-        *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
+        *original_logits = {logits, logits + n_vocab};
     }
     // apply params.logit_bias map

@@ -390,10 +391,10 @@ static llama_token_data_array llama_sampling_prepare_impl(
         llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
     }
-    cur.clear();
+    cur.resize(n_vocab);
     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
     }
     llama_token_data_array cur_p = { cur.data(), cur.size(), false };


@@ -265,7 +265,7 @@ class Model:
                     break
             for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
-                data: np.ndarray = data  # type hint
+                data: np.ndarray  # type hint
                 n_dims = len(data.shape)
                 data_dtype = data.dtype
                 data_qtype: gguf.GGMLQuantizationType | None = None

@@ -487,6 +487,9 @@
         if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
             # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
             res = "jina-v2-code"
+        if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
+            # ref: https://huggingface.co/THUDM/glm-4-9b-chat
+            res = "chatglm-bpe"
         if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
             # ref: https://huggingface.co/LumiOpen/Viking-7B
             res = "viking"

@@ -596,10 +599,6 @@ class Model:
         tokenizer_path = self.dir_model / 'tokenizer.model'
-        tokens: list[bytes] = []
-        scores: list[float] = []
-        toktypes: list[int] = []
         if not tokenizer_path.is_file():
             raise FileNotFoundError(f"File not found: {tokenizer_path}")

@@ -2117,7 +2116,7 @@ class InternLM2Model(Model):
             logger.error(f'Error: Missing {tokenizer_path}')
             sys.exit(1)
-        sentencepiece_model = model.ModelProto()
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
         sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
         add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix

@@ -2145,6 +2144,9 @@ class InternLM2Model(Model):
                 toktype = SentencePieceTokenTypes.UNUSED
             elif tokenizer.IsByte(token_id):
                 toktype = SentencePieceTokenTypes.BYTE
+            # take care of ununsed raw token
+            if piece.startswith('[UNUSED'):
+                toktype = SentencePieceTokenTypes.UNKNOWN
             tokens.append(text)
             scores.append(score)

@@ -2160,6 +2162,47 @@
                 scores.append(-1000.0)
                 toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
+        chat_eos_token = '<|im_end|>'
+        chat_eos_token_id = None
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, foken_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token = foken_data["content"]
+                    if token == chat_eos_token:
+                        chat_eos_token_id = token_id
+                    token = token.encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert(tokens[token_id] == token)
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+        tokenizer_file = self.dir_model / 'tokenizer.json'
+        if tokenizer_file.is_file():
+            with open(tokenizer_file, "r", encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+                added_tokens = tokenizer_json.get("added_tokens", [])
+                for foken_data in added_tokens:
+                    token_id = int(foken_data["id"])
+                    token = foken_data["content"]
+                    if token == chat_eos_token:
+                        chat_eos_token_id = token_id
+                    token = token.encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert(tokens[token_id] == token)
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)

@@ -2169,28 +2212,16 @@ class InternLM2Model(Model):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         old_eos = special_vocab.special_token_ids["eos"]
-        if "chat" in os.path.basename(self.dir_model.absolute()):
+        if chat_eos_token_id is not None:
             # For the chat model, we replace the eos with '<|im_end|>'.
             # TODO: this is a hack, should be fixed
             #       https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
-            special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
-            logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
-in chat mode so that the conversation can end normally.")
+            special_vocab.special_token_ids["eos"] = chat_eos_token_id
+            logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
+                           " in chat mode so that the conversation can end normally.")
         special_vocab.add_to_gguf(self.gguf_writer)
-    def _try_get_sft_eos(self, tokenizer):
-        unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]')
-        im_end_list = tokenizer.Encode('<|im_end|>')
-        eos_token = None
-        assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
-        if len(unused_145_list) == 1:
-            eos_token = unused_145_list[0]
-        if len(im_end_list) == 1:
-            eos_token = im_end_list[0]
-        assert eos_token
-        return eos_token
     def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
         if n_head_kv is not None and n_head != n_head_kv:
             n_head = n_head_kv

@@ -2209,6 +2240,10 @@ in chat mode so that the conversation can end normally.")
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
         self.gguf_writer.add_file_type(self.ftype)
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         num_heads = self.hparams["num_attention_heads"]

@@ -2969,7 +3004,7 @@ class T5Model(Model):
         if not tokenizer_path.is_file():
             raise FileNotFoundError(f"File not found: {tokenizer_path}")
-        sentencepiece_model = model.ModelProto()
+        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
         sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
         # some models like Pile-T5 family use BPE tokenizer instead of Unigram

@@ -3149,7 +3184,7 @@ class JaisModel(Model):
             # but Jais's PyTorch model simply precalculates the slope values and places them
            # in relative_pes.slopes
             n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"]))
-            first_val = float(data_torch._data[0])
+            first_val = float(data_torch[0].item())
             self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2)
             return tensors
@@ -3176,6 +3211,190 @@
         self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)
+
+@Model.register("ChatGLMModel", "ChatGLMForConditionalGeneration")
+class ChatGLMModel(Model):
+    model_arch = gguf.MODEL_ARCH.CHATGLM
+
+    def set_vocab_chatglm3(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[bytes] = []
+        toktypes: list[int] = []
+        scores: list[float] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+        role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
+        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
+        for token_id in range(vocab_size):
+            piece = tokenizer._convert_id_to_token(token_id)
+            if token_id == 0:
+                piece = "<unk>"
+            elif token_id == 1:
+                piece = "<bos>"
+            elif token_id == 2:
+                piece = "<eos>"
+
+            text = piece.encode("utf-8")
+            score = 0.0
+            # Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py),
+            # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size()
+            if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size():
+                score = tokenizer.tokenizer.sp_model.get_score(token_id)
+
+            if len(piece) == 0:
+                text = f"[PAD{token_id}]".encode("utf-8")
+
+            if token_id >= tokenizer.tokenizer.sp_model.vocab_size():
+                if piece in special_tokens:
+                    # show special tokens in prompt
+                    toktype = SentencePieceTokenTypes.USER_DEFINED
+                else:
+                    toktype = SentencePieceTokenTypes.UNKNOWN
+                tokens.append(text)
+                scores.append(score)
+                toktypes.append(toktype)
+                continue
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.tokenizer.sp_model.is_unknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.tokenizer.sp_model.is_control(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.tokenizer.sp_model.is_unused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.tokenizer.sp_model.is_byte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        # glm3 needs prefix and suffix formatted as:
+        # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>"
+        self.gguf_writer.add_tokenizer_pre("chatglm-spm")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    @staticmethod
+    def token_bytes_to_string(b):
+        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+        byte_encoder = bytes_to_unicode()
+        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
+
+    @staticmethod
+    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
+        parts = [bytes([b]) for b in token]
+        while True:
+            min_idx = None
+            min_rank = None
+            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
+                rank = mergeable_ranks.get(pair[0] + pair[1])
+                if rank is not None and (min_rank is None or rank < min_rank):
+                    min_idx = i
+                    min_rank = rank
+            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
+                break
+            assert min_idx is not None
+            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
+        return parts
+
+    def set_vocab(self):
+        if "THUDM/chatglm3-6b" in self.hparams.get("_name_or_path", ""):
+            self.set_vocab_chatglm3()
+            return
+
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams["padded_vocab_size"]
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        merges = []
+        vocab = {}
+        mergeable_ranks = tokenizer.mergeable_ranks
+        for token, rank in mergeable_ranks.items():
+            vocab[ChatGLMModel.token_bytes_to_string(token)] = rank
+            if len(token) == 1:
+                continue
+            merged = ChatGLMModel.bpe(mergeable_ranks, token, max_rank=rank)
+            assert len(merged) >= 2 and len(merged) <= 7
+            merges.append(' '.join(map(ChatGLMModel.token_bytes_to_string, merged)))
+
+        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
+        added_vocab = tokenizer.get_added_vocab()
+        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.USER_DEFINED)
+            elif reverse_vocab[i] in added_vocab:
+                tokens.append(reverse_vocab[i])
+                if tokenizer.added_tokens_decoder[i].special:
+                    toktypes.append(gguf.TokenType.CONTROL)
+                else:
+                    toktypes.append(gguf.TokenType.USER_DEFINED)
+            else:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.NORMAL)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
+        special_vocab.merges = merges
+        # only add special tokens when they were not already loaded from config.json
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        # this one is usually not in config.json anyway
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name(self.hparams["_name_or_path"].split("/")[1])  # THUDM/glm4-9b-chat or THUDM/chatglm3-6b
+        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+        n_head_kv = self.hparams.get("multi_query_group_num", n_head)
+        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
+        self.gguf_writer.add_embedding_length(n_embed)
+        self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_head_count_kv(n_head_kv)
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_rope_dimension_count(64)
+        self.gguf_writer.add_add_bos_token(False)
+        rope_freq = 10000
+        if "rope_ratio" in self.hparams:
+            rope_freq = rope_freq * self.hparams["rope_ratio"]
+        self.gguf_writer.add_rope_freq_base(rope_freq)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if name.endswith(".rotary_pos_emb.inv_freq"):
+            return []
+
+        name = name.removeprefix("transformer.")
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 ###### CONVERSION LOGIC ######
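A hedged example of exercising the new ChatGLM support added above (assuming a local checkout of the THUDM/glm-4-9b-chat weights and that the convert script's usual `--outfile`/`--outtype` options apply):

```bash
# convert a downloaded glm-4-9b-chat checkpoint to a GGUF file at f16 precision
python convert_hf_to_gguf.py /path/to/glm-4-9b-chat --outfile glm-4-9b-chat-f16.gguf --outtype f16
```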


@@ -354,7 +354,8 @@ class GGMLToGGUF:
 def handle_metadata(cfg, hp):
-    import convert
+    import examples.convert_legacy_llama as convert
     assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
     hf_config_path = cfg.model_metadata_dir / "config.json"
     orig_config_path = cfg.model_metadata_dir / "params.json"


@@ -353,7 +353,7 @@ class Metadata:
     version: Optional[str] = None
     url: Optional[str] = None
     description: Optional[str] = None
-    licence: Optional[str] = None
+    license: Optional[str] = None
     source_url: Optional[str] = None
     source_hf_repo: Optional[str] = None

@@ -492,12 +492,13 @@ class LazyTensor:
 LazyModel: TypeAlias = 'dict[str, LazyTensor]'
+ModelFormat: TypeAlias = Literal['ggml', 'torch', 'safetensors', 'none']
 @dataclass
 class ModelPlus:
     model: LazyModel
     paths: list[Path]  # Where this was read from.
-    format: Literal['ggml', 'torch', 'safetensors', 'none']
+    format: ModelFormat
     vocab: BaseVocab | None  # For GGML models (which have vocab built in), the vocab.

@@ -536,7 +537,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:
 def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
-    formats = set(mp.format for mp in models_plus)
+    formats: set[ModelFormat] = set(mp.format for mp in models_plus)
     assert len(formats) == 1, "different formats?"
     format = formats.pop()
     paths = [path for mp in models_plus for path in mp.paths]

@@ -555,7 +556,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
     else:
         model = merge_sharded([mp.model for mp in models_plus])
-    return ModelPlus(model, paths, format, vocab)  # pytype: disable=wrong-arg-types
+    return ModelPlus(model, paths, format, vocab)
 def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:

@@ -805,7 +806,7 @@ class OutputFile:
     def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
         self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
-    def add_meta_model(self, params: Params, metadata: Metadata) -> None:
+    def add_meta_model(self, params: Params, metadata: Metadata | None) -> None:
         # Metadata About The Model And Its Provenence
         name = "LLaMA"
         if metadata is not None and metadata.name is not None:

@@ -827,8 +828,8 @@ class OutputFile:
             self.gguf.add_url(metadata.url)
         if metadata.description is not None:
             self.gguf.add_description(metadata.description)
-        if metadata.licence is not None:
-            self.gguf.add_licence(metadata.licence)
+        if metadata.license is not None:
+            self.gguf.add_licence(metadata.license)
         if metadata.source_url is not None:
             self.gguf.add_source_url(metadata.source_url)
         if metadata.source_hf_repo is not None:

@@ -943,7 +944,7 @@ class OutputFile:
     @staticmethod
     def write_vocab_only(
         fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
-        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata = None,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata | None = None,
     ) -> None:
         check_vocab_size(params, vocab, pad_vocab=pad_vocab)

@@ -977,7 +978,7 @@ class OutputFile:
         fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
         concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
         pad_vocab: bool = False,
-        metadata: Metadata = None,
+        metadata: Metadata | None = None,
     ) -> None:
         check_vocab_size(params, vocab, pad_vocab=pad_vocab)

@@ -1396,6 +1397,8 @@ def main(args_in: list[str] | None = None) -> None:
     if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
         vocab = model_plus.vocab
+    assert params is not None
     logger.info(f"Vocab info: {vocab}")
     logger.info(f"Special vocab info: {special_vocab}")
     model = model_plus.model


@@ -0,0 +1,51 @@
# Migration notice for binary filenames
> [!IMPORTANT]
> [2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)

This migration was important, but it is a breaking change that may not always be immediately obvious to users.
Please update all scripts and workflows to use the new binary names; a sketch of a bulk rename is shown after the table below.
| Old Filename | New Filename |
| ---- | ---- |
| main | llama-cli |
| server | llama-server |
| llama-bench | llama-bench |
| embedding | llama-embedding |
| finetune | llama-finetune |
| quantize | llama-quantize |
| tokenize | llama-tokenize |
| export-lora | llama-export-lora |
| libllava.a | libllava.a |
| baby-llama | llama-baby-llama |
| batched | llama-batched |
| batched-bench | llama-batched-bench |
| benchmark-matmult | llama-benchmark-matmult |
| convert-llama2c-to-ggml | llama-convert-llama2c-to-ggml |
| eval-callback | llama-eval-callback |
| gbnf-validator | llama-gbnf-validator |
| gguf | llama-gguf |
| gguf-split | llama-gguf-split |
| gritlm | llama-gritlm |
| imatrix | llama-imatrix |
| infill | llama-infill |
| llava-cli | llama-llava-cli |
| lookahead | llama-lookahead |
| lookup | llama-lookup |
| lookup-create | llama-lookup-create |
| lookup-merge | llama-lookup-merge |
| lookup-stats | llama-lookup-stats |
| parallel | llama-parallel |
| passkey | llama-passkey |
| perplexity | llama-perplexity |
| q8dot | llama-q8dot |
| quantize-stats | llama-quantize-stats |
| retrieval | llama-retrieval |
| save-load-state | llama-save-load-state |
| simple | llama-simple |
| speculative | llama-speculative |
| train-text-from-scratch | llama-train-text-from-scratch |
| vdot | llama-vdot |
| tests/test-c.o | tests/test-c.o |
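As mentioned above, a minimal sketch of bulk-updating an existing shell script to the new names (assuming GNU sed and that the script invokes the binaries by their bare old names):

```bash
# rewrite a few old binary names to their llama- prefixed replacements, in place
sed -i -e 's/\bmain\b/llama-cli/g' \
       -e 's/\bserver\b/llama-server/g' \
       -e 's/\bquantize\b/llama-quantize/g' my-llama-script.sh
```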


@@ -0,0 +1,35 @@
// Warns users that this filename was deprecated, and provides a link for more information.

#include <cstdio>
#include <string>
#include <unordered_map>

// Main
int main(int argc, char** argv) {
    std::string filename = "main";
    if (argc >= 1) {
        filename = argv[0];
    }

    // Get only the program name from the full path
    auto pos = filename.find_last_of('/');
    if (pos != std::string::npos) {
        filename = filename.substr(pos+1);
    }

    // Append "llama-" to the beginning of filename to get the replacement filename
    auto replacement_filename = "llama-" + filename;

    // The exception is if the filename is "main", then our replacement filename is "llama-cli"
    if (filename == "main") {
        replacement_filename = "llama-cli";
    }

    fprintf(stdout, "\n");
    fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
    fprintf(stdout, " Please use '%s' instead.\n", replacement_filename.c_str());
    fprintf(stdout, " See https://github.com/ggerganov/llama.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n");
    fprintf(stdout, "\n");

    return EXIT_FAILURE;
}


@@ -87,4 +87,4 @@ The LORA rank can be configured for each model tensor type separately with these
 The LORA rank of 'norm' tensors should always be 1.
-To see all available options use `finetune --help`.
+To see all available options use `llama-finetune --help`.


@@ -74,7 +74,7 @@ class Tensor:
             if len(self.ne) == 0:
                 self.nbytes = 0
             else:
-                self.nbytes = int(np.product(self.ne)) * 4
+                self.nbytes = int(np.prod(self.ne)) * 4
         else:
             raise ValueError(f"Unhandled data type '{self.dtype}'")


@@ -8,7 +8,7 @@ if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
 if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
 # MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
-MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing.
+MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "llama-cli --lora" with GPU inferencing.
 while getopts "dg" opt; do
   case $opt in


@@ -0,0 +1,15 @@
set(TARGET llama-gguf-hash)
add_executable(${TARGET} gguf-hash.cpp)
install(TARGETS ${TARGET} RUNTIME)
# clibs dependencies
include_directories(deps/)
add_library(xxhash OBJECT deps/xxhash/xxhash.c deps/xxhash/xxhash.h)
target_link_libraries(${TARGET} PRIVATE xxhash)
add_library(sha1 OBJECT deps/sha1/sha1.c deps/sha1/sha1.h)
target_link_libraries(${TARGET} PRIVATE sha1)
add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h)
target_link_libraries(${TARGET} PRIVATE sha256)
target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)


@@ -0,0 +1,206 @@
# llama-gguf-hash
CLI to hash GGUF files to detect differences on a per-model and per-tensor level.
**Command line options:**
- `--help`: display help message
- `--xxh64`: use xxhash 64-bit hash mode (default)
- `--sha1`: use sha1
- `--uuid`: generate a UUIDv5 ID (an ID, not a hash)
- `--sha256`: use sha256
- `--all`: use all hash types
- `--no-layer`: exclude the per-layer hashes
- `-c`, `--check <manifest>`: verify against a manifest
## About
While most POSIX systems already have hash checking programs like sha256sum, they
are designed to check entire files. This is not ideal for our purposes if we want
to check the tensor data for consistency even when the metadata content of the
gguf KV store has been updated.

This program is designed to hash a gguf tensor payload on a 'per tensor layer'
basis in addition to an 'entire tensor model' hash. The intent is that the
whole-model hash can be checked first and, if any inconsistency is detected,
the per tensor hashes can then be used to narrow down the specific tensor layer
that is inconsistent.
For Maintainers:
- Detection of tensor inconsistency during development and automated tests
- This is served by xxh64 which is fast
- This is also served by having per tensor layer to assist in narrowing down
the location of the faulty tensor layer
- This is also served by sha1 which is much slower but more widely supported
For Model Creators:
- Optional consistent UUID generation based on model tensor content
- This is served by UUIDv5 which is useful for databases keys
- llama.cpp UUIDv5 Namespace: `ef001206-dadc-5f6d-a15f-3359e577d4e5`
- Made via UUIDv5 URL namespace of `en.wikipedia.org/wiki/Llama.cpp`
For Model Users:
- Assurance of tensor layer integrity even if metadata was updated
- This is served by sha256 which is still considered very secure as of 2024
### Design Note
- The default behavior of this program, if no arguments are provided, is to hash
  using xxhash's xxh64 mode, because it is very fast and is primarily targeted
  towards maintainers who may want to use this in automated tests.
- xxhash also supports xxh32 and xxh128 for 32-bit and 128-bit hashes respectively;
  however, we picked the 64-bit xxhash as most computers are 64-bit as of 2024 and thus
  have a better affinity for calculating a hash that is 64 bits in size.
## Compile Example
```bash
cmake -B build -DCMAKE_BUILD_TYPE=Debug -DLLAMA_FATAL_WARNINGS=ON
make -C build clean
make -C build llama-gguf-hash VERBOSE=1
./build/bin/llama-gguf-hash test.gguf
./build/bin/llama-gguf-hash --xxh64 test.gguf
./build/bin/llama-gguf-hash --sha1 test.gguf
./build/bin/llama-gguf-hash --uuid test.gguf
./build/bin/llama-gguf-hash --sha256 test.gguf
```
## Generation and Verification Example
To generate a manifest we may use this command:
```bash
./llama-gguf-hash --all test.gguf > test.gguf.manifest
```
This would generate a manifest like the one below, which contains multiple hash types and per-tensor-layer hashes as well
(this excludes UUID, as that is an ID rather than a hash):
```bash
xxh64 f66e9cd66a4396a0 test.gguf:tensor_0
sha1 59f79ecefd8125a996fdf419239051a7e99e5f20 test.gguf:tensor_0
sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0
xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1
sha1 4765f592eacf096df4628ba59476af94d767080a test.gguf:tensor_1
sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1
xxh64 a0af5d700049693b test.gguf:tensor_2
sha1 25cbfbad4513cc348e2c95ebdee69d6ff2fd8753 test.gguf:tensor_2
sha256 947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 test.gguf:tensor_2
xxh64 e83fddf559d7b6a6 test.gguf:tensor_3
sha1 a9cba73e2d90f2ee3dae2548caa42bef3fe6a96c test.gguf:tensor_3
sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3
xxh64 1257733306b7992d test.gguf:tensor_4
sha1 d7bc61db93bb685ce9d598da89717c66729b7543 test.gguf:tensor_4
sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4
xxh64 d238d16ba4711e58 test.gguf:tensor_5
sha1 0706566c198fe1072f37e0a5135b4b5f23654c52 test.gguf:tensor_5
sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5
xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6
sha1 73922a0727226a409049f6fc3172a52219ca6f00 test.gguf:tensor_6
sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6
xxh64 c22021c29854f093 test.gguf:tensor_7
sha1 efc39cece6a951188fc41e354c73bbfe6813d447 test.gguf:tensor_7
sha256 4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7
xxh64 936df61f5d64261f test.gguf:tensor_8
sha1 c2490296d789a4f34398a337fed8377d943d9f06 test.gguf:tensor_8
sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8
xxh64 93fd20c64421c081 test.gguf:tensor_9
sha1 7047ce1e78437a6884337a3751c7ee0421918a65 test.gguf:tensor_9
sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9
xxh64 5a54d3aad816f302 test.gguf
sha1 d15be52c4ff213e823cb6dd13af7ee2f978e7042 test.gguf
sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf
```
We can then use the normal check command, which by default verifies against the highest-security-strength hash present in the manifest:
```bash
$ ./llama-gguf-hash --check test.gguf.manifest test.gguf
manifest test.gguf.manifest sha256 sha1 xxh64
sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0 - Ok
sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1 - Ok
sha256 947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 test.gguf:tensor_2 - Ok
sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3 - Ok
sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4 - Ok
sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5 - Ok
sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6 - Ok
sha256 4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7 - Ok
sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8 - Ok
sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9 - Ok
sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf - Ok
Verification results for test.gguf.manifest - Success
```
Or we may explicitly ask for a faster hash like:
```bash
$ ./llama-gguf-hash --check test.gguf.manifest --xxh64 test.gguf
manifest test.gguf.manifest sha256 sha1 xxh64
xxh64 f66e9cd66a4396a0 test.gguf:tensor_0 - Ok
xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1 - Ok
xxh64 a0af5d700049693b test.gguf:tensor_2 - Ok
xxh64 e83fddf559d7b6a6 test.gguf:tensor_3 - Ok
xxh64 1257733306b7992d test.gguf:tensor_4 - Ok
xxh64 d238d16ba4711e58 test.gguf:tensor_5 - Ok
xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6 - Ok
xxh64 c22021c29854f093 test.gguf:tensor_7 - Ok
xxh64 936df61f5d64261f test.gguf:tensor_8 - Ok
xxh64 93fd20c64421c081 test.gguf:tensor_9 - Ok
xxh64 5a54d3aad816f302 test.gguf - Ok
Verification results for test.gguf.manifest - Success
```
Or maybe we just want to check that all of the hashes are valid:
```bash
$./llama-gguf-hash --check test.gguf.manifest --all test.gguf.manifest
manifest test.gguf.manifest sha256 sha1 xxh64
xxh64 f66e9cd66a4396a0 test.gguf:tensor_0 - Ok
sha1 59f79ecefd8125a996fdf419239051a7e99e5f20 test.gguf:tensor_0 - Ok
sha256 c0510d38fa060c46265e0160a85c7243096b01dd31c2f355bdbb5516b20de1bd test.gguf:tensor_0 - Ok
xxh64 7d3a1f9ac04d0537 test.gguf:tensor_1 - Ok
sha1 4765f592eacf096df4628ba59476af94d767080a test.gguf:tensor_1 - Ok
sha256 8514cbcc73692a2c56bd7a33a022edd5ff819614bd23b19915d7224387f397a7 test.gguf:tensor_1 - Ok
xxh64 a0af5d700049693b test.gguf:tensor_2 - Ok
sha1 25cbfbad4513cc348e2c95ebdee69d6ff2fd8753 test.gguf:tensor_2 - Ok
sha256 947e6b36e20f2cc95e1d2ce1c1669d813d574657ac6b5ac5196158d454d35180 test.gguf:tensor_2 - Ok
xxh64 e83fddf559d7b6a6 test.gguf:tensor_3 - Ok
sha1 a9cba73e2d90f2ee3dae2548caa42bef3fe6a96c test.gguf:tensor_3 - Ok
sha256 423b044e016d8ac73c39f23f60bf01bedef5ecb03c0230accd824c91fe86f1a1 test.gguf:tensor_3 - Ok
xxh64 1257733306b7992d test.gguf:tensor_4 - Ok
sha1 d7bc61db93bb685ce9d598da89717c66729b7543 test.gguf:tensor_4 - Ok
sha256 79737cb3912d4201384cf7f16a1a37ff7823f23ea796cb205b6ca361ab9e3ebf test.gguf:tensor_4 - Ok
xxh64 d238d16ba4711e58 test.gguf:tensor_5 - Ok
sha1 0706566c198fe1072f37e0a5135b4b5f23654c52 test.gguf:tensor_5 - Ok
sha256 60949be8298eced0ecdde64487643d018407bd261691e061d9e9c3dbc9fd358b test.gguf:tensor_5 - Ok
xxh64 3fbc3b65ab8c7f39 test.gguf:tensor_6 - Ok
sha1 73922a0727226a409049f6fc3172a52219ca6f00 test.gguf:tensor_6 - Ok
sha256 574f4c46ff384a3b9a225eb955d2a871847a2e8b3fa59387a8252832e92ef7b0 test.gguf:tensor_6 - Ok
xxh64 c22021c29854f093 test.gguf:tensor_7 - Ok
sha1 efc39cece6a951188fc41e354c73bbfe6813d447 test.gguf:tensor_7 - Ok
sha256 4c0410cd3c500f078ae5b21e8dc9eb79e29112713b2ab58a882f82a3868d4d75 test.gguf:tensor_7 - Ok
xxh64 936df61f5d64261f test.gguf:tensor_8 - Ok
sha1 c2490296d789a4f34398a337fed8377d943d9f06 test.gguf:tensor_8 - Ok
sha256 c4401313feeba0261275c3b25bd2d8fe40ce04e0f440c2980ed0e9674c30ff01 test.gguf:tensor_8 - Ok
xxh64 93fd20c64421c081 test.gguf:tensor_9 - Ok
sha1 7047ce1e78437a6884337a3751c7ee0421918a65 test.gguf:tensor_9 - Ok
sha256 23d57cf0d7a6e90b0b3616b41300e0cd354781e812add854a5f95aa55f2bc514 test.gguf:tensor_9 - Ok
xxh64 5a54d3aad816f302 test.gguf - Ok
sha1 d15be52c4ff213e823cb6dd13af7ee2f978e7042 test.gguf - Ok
sha256 7dd641b32f59b60dbd4b5420c4b0f6321ccf48f58f6ae201a3dbc4a58a27c6e4 test.gguf - Ok
Verification results for test.gguf.manifest - Success
```
## Crypto/Hash Libraries Used
These micro C library dependencies were installed via the [clib C package manager](https://github.com/clibs):
- https://github.com/mofosyne/xxHash (From: https://github.com/Cyan4973/xxHash)
- https://github.com/clibs/sha1/
- https://github.com/jb55/sha256.c


@@ -0,0 +1,13 @@
{
"name": "rotate-bits",
"version": "0.1.1",
"repo": "jb55/rotate-bits.h",
"description": "rotate bits",
"keywords": ["rotl", "rotr"],
"src": ["rotate-bits.h"],
"license": "Public Domain",
"development": {
"thlorenz/tap.c": "*"
}
}

View file

@ -0,0 +1,46 @@
#ifndef __ROTATE_DEFS_H
#define __ROTATE_DEFS_H
#ifdef _MSC_VER
#include <stdlib.h>
#define ROTL32(v, n) _rotl((v), (n))
#define ROTL64(v, n) _rotl64((v), (n))
#define ROTR32(v, n) _rotr((v), (n))
#define ROTR64(v, n) _rotr64((v), (n))
#else
#include <stdint.h>
#define U8V(v) ((uint8_t)(v) & 0xFFU)
#define U16V(v) ((uint16_t)(v) & 0xFFFFU)
#define U32V(v) ((uint32_t)(v) & 0xFFFFFFFFU)
#define U64V(v) ((uint64_t)(v) & 0xFFFFFFFFFFFFFFFFU)
#define ROTL32(v, n) \
(U32V((uint32_t)(v) << (n)) | ((uint32_t)(v) >> (32 - (n))))
// tests fail if we don't have this cast...
#define ROTL64(v, n) \
(U64V((uint64_t)(v) << (n)) | ((uint64_t)(v) >> (64 - (n))))
#define ROTR32(v, n) ROTL32(v, 32 - (n))
#define ROTR64(v, n) ROTL64(v, 64 - (n))
#endif
#define ROTL8(v, n) \
(U8V((uint8_t)(v) << (n)) | ((uint8_t)(v) >> (8 - (n))))
#define ROTL16(v, n) \
(U16V((uint16_t)(v) << (n)) | ((uint16_t)(v) >> (16 - (n))))
#define ROTR8(v, n) ROTL8(v, 8 - (n))
#define ROTR16(v, n) ROTL16(v, 16 - (n))
#endif

View file

@ -0,0 +1,9 @@
{
"name": "sha1",
"version": "0.0.1",
"repo": "clibs/sha1",
"description": "sha1 hash algorithm",
"keywords": ["sha1", "hash"],
"license": "public domain",
"src": ["sha1.c", "sha1.h"]
}

View file

@ -0,0 +1,295 @@
/*
SHA-1 in C
By Steve Reid <steve@edmweb.com>
100% Public Domain
Test Vectors (from FIPS PUB 180-1)
"abc"
A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D
"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1
A million repetitions of "a"
34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F
*/
/* #define LITTLE_ENDIAN * This should be #define'd already, if true. */
/* #define SHA1HANDSOFF * Copies data before messing with it. */
#define SHA1HANDSOFF
#include <stdio.h>
#include <string.h>
/* for uint32_t */
#include <stdint.h>
#include "sha1.h"
#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
/* blk0() and blk() perform the initial expand. */
/* I got the idea of expanding during the round function from SSLeay */
#if BYTE_ORDER == LITTLE_ENDIAN
#define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \
|(rol(block->l[i],8)&0x00FF00FF))
#elif BYTE_ORDER == BIG_ENDIAN
#define blk0(i) block->l[i]
#else
#error "Endianness not defined!"
#endif
#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \
^block->l[(i+2)&15]^block->l[i&15],1))
/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
#define R0(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30);
#define R1(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30);
#define R2(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30);
#define R3(v,w,x,y,z,i) z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30);
#define R4(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30);
/* Hash a single 512-bit block. This is the core of the algorithm. */
void SHA1Transform(
uint32_t state[5],
const unsigned char buffer[64]
)
{
uint32_t a, b, c, d, e;
typedef union
{
unsigned char c[64];
uint32_t l[16];
} CHAR64LONG16;
#ifdef SHA1HANDSOFF
CHAR64LONG16 block[1]; /* use array to appear as a pointer */
memcpy(block, buffer, 64);
#else
/* The following had better never be used because it causes the
* pointer-to-const buffer to be cast into a pointer to non-const.
* And the result is written through. I threw a "const" in, hoping
* this will cause a diagnostic.
*/
CHAR64LONG16 *block = (const CHAR64LONG16 *) buffer;
#endif
/* Copy context->state[] to working vars */
a = state[0];
b = state[1];
c = state[2];
d = state[3];
e = state[4];
/* 4 rounds of 20 operations each. Loop unrolled. */
R0(a, b, c, d, e, 0);
R0(e, a, b, c, d, 1);
R0(d, e, a, b, c, 2);
R0(c, d, e, a, b, 3);
R0(b, c, d, e, a, 4);
R0(a, b, c, d, e, 5);
R0(e, a, b, c, d, 6);
R0(d, e, a, b, c, 7);
R0(c, d, e, a, b, 8);
R0(b, c, d, e, a, 9);
R0(a, b, c, d, e, 10);
R0(e, a, b, c, d, 11);
R0(d, e, a, b, c, 12);
R0(c, d, e, a, b, 13);
R0(b, c, d, e, a, 14);
R0(a, b, c, d, e, 15);
R1(e, a, b, c, d, 16);
R1(d, e, a, b, c, 17);
R1(c, d, e, a, b, 18);
R1(b, c, d, e, a, 19);
R2(a, b, c, d, e, 20);
R2(e, a, b, c, d, 21);
R2(d, e, a, b, c, 22);
R2(c, d, e, a, b, 23);
R2(b, c, d, e, a, 24);
R2(a, b, c, d, e, 25);
R2(e, a, b, c, d, 26);
R2(d, e, a, b, c, 27);
R2(c, d, e, a, b, 28);
R2(b, c, d, e, a, 29);
R2(a, b, c, d, e, 30);
R2(e, a, b, c, d, 31);
R2(d, e, a, b, c, 32);
R2(c, d, e, a, b, 33);
R2(b, c, d, e, a, 34);
R2(a, b, c, d, e, 35);
R2(e, a, b, c, d, 36);
R2(d, e, a, b, c, 37);
R2(c, d, e, a, b, 38);
R2(b, c, d, e, a, 39);
R3(a, b, c, d, e, 40);
R3(e, a, b, c, d, 41);
R3(d, e, a, b, c, 42);
R3(c, d, e, a, b, 43);
R3(b, c, d, e, a, 44);
R3(a, b, c, d, e, 45);
R3(e, a, b, c, d, 46);
R3(d, e, a, b, c, 47);
R3(c, d, e, a, b, 48);
R3(b, c, d, e, a, 49);
R3(a, b, c, d, e, 50);
R3(e, a, b, c, d, 51);
R3(d, e, a, b, c, 52);
R3(c, d, e, a, b, 53);
R3(b, c, d, e, a, 54);
R3(a, b, c, d, e, 55);
R3(e, a, b, c, d, 56);
R3(d, e, a, b, c, 57);
R3(c, d, e, a, b, 58);
R3(b, c, d, e, a, 59);
R4(a, b, c, d, e, 60);
R4(e, a, b, c, d, 61);
R4(d, e, a, b, c, 62);
R4(c, d, e, a, b, 63);
R4(b, c, d, e, a, 64);
R4(a, b, c, d, e, 65);
R4(e, a, b, c, d, 66);
R4(d, e, a, b, c, 67);
R4(c, d, e, a, b, 68);
R4(b, c, d, e, a, 69);
R4(a, b, c, d, e, 70);
R4(e, a, b, c, d, 71);
R4(d, e, a, b, c, 72);
R4(c, d, e, a, b, 73);
R4(b, c, d, e, a, 74);
R4(a, b, c, d, e, 75);
R4(e, a, b, c, d, 76);
R4(d, e, a, b, c, 77);
R4(c, d, e, a, b, 78);
R4(b, c, d, e, a, 79);
/* Add the working vars back into context.state[] */
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
/* Wipe variables */
a = b = c = d = e = 0;
#ifdef SHA1HANDSOFF
memset(block, '\0', sizeof(block));
#endif
}
/* SHA1Init - Initialize new context */
void SHA1Init(
SHA1_CTX * context
)
{
/* SHA1 initialization constants */
context->state[0] = 0x67452301;
context->state[1] = 0xEFCDAB89;
context->state[2] = 0x98BADCFE;
context->state[3] = 0x10325476;
context->state[4] = 0xC3D2E1F0;
context->count[0] = context->count[1] = 0;
}
/* Run your data through this. */
void SHA1Update(
SHA1_CTX * context,
const unsigned char *data,
uint32_t len
)
{
uint32_t i;
uint32_t j;
j = context->count[0];
if ((context->count[0] += len << 3) < j)
context->count[1]++;
context->count[1] += (len >> 29);
j = (j >> 3) & 63;
if ((j + len) > 63)
{
memcpy(&context->buffer[j], data, (i = 64 - j));
SHA1Transform(context->state, context->buffer);
for (; i + 63 < len; i += 64)
{
SHA1Transform(context->state, &data[i]);
}
j = 0;
}
else
i = 0;
memcpy(&context->buffer[j], &data[i], len - i);
}
/* Add padding and return the message digest. */
void SHA1Final(
unsigned char digest[20],
SHA1_CTX * context
)
{
unsigned i;
unsigned char finalcount[8];
unsigned char c;
#if 0 /* untested "improvement" by DHR */
/* Convert context->count to a sequence of bytes
* in finalcount. Second element first, but
* big-endian order within element.
* But we do it all backwards.
*/
unsigned char *fcp = &finalcount[8];
for (i = 0; i < 2; i++)
{
uint32_t t = context->count[i];
int j;
for (j = 0; j < 4; t >>= 8, j++)
*--fcp = (unsigned char) t;
}
#else
for (i = 0; i < 8; i++)
{
finalcount[i] = (unsigned char) ((context->count[(i >= 4 ? 0 : 1)] >> ((3 - (i & 3)) * 8)) & 255); /* Endian independent */
}
#endif
c = 0200;
SHA1Update(context, &c, 1);
while ((context->count[0] & 504) != 448)
{
c = 0000;
SHA1Update(context, &c, 1);
}
SHA1Update(context, finalcount, 8); /* Should cause a SHA1Transform() */
for (i = 0; i < 20; i++)
{
digest[i] = (unsigned char)
((context->state[i >> 2] >> ((3 - (i & 3)) * 8)) & 255);
}
/* Wipe variables */
memset(context, '\0', sizeof(*context));
memset(&finalcount, '\0', sizeof(finalcount));
}
void SHA1(
char *hash_out,
const char *str,
uint32_t len)
{
SHA1_CTX ctx;
unsigned int ii;
SHA1Init(&ctx);
for (ii=0; ii<len; ii+=1)
SHA1Update(&ctx, (const unsigned char*)str + ii, 1);
SHA1Final((unsigned char *)hash_out, &ctx);
}

View file

@ -0,0 +1,52 @@
#ifndef SHA1_H
#define SHA1_H
/*
SHA-1 in C
By Steve Reid <steve@edmweb.com>
100% Public Domain
*/
#include "stdint.h"
#if defined(__cplusplus)
extern "C" {
#endif
typedef struct
{
uint32_t state[5];
uint32_t count[2];
unsigned char buffer[64];
} SHA1_CTX;
void SHA1Transform(
uint32_t state[5],
const unsigned char buffer[64]
);
void SHA1Init(
SHA1_CTX * context
);
void SHA1Update(
SHA1_CTX * context,
const unsigned char *data,
uint32_t len
);
void SHA1Final(
unsigned char digest[20],
SHA1_CTX * context
);
void SHA1(
char *hash_out,
const char *str,
uint32_t len);
#if defined(__cplusplus)
}
#endif
#endif /* SHA1_H */

View file

@ -0,0 +1,15 @@
{
"name": "sha256",
"version": "0.0.2",
"repo": "jb55/sha256.c",
"description": "sha256 in c",
"keywords": ["sha256", "sha2"],
"src": ["sha256.c", "sha256.h"],
"dependencies": {
"jb55/rotate-bits.h": "0.1.1"
},
"development": {
"thlorenz/tap.c": "*"
}
}

View file

@ -0,0 +1,221 @@
/* Crypto/Sha256.c -- SHA-256 Hash
2010-06-11 : Igor Pavlov : Public domain
This code is based on public domain code from Wei Dai's Crypto++ library. */
#include "rotate-bits/rotate-bits.h"
#include "sha256.h"
/* define it for speed optimization */
#define _SHA256_UNROLL
#define _SHA256_UNROLL2
void
sha256_init(sha256_t *p)
{
p->state[0] = 0x6a09e667;
p->state[1] = 0xbb67ae85;
p->state[2] = 0x3c6ef372;
p->state[3] = 0xa54ff53a;
p->state[4] = 0x510e527f;
p->state[5] = 0x9b05688c;
p->state[6] = 0x1f83d9ab;
p->state[7] = 0x5be0cd19;
p->count = 0;
}
#define S0(x) (ROTR32(x, 2) ^ ROTR32(x,13) ^ ROTR32(x, 22))
#define S1(x) (ROTR32(x, 6) ^ ROTR32(x,11) ^ ROTR32(x, 25))
#define s0(x) (ROTR32(x, 7) ^ ROTR32(x,18) ^ (x >> 3))
#define s1(x) (ROTR32(x,17) ^ ROTR32(x,19) ^ (x >> 10))
#define blk0(i) (W[i] = data[i])
#define blk2(i) (W[i&15] += s1(W[(i-2)&15]) + W[(i-7)&15] + s0(W[(i-15)&15]))
#define Ch(x,y,z) (z^(x&(y^z)))
#define Maj(x,y,z) ((x&y)|(z&(x|y)))
#define a(i) T[(0-(i))&7]
#define b(i) T[(1-(i))&7]
#define c(i) T[(2-(i))&7]
#define d(i) T[(3-(i))&7]
#define e(i) T[(4-(i))&7]
#define f(i) T[(5-(i))&7]
#define g(i) T[(6-(i))&7]
#define h(i) T[(7-(i))&7]
#ifdef _SHA256_UNROLL2
#define R(a,b,c,d,e,f,g,h, i) h += S1(e) + Ch(e,f,g) + K[i+j] + (j?blk2(i):blk0(i));\
d += h; h += S0(a) + Maj(a, b, c)
#define RX_8(i) \
R(a,b,c,d,e,f,g,h, i); \
R(h,a,b,c,d,e,f,g, (i+1)); \
R(g,h,a,b,c,d,e,f, (i+2)); \
R(f,g,h,a,b,c,d,e, (i+3)); \
R(e,f,g,h,a,b,c,d, (i+4)); \
R(d,e,f,g,h,a,b,c, (i+5)); \
R(c,d,e,f,g,h,a,b, (i+6)); \
R(b,c,d,e,f,g,h,a, (i+7))
#else
#define R(i) h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[i+j] + (j?blk2(i):blk0(i));\
d(i) += h(i); h(i) += S0(a(i)) + Maj(a(i), b(i), c(i))
#ifdef _SHA256_UNROLL
#define RX_8(i) R(i+0); R(i+1); R(i+2); R(i+3); R(i+4); R(i+5); R(i+6); R(i+7);
#endif
#endif
static const uint32_t K[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
static void
sha256_transform(uint32_t *state, const uint32_t *data)
{
uint32_t W[16] = {0};
unsigned j;
#ifdef _SHA256_UNROLL2
uint32_t a,b,c,d,e,f,g,h;
a = state[0];
b = state[1];
c = state[2];
d = state[3];
e = state[4];
f = state[5];
g = state[6];
h = state[7];
#else
uint32_t T[8];
for (j = 0; j < 8; j++)
T[j] = state[j];
#endif
for (j = 0; j < 64; j += 16)
{
#if defined(_SHA256_UNROLL) || defined(_SHA256_UNROLL2)
RX_8(0); RX_8(8);
#else
unsigned i;
for (i = 0; i < 16; i++) { R(i); }
#endif
}
#ifdef _SHA256_UNROLL2
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
state[5] += f;
state[6] += g;
state[7] += h;
#else
for (j = 0; j < 8; j++)
state[j] += T[j];
#endif
/* Wipe variables */
/* memset(W, 0, sizeof(W)); */
/* memset(T, 0, sizeof(T)); */
}
#undef S0
#undef S1
#undef s0
#undef s1
static void
sha256_write_byte_block(sha256_t *p)
{
uint32_t data32[16];
unsigned i;
for (i = 0; i < 16; i++)
data32[i] =
((uint32_t)(p->buffer[i * 4 ]) << 24) +
((uint32_t)(p->buffer[i * 4 + 1]) << 16) +
((uint32_t)(p->buffer[i * 4 + 2]) << 8) +
((uint32_t)(p->buffer[i * 4 + 3]));
sha256_transform(p->state, data32);
}
void
sha256_hash(unsigned char *buf, const unsigned char *data, size_t size)
{
sha256_t hash;
sha256_init(&hash);
sha256_update(&hash, data, size);
sha256_final(&hash, buf);
}
void
sha256_update(sha256_t *p, const unsigned char *data, size_t size)
{
uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
while (size > 0)
{
p->buffer[curBufferPos++] = *data++;
p->count++;
size--;
if (curBufferPos == 64)
{
curBufferPos = 0;
sha256_write_byte_block(p);
}
}
}
void
sha256_final(sha256_t *p, unsigned char *digest)
{
uint64_t lenInBits = (p->count << 3);
uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
unsigned i;
p->buffer[curBufferPos++] = 0x80;
while (curBufferPos != (64 - 8))
{
curBufferPos &= 0x3F;
if (curBufferPos == 0)
sha256_write_byte_block(p);
p->buffer[curBufferPos++] = 0;
}
for (i = 0; i < 8; i++)
{
p->buffer[curBufferPos++] = (unsigned char)(lenInBits >> 56);
lenInBits <<= 8;
}
sha256_write_byte_block(p);
for (i = 0; i < 8; i++)
{
*digest++ = (unsigned char)(p->state[i] >> 24);
*digest++ = (unsigned char)(p->state[i] >> 16);
*digest++ = (unsigned char)(p->state[i] >> 8);
*digest++ = (unsigned char)(p->state[i]);
}
sha256_init(p);
}

View file

@ -0,0 +1,24 @@
/* Sha256.h -- SHA-256 Hash
2010-06-11 : Igor Pavlov : Public domain */
#ifndef __CRYPTO_SHA256_H
#define __CRYPTO_SHA256_H
#include <stdlib.h>
#include <stdint.h>
#define SHA256_DIGEST_SIZE 32
typedef struct sha256_t
{
uint32_t state[8];
uint64_t count;
unsigned char buffer[64];
} sha256_t;
void sha256_init(sha256_t *p);
void sha256_update(sha256_t *p, const unsigned char *data, size_t size);
void sha256_final(sha256_t *p, unsigned char *digest);
void sha256_hash(unsigned char *buf, const unsigned char *data, size_t size);
#endif

View file

@ -0,0 +1,12 @@
{
"name": "xxhash",
"version": "0.8.2",
"repo": "mofosyne/xxhash",
"description": "Extremely fast non-cryptographic hash algorithm",
"keywords": ["xxhash", "hashing"],
"license": "BSD-2-Clause",
"src": [
"xxhash.c",
"xxhash.h"
]
}

View file

@ -0,0 +1,42 @@
/*
* xxHash - Extremely Fast Hash algorithm
* Copyright (C) 2012-2023 Yann Collet
*
* BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* You can contact the author at:
* - xxHash homepage: https://www.xxhash.com
* - xxHash source repository: https://github.com/Cyan4973/xxHash
*/
/*
* xxhash.c instantiates functions defined in xxhash.h
*/
#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */
#define XXH_IMPLEMENTATION /* access definitions */
#include "xxhash.h"

File diff suppressed because it is too large

View file

@ -0,0 +1,693 @@
#include "ggml.h"
#include <cstdlib> /* abort() */
#include <cstddef>
#include <cstdio>
#include <string>
#include <stdexcept>
#include <algorithm>
#include <cstring>
#include <sstream>
#include <fstream>
#ifdef __cplusplus
extern "C" {
#endif
#include "xxhash/xxhash.h"
#include "sha1/sha1.h"
#include "sha256/sha256.h"
#ifdef __cplusplus
}
#endif
// uuid.uuid5(uuid.NAMESPACE_URL, 'en.wikipedia.org/wiki/Llama.cpp')
#define UUID_NAMESPACE_LLAMA_CPP "ef001206-dadc-5f6d-a15f-3359e577d4e5"
#define UUID_NAMESPACE_LLAMA_CPP_HEX 0xef, 0x00, 0x12, 0x06, 0xda, 0xdc, 0x5f, 0x6d, 0xa1, 0x5f, 0x33, 0x59, 0xe5, 0x77, 0xd4, 0xe5
#define HASH_TYPE_SHA256_STR "sha256"
#define HASH_TYPE_SHA1_STR "sha1"
#define HASH_TYPE_XXH64_STR "xxh64"
#define HASH_TYPE_UUID_STR "uuid"
typedef enum {
HASH_EXIT_SUCCESS = 0, // All hashes have been generated or validated
HASH_EXIT_FAILURE = 1, // Generic Failure
HASH_EXIT_MISMATCH = 2, // Hash mismatched during validation
HASH_EXIT_MANIFEST_MISSING_ENTRY = 3, // Hash attempted validation but missing entry in manifest
HASH_EXIT_MANIFEST_UNKNOWN_HASH = 4, // Manifest is present, but we do not know any hash format within it
HASH_EXIT_MANIFEST_FILE_ERROR = 5 // Manifest is either missing or not a known format
} hash_exit_code_t;
typedef enum {
HASH_MANIFEST_NOT_FOUND,
HASH_MANIFEST_MISMATCH,
HASH_MANIFEST_OK,
} hash_manifest_result_t;
struct hash_params {
std::string input;
bool xxh64 = false;
bool sha1 = false;
bool sha256 = false;
bool uuid = false;
bool no_layer = false;
bool manifest_is_usable = false;
std::string manifest_file;
};
struct manifest_check_params {
bool xxh64 = false;
bool sha1 = false;
bool sha256 = false;
bool uuid = false;
};
static char const * hash_manifest_result_to_str(hash_manifest_result_t value) {
switch (value) {
case HASH_MANIFEST_NOT_FOUND: return "Not Found";
case HASH_MANIFEST_MISMATCH: return "Mismatch";
case HASH_MANIFEST_OK: return "Ok";
}
return "?";
}
static char const * hash_exit_code_to_str(hash_exit_code_t value) {
switch (value) {
case HASH_EXIT_SUCCESS: return "Success";
case HASH_EXIT_FAILURE: return "Failure";
case HASH_EXIT_MISMATCH: return "Mismatch";
case HASH_EXIT_MANIFEST_MISSING_ENTRY: return "Manifest Missing Entry";
case HASH_EXIT_MANIFEST_UNKNOWN_HASH: return "Manifest Unknown Hash";
case HASH_EXIT_MANIFEST_FILE_ERROR: return "Manifest File Error";
}
return "?";
}
static void hash_print_usage(const char * executable) {
const hash_params default_params;
printf("\n");
printf("usage: %s [options] GGUF_IN\n", executable);
printf("\n");
printf("Hash a GGUF file");
printf("\n");
printf("options:\n");
printf(" -h, --help show this help message and exit\n");
printf(" --xxh64 use xxh64 hash\n");
printf(" --sha1 use sha1 hash\n");
printf(" --sha256 use sha256 hash\n");
printf(" --all use all hash\n");
printf(" --no-layer exclude per layer hash\n");
printf(" --uuid generate UUIDv5 ID\n");
printf(" -c, --check <manifest> verify against a manifest\n");
printf("\n");
}
static void hash_params_parse_ex(int argc, const char ** argv, hash_params & params) {
std::string arg;
bool invalid_param = false;
const std::string arg_prefix = "--";
int arg_idx = 1;
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
arg = argv[arg_idx];
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
bool arg_found = false;
if (arg == "-h" || arg == "--help") {
hash_print_usage(argv[0]);
exit(0);
}
if (arg == "--xxh64") {
arg_found = true;
params.xxh64 = true;
}
if (arg == "--sha1") {
arg_found = true;
params.sha1 = true;
}
if (arg == "--uuid") {
arg_found = true;
params.uuid = true;
}
if (arg == "--sha256") {
arg_found = true;
params.sha256 = true;
}
if (arg == "--all") {
arg_found = true;
params.sha256 = true;
params.sha1 = true;
params.xxh64 = true;
}
if (arg == "--no-layer") {
arg_found = true;
params.no_layer = true;
}
if (arg == "-c" || arg == "--check") {
if (++arg_idx >= argc) {
invalid_param = true;
break;
}
arg_found = true;
params.manifest_file = argv[arg_idx];
}
if (!arg_found) {
throw std::invalid_argument("error: unknown argument: " + arg);
}
}
if (invalid_param) {
throw std::invalid_argument("error: invalid parameter for argument:" + arg);
}
if (argc - arg_idx < 1) {
throw std::invalid_argument("error: bad arguments");
}
params.input = argv[arg_idx++];
}
static bool hash_params_parse(int argc, const char ** argv, hash_params & params) {
bool result = true;
try {
hash_params_parse_ex(argc, argv, params);
}
catch (const std::invalid_argument & ex) {
fprintf(stderr, "%s\n", ex.what());
hash_print_usage(argv[0]);
exit(EXIT_FAILURE);
}
return result;
}
static bool manifest_type(const std::string & manifest_file, manifest_check_params & manifest_check) {
if (manifest_file.empty()) {
return false;
}
std::ifstream file(manifest_file);
if (!file.is_open()) {
return false;
}
std::string manifest_entry_line;
while (getline(file, manifest_entry_line)) {
// hash_type_str hash_str tensor_name
// e.g. 'xxh64 f66e9cd66a4396a0 test.gguf:tensor_0'
std::istringstream line_stream(manifest_entry_line);
std::string file_hash_type;
if (line_stream >> file_hash_type) {
if (file_hash_type == HASH_TYPE_SHA256_STR) {
manifest_check.sha256 = true;
} else if (file_hash_type == HASH_TYPE_SHA1_STR) {
manifest_check.sha1 = true;
} else if (file_hash_type == HASH_TYPE_XXH64_STR) {
manifest_check.xxh64 = true;
} else if (file_hash_type == HASH_TYPE_UUID_STR) {
manifest_check.uuid = true;
}
}
}
return true;
}
static hash_manifest_result_t manifest_verify(const std::string& manifest_file, const std::string& hash_type_str, const std::string& hash_str, const std::string& tensor_name) {
if (manifest_file.empty()) {
return HASH_MANIFEST_NOT_FOUND;
}
std::ifstream file(manifest_file);
if (!file.is_open()) {
return HASH_MANIFEST_NOT_FOUND;
}
std::string manifest_entry_line;
while (getline(file, manifest_entry_line)) {
std::istringstream line_stream(manifest_entry_line);
std::string file_hash_type;
std::string file_hash;
std::string file_tensor_name;
if (line_stream >> file_hash_type >> file_hash >> file_tensor_name) {
// Line parsed. Check hash validity
if (file_hash_type != hash_type_str) {
continue;
}
if (file_tensor_name != tensor_name) {
continue;
}
return (file_hash == hash_str) ? HASH_MANIFEST_OK : HASH_MANIFEST_MISMATCH;
}
}
return HASH_MANIFEST_NOT_FOUND;
}
static void generate_uuidv5(const unsigned char sha1_digest[20], unsigned char uuid[16]) {
// Ref: https://www.rfc-editor.org/rfc/rfc9562.html#section-5.5
// Assumes that digest was processed correctly with the expected namespace
for (int i = 0; i < 16; i++) {
uuid[i] = sha1_digest[i];
}
// Set bits corresponding to UUID ver 5
uuid[ 6] &= ~(0xF << 4);
uuid[ 6] |= (5 << 4);
// Set bits corresponding to UUID variant 0b10XX
uuid[ 8] &= ~(0xc << 4);
uuid[ 8] |= (0x8 << 4);
}
static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
const std::string & fname = hash_params.input;
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
};
// xxh64 init
XXH64_state_t* xxh64_model_hash_state = NULL;
if (hash_params.xxh64) {
xxh64_model_hash_state = XXH64_createState();
if (xxh64_model_hash_state==NULL) {
abort();
}
XXH64_hash_t const seed = 0;
if (XXH64_reset(xxh64_model_hash_state, seed) == XXH_ERROR) {
abort();
}
}
// sha1 init
SHA1_CTX sha1_model_hash_ctx;
if (hash_params.sha1) {
SHA1Init(&sha1_model_hash_ctx);
}
// sha256 init
sha256_t sha256_model_hash_ctx;
if (hash_params.sha256) {
sha256_init(&sha256_model_hash_ctx);
}
// sha1 for uuid init
SHA1_CTX sha1_for_uuid_ctx;
if (hash_params.uuid) {
unsigned char const uuidv5_namespace[] = {UUID_NAMESPACE_LLAMA_CPP_HEX};
SHA1Init(&sha1_for_uuid_ctx);
SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)uuidv5_namespace, sizeof(uuidv5_namespace));
}
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
const int n_tensors = gguf_get_n_tensors(ctx);
bool tensor_layer_in_manifest = false;
bool model_in_manifest = false;
bool tensor_layer_has_mismatch = false;
bool model_has_mismatch = false;
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
auto n_bytes = ggml_nbytes(cur);
auto *raw_data = cur->data;
const std::string tensor_layer_name = fname + ":" + name;
if (hash_params.xxh64) {
if (!hash_params.no_layer) {
// Per Layer Hash
XXH64_hash_t hash = XXH64(raw_data, n_bytes, 0);
char hex_result[17];
for (int offset = 0; offset < 8; offset++) {
unsigned int shift_bits_by = (8 * (8 - offset - 1));
sprintf( ( hex_result + (2*offset)), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
tensor_layer_in_manifest = true;
tensor_layer_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
tensor_layer_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name.c_str());
}
}
// Overall Model Hash
if (XXH64_update(xxh64_model_hash_state, raw_data, n_bytes) == XXH_ERROR) abort();
}
if (hash_params.sha1) {
if (!hash_params.no_layer) {
// Per Layer Hash
char result[21]; // sha1 outputs 20 bytes
SHA1( result, (const char *)raw_data, n_bytes);
char hex_result[41] = {0};
for (int offset = 0; offset < 20; offset++) {
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
tensor_layer_in_manifest = true;
tensor_layer_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
tensor_layer_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name.c_str());
}
}
// Overall Model Hash
SHA1Update( &sha1_model_hash_ctx, (unsigned char const *)raw_data, n_bytes);
}
if (hash_params.sha256) {
if (!hash_params.no_layer) {
// Per Layer Hash
unsigned char result[SHA256_DIGEST_SIZE]; // sha256 outputs 32 bytes
sha256_hash((unsigned char*) result, (const unsigned char *)raw_data, n_bytes);
char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
for (int offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
tensor_layer_in_manifest = true;
tensor_layer_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
tensor_layer_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name.c_str());
}
}
// Overall Model Hash
sha256_update( &sha256_model_hash_ctx, (unsigned char const *)raw_data, n_bytes);
}
if (hash_params.uuid) {
SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)raw_data, n_bytes);
}
}
if (hash_params.xxh64) {
XXH64_hash_t const hash = XXH64_digest(xxh64_model_hash_state);
char hex_result[17];
for (int offset = 0; offset < 8; offset++) {
unsigned int shift_bits_by = (8 * (8 - offset - 1));
sprintf( ( hex_result + (2*offset)), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_XXH64_STR, hex_result, fname);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
model_in_manifest = true;
model_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
model_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_XXH64_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_XXH64_STR, hex_result, fname.c_str());
}
}
if (hash_params.sha1) {
unsigned char result[21];
SHA1Final(result, &sha1_model_hash_ctx);
char hex_result[41];
for (int offset = 0; offset < 20; offset++) {
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA1_STR, hex_result, fname);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
model_in_manifest = true;
model_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
model_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA1_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_SHA1_STR, hex_result, fname.c_str());
}
}
if (hash_params.sha256) {
unsigned char result[SHA256_DIGEST_SIZE]; // sha256 outputs 32 bytes
sha256_final( &sha256_model_hash_ctx, result);
char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
for (int offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
sprintf( ( hex_result + (2*offset)), "%02x", result[offset]&0xff);
}
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, hex_result, fname);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
model_in_manifest = true;
model_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
model_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_SHA256_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_SHA256_STR, hex_result, fname.c_str());
}
}
if (hash_params.uuid) {
unsigned char result[21];
SHA1Final(result, &sha1_for_uuid_ctx);
unsigned char uuid[16];
generate_uuidv5(result, uuid);
char string_buffer[37] = {0};
sprintf(string_buffer, "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
uuid[0], uuid[1], uuid[2], uuid[3],
uuid[4], uuid[5], uuid[6], uuid[7],
uuid[8], uuid[9], uuid[10], uuid[11],
uuid[12], uuid[13], uuid[14], uuid[15]);
if (hash_params.manifest_is_usable) {
hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, string_buffer, fname);
switch (verify_result) {
case HASH_MANIFEST_NOT_FOUND:
break;
case HASH_MANIFEST_MISMATCH:
model_in_manifest = true;
model_has_mismatch = true;
break;
case HASH_MANIFEST_OK:
model_in_manifest = true;
break;
}
printf("%-8s %-s %s - %s\n", HASH_TYPE_UUID_STR, string_buffer, fname.c_str(), hash_manifest_result_to_str(verify_result));
} else {
printf("%-8s %-s %s\n", HASH_TYPE_UUID_STR, string_buffer, fname.c_str());
}
}
ggml_free(ctx_data);
gguf_free(ctx);
if (hash_params.manifest_is_usable) {
// In hash verification mode
if (!model_in_manifest) {
// model missing in manifest?
// Check tensor layer...
if (!tensor_layer_in_manifest) {
// Still missing? Maybe we are reading the wrong manifest.
return HASH_EXIT_MANIFEST_MISSING_ENTRY;
}
if (tensor_layer_has_mismatch) {
// Per tensor check found error
return HASH_EXIT_FAILURE;
}
// All per tensor layer checks passed? Sounds good enough.
return HASH_EXIT_SUCCESS;
}
// Overall model check passed, but let's check per layer just in case
// If missing, we don't care too much, as the overall model hash already checked out
if (tensor_layer_in_manifest && tensor_layer_has_mismatch) {
return HASH_EXIT_FAILURE;
}
if (model_has_mismatch) {
// model has failed hash somewhere in the model
return HASH_EXIT_FAILURE;
}
// All checks appear to be fine
return HASH_EXIT_SUCCESS;
}
// In hash generation mode
return HASH_EXIT_SUCCESS;
}
int main(int argc, const char ** argv) {
hash_params params;
manifest_check_params manifest_check;
hash_params_parse(argc, argv, params);
if (!params.manifest_file.empty()) {
if (!manifest_type(params.manifest_file, manifest_check)) {
printf("ERROR cannot open manifest %s", params.manifest_file.c_str());
return HASH_EXIT_MANIFEST_FILE_ERROR;
}
if (!manifest_check.sha256 && !manifest_check.sha1 && !manifest_check.xxh64 && !manifest_check.uuid) {
printf("ERROR manifest does not have any known hash format in %s", params.manifest_file.c_str());
return HASH_EXIT_MANIFEST_UNKNOWN_HASH;
}
printf("manifest %s", params.manifest_file.c_str());
if (manifest_check.sha256) {
printf(" sha256");
}
if (manifest_check.sha1) {
printf(" sha1");
}
if (manifest_check.xxh64) {
printf(" xxh64");
}
if (manifest_check.uuid) {
printf(" uuid");
}
printf("\n");
// Autoselect the highest security hash if manifest is provided but
// the user has not specifically defined the hash they care about
if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
// User has not selected a specific value, pick most secure hash
if (manifest_check.sha256) {
params.sha256 = true;
} else if (manifest_check.sha1) {
params.sha1 = true;
} else if (manifest_check.xxh64) {
params.xxh64 = true;
} else if (manifest_check.uuid) {
params.uuid = true;
}
}
params.manifest_is_usable = true;
}
// By default, if no switch argument is provided, assume xxh64
if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
params.xxh64 = true;
}
hash_exit_code_t exit_code = gguf_hash(params);
if (params.manifest_is_usable) {
printf("\nVerification results for %s - %s\n", params.manifest_file.c_str(), hash_exit_code_to_str(exit_code));
}
return exit_code;
}

View file

@ -205,21 +205,17 @@ int main(int argc, char ** argv) {
GGML_ASSERT(llama_add_eos_token(model) != 1); GGML_ASSERT(llama_add_eos_token(model) != 1);
LOG("add_bos: %d\n", add_bos); LOG("add_bos: %d\n", add_bos);
bool suff_rm_leading_spc = params.escape;
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
params.input_suffix.erase(0, 1);
suff_rm_leading_spc = false;
}
std::vector<llama_token> embd_inp; std::vector<llama_token> embd_inp;
std::vector<llama_token> embd_end; std::vector<llama_token> embd_end;
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false); std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false); std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
const int space_token = 29871;
if (suff_rm_leading_spc && inp_sfx[0] == space_token) { GGML_ASSERT(llama_token_prefix(model) >= 0);
inp_sfx.erase(inp_sfx.begin()); GGML_ASSERT(llama_token_suffix(model) >= 0);
}
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
embd_inp = params.spm_infill ? inp_sfx : inp_pfx; embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
embd_end = params.spm_infill ? inp_pfx : inp_sfx; embd_end = params.spm_infill ? inp_pfx : inp_sfx;
if (add_bos) { if (add_bos) {
@ -517,19 +513,14 @@ int main(int argc, char ** argv) {
string_process_escapes(params.input_prefix); string_process_escapes(params.input_prefix);
string_process_escapes(params.input_suffix); string_process_escapes(params.input_suffix);
} }
suff_rm_leading_spc = params.escape;
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
params.input_suffix.erase(0, 1);
suff_rm_leading_spc = false;
}
// tokenize new prefix and suffix // tokenize new prefix and suffix
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false); std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false); std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
inp_sfx.erase(inp_sfx.begin());
}
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
embd_inp = params.spm_infill ? inp_sfx : inp_pfx; embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
embd_end = params.spm_infill ? inp_pfx : inp_sfx; embd_end = params.spm_infill ? inp_pfx : inp_sfx;
if (add_bos) { if (add_bos) {

View file

@ -3,7 +3,7 @@
#! pip install pydantic #! pip install pydantic
#! python json_schema_pydantic_example.py #! python json_schema_pydantic_example.py
from pydantic import BaseModel, Extra, TypeAdapter from pydantic import BaseModel, Field, TypeAdapter
from annotated_types import MinLen from annotated_types import MinLen
from typing import Annotated, List, Optional from typing import Annotated, List, Optional
import json, requests import json, requests
@ -17,6 +17,9 @@ if True:
The response_model param takes a type (+ supports Pydantic) and behaves just as w/ Instructor (see below) The response_model param takes a type (+ supports Pydantic) and behaves just as w/ Instructor (see below)
''' '''
response_format = None
type_adapter = None
if response_model: if response_model:
type_adapter = TypeAdapter(response_model) type_adapter = TypeAdapter(response_model)
schema = type_adapter.json_schema() schema = type_adapter.json_schema()

View file

@ -1,4 +1,6 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from __future__ import annotations
import argparse import argparse
import itertools import itertools
import json import json
@ -188,7 +190,7 @@ def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], ou
raise RuntimeError("At least one of min_value or max_value must be set") raise RuntimeError("At least one of min_value or max_value must be set")
class BuiltinRule: class BuiltinRule:
def __init__(self, content: str, deps: list = None): def __init__(self, content: str, deps: list | None = None):
self.content = content self.content = content
self.deps = deps or [] self.deps = deps or []
@ -248,7 +250,7 @@ class SchemaConverter:
def _format_literal(self, literal): def _format_literal(self, literal):
escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub( escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), literal lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)) or m.group(0), literal
) )
return f'"{escaped}"' return f'"{escaped}"'
@ -403,11 +405,11 @@ class SchemaConverter:
i = 0 i = 0
length = len(pattern) length = len(pattern)
def to_rule(s: Tuple[str, bool]) -> str: def to_rule(s: tuple[str, bool]) -> str:
(txt, is_literal) = s (txt, is_literal) = s
return "\"" + txt + "\"" if is_literal else txt return "\"" + txt + "\"" if is_literal else txt
def transform() -> Tuple[str, bool]: def transform() -> tuple[str, bool]:
''' '''
Parse a unit at index i (advancing it), and return its string representation + whether it's a literal. Parse a unit at index i (advancing it), and return its string representation + whether it's a literal.
''' '''
@ -420,7 +422,7 @@ class SchemaConverter:
# We only need a flat structure here to apply repetition operators to the last item, and # We only need a flat structure here to apply repetition operators to the last item, and
# to merge literals at the and (we're parsing grouped ( sequences ) recursively and don't treat '|' specially # to merge literals at the and (we're parsing grouped ( sequences ) recursively and don't treat '|' specially
# (GBNF's syntax is luckily very close to regular expressions!) # (GBNF's syntax is luckily very close to regular expressions!)
seq: list[Tuple[str, bool]] = [] seq: list[tuple[str, bool]] = []
def get_dot(): def get_dot():
if self._dotall: if self._dotall:

View file

@ -185,6 +185,8 @@ else:
fout.add_description("two-tower CLIP model") fout.add_description("two-tower CLIP model")
if has_text_encoder: if has_text_encoder:
assert t_hparams is not None
assert tokens is not None
# text_model hparams # text_model hparams
fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"]) fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"]) fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
@ -259,8 +261,8 @@ if has_vision_encoder:
if processor is not None: if processor is not None:
image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean # pyright: ignore[reportAttributeAccessIssue]
image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std # pyright: ignore[reportAttributeAccessIssue]
else: else:
image_mean = args.image_mean if args.image_mean is not None else default_image_mean image_mean = args.image_mean if args.image_mean is not None else default_image_mean
image_std = args.image_std if args.image_std is not None else default_image_std image_std = args.image_std if args.image_std is not None else default_image_std
@ -272,7 +274,7 @@ fout.add_bool("clip.use_gelu", use_gelu)
if has_llava_projector: if has_llava_projector:
model.vision_model.encoder.layers.pop(-1) model.vision_model.encoder.layers.pop(-1) # pyright: ignore[reportAttributeAccessIssue]
projector = torch.load(args.llava_projector) projector = torch.load(args.llava_projector)
for name, data in projector.items(): for name, data in projector.items():
name = get_tensor_name(name) name = get_tensor_name(name)
@ -286,7 +288,7 @@ if has_llava_projector:
print("Projector tensors added\n") print("Projector tensors added\n")
state_dict = model.state_dict() state_dict = model.state_dict() # pyright: ignore[reportAttributeAccessIssue]
for name, data in state_dict.items(): for name, data in state_dict.items():
if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector): if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
# we don't need this # we don't need this

View file

@ -2,7 +2,9 @@ import argparse
import glob import glob
import os import os
import torch import torch
from safetensors.torch import load as safe_load, save as safe_save, safe_open, save_file from safetensors import safe_open
from safetensors.torch import save_file
from typing import Any, ContextManager, cast
# Function to determine if file is a SafeTensor file # Function to determine if file is a SafeTensor file
def is_safetensor_file(file_path): def is_safetensor_file(file_path):
@ -13,7 +15,7 @@ def is_safetensor_file(file_path):
def load_model(file_path): def load_model(file_path):
if is_safetensor_file(file_path): if is_safetensor_file(file_path):
tensors = {} tensors = {}
with safe_open(file_path, framework="pt", device="cpu") as f: with cast(ContextManager[Any], safe_open(file_path, framework="pt", device="cpu")) as f:
for key in f.keys(): for key in f.keys():
tensors[key] = f.get_tensor(key).clone() tensors[key] = f.get_tensor(key).clone()
# output shape # output shape
@ -134,7 +136,7 @@ if len(mm_tensors) == 0:
if last_checkpoint is not None: if last_checkpoint is not None:
for k, v in last_checkpoint.items(): for k, v in last_checkpoint.items():
print(k) print(k)
print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint)} tensors.") print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint) if last_checkpoint is not None else 0} tensors.")
print("No tensors found. Is this a LLaVA model?") print("No tensors found. Is this a LLaVA model?")
exit() exit()
@ -143,8 +145,10 @@ print(f"Found additional {len(first_mm_tensors)} tensors to extract.")
# projector = {name: checkpoint.[name].float() for name in mm_tensors} # projector = {name: checkpoint.[name].float() for name in mm_tensors}
projector = {} projector = {}
for name in mm_tensors: for name in mm_tensors:
assert last_checkpoint is not None
projector[name] = last_checkpoint[name].float() projector[name] = last_checkpoint[name].float()
for name in first_mm_tensors: for name in first_mm_tensors:
assert first_checkpoint is not None
projector[name] = first_checkpoint[name].float() projector[name] = first_checkpoint[name].float()
if len(projector) > 0: if len(projector) > 0:

View file

@ -1,3 +1,4 @@
-r ../../requirements/requirements-convert_legacy_llama.txt -r ../../requirements/requirements-convert_legacy_llama.txt
--extra-index-url https://download.pytorch.org/whl/cpu
pillow~=10.2.0 pillow~=10.2.0
torch~=2.2.1 torch~=2.2.1

View file

@ -1,6 +1,6 @@
# llama.cpp/examples/main # llama.cpp/examples/main
This example program allows you to use various LLaMA language models in an easy and efficient way. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts. This example program allows you to use various LLaMA language models easily and efficiently. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.
## Table of Contents ## Table of Contents
@ -17,60 +17,59 @@ This example program allows you to use various LLaMA language models in an easy
To get started right away, run the following command, making sure to use the correct path for the model you have: To get started right away, run the following command, making sure to use the correct path for the model you have:
#### Unix-based systems (Linux, macOS, etc.): First, we will need to download a model. In these examples, we will use the Gemma model from the ggml-org repo on Hugging Face.
[https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true](https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true)
Once downloaded, place your model in the models folder in llama.cpp.
### Unix-based systems (Linux, macOS, etc.):
##### Input prompt (One-and-done)
```bash ```bash
./llama-cli -m models/7B/ggml-model.bin --prompt "Once upon a time" ./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time"
``` ```
##### Conversation mode (Allow for continuous interaction with the model)
#### Windows:
```powershell
llama-cli.exe -m models\7B\ggml-model.bin --prompt "Once upon a time"
```
For an interactive experience, try this command:
#### Unix-based systems (Linux, macOS, etc.):
```bash ```bash
./llama-cli -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \ ./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf -cnv --chat-template gemma
'User: Hi
AI: Hello. I am an AI chatbot. Would you like to talk?
User: Sure!
AI: What would you like to talk about?
User:'
``` ```
#### Windows: ##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):
```powershell
llama-cli.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
```
The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
#### Unix-based systems (Linux, macOS, etc.):
```bash ```bash
./llama-cli -m models/7B/ggml-model.bin --ignore-eos -n -1 ./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
``` ```
#### Windows: ### Windows:
##### Input prompt (One-and-done)
```powershell
./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time"
```
##### Conversation mode (Allow for continuous interaction with the model)
```powershell ```powershell
llama-cli.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 ./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf -cnv --chat-template gemma
```
#### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it):
```powershell
llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
``` ```
## Common Options ## Common Options
In this section, we cover the most commonly used options for running the `llama-cli` program with the LLaMA models: In this section, we cover the most commonly used options for running the `llama-cli` program with the LLaMA models:
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`; inferred from `--model-url` if set). - `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/gemma-1.1-7b-it.Q4_K_M.gguf`; inferred from `--model-url` if set).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf). - `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g [https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true](https://huggingface.co/ggml-org/gemma-1.1-7b-it-Q4_K_M-GGUF/resolve/main/gemma-1.1-7b-it.Q4_K_M.gguf?download=true)).
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text. - `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. - `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
- `-mli, --multiline-input`: Allows you to write or paste multiple lines without ending each in '\'
- `-t N, --threads N`: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has.
- - `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
## Input Prompts ## Input Prompts
@ -90,6 +89,7 @@ In interactive mode, users can participate in text generation by injecting their
- `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model. - `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model.
- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation. - `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
- `-cnv, --conversation`: Run the program in conversation mode (does not print special tokens and suffix/prefix, use default chat template) (default: false)
- `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text. - `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text.
By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs. By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
@ -117,6 +117,13 @@ The `--in-suffix` flag is used to add a suffix after your input. This is useful
```sh ```sh
./llama-cli -r "User:" --in-prefix " " --in-suffix "Assistant:" ./llama-cli -r "User:" --in-prefix " " --in-suffix "Assistant:"
``` ```
When --in-prefix or --in-suffix options are enabled the chat template ( --chat-template ) is disabled
### Chat templates
`--chat-template JINJA_TEMPLATE`: This option sets a custom jinja chat template. It accepts a string, not a file name. Default: template taken from model's metadata. Llama.cpp only supports [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template). These include llama2, llama3, gemma, monarch, chatml, orion, vicuna, vicuna-orca, deepseek, command-r, zephyr. When --in-prefix or --in-suffix options are enabled the chat template ( --chat-template ) is disabled.
Example usage: `--chat-template gemma`
## Context Management ## Context Management
@ -124,9 +131,7 @@ During text generation, LLaMA models have a limited context size, which means th
### Context Size ### Context Size
The `--ctx-size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations. - `-c N, --ctx-size N`: Set the size of the prompt context (default: 0, 0 = loaded from model). The LLaMA models were built with a context of 2048-8192, which will yield the best results on longer input/inference.
- `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
### Extended Context Size ### Extended Context Size
@ -148,15 +153,15 @@ The following options allow you to control the text generation process and fine-
### Number of Tokens to Predict

- `-n N, --predict N`: Set the number of tokens to predict when generating text (default: -1, -1 = infinity, -2 = until context filled)

The `--predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text.

A value of -1 will enable infinite text generation, even though we have a finite context window. When the context window is full, some of the earlier tokens (half of the tokens after `--keep`) will be discarded. The context must then be re-evaluated before generation can resume. On large models and/or large context windows, this will result in a significant pause in output.

If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled.

It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode, text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `--predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.
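
As a rough illustration of the context-shift behaviour described above (keep the first `--keep` tokens, then drop half of the tokens that follow), here is a small sketch; it mirrors the prose, not llama.cpp's internal bookkeeping.

```python
def shift_context(tokens: list[int], n_ctx: int, n_keep: int) -> list[int]:
    # When the window is full, keep the first n_keep tokens and discard half of
    # the remaining (oldest) tokens, as described above.
    if len(tokens) < n_ctx:
        return tokens
    n_left = len(tokens) - n_keep
    n_discard = n_left // 2
    return tokens[:n_keep] + tokens[n_keep + n_discard:]

# e.g. a full 8-token window with --keep 2 drops tokens 2..4
print(shift_context(list(range(8)), n_ctx=8, n_keep=2))  # [0, 1, 5, 6, 7]
```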
### Temperature
@ -164,15 +169,15 @@ It is important to note that the generated text may be shorter than the specifie
Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.

Example usage: `--temp 0`
### Repeat Penalty

- `--repeat-penalty N`: Control the repetition of token sequences in the generated text (default: 1.0, 1.0 = disabled).
- `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
- `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty.

The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.0 (disabled).

The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`).
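
As a sketch of how such a penalty is commonly applied, the snippet below scales the logits of recently seen tokens so they become less likely; the exact sampler code in llama.cpp may differ in details.

```python
def apply_repeat_penalty(logits: dict[int, float], last_tokens: list[int], penalty: float) -> dict[int, float]:
    # A positive logit is divided by the penalty and a negative one is
    # multiplied by it, so penalized tokens always become less likely.
    out = dict(logits)
    for tok in set(last_tokens):
        if tok in out:
            out[tok] = out[tok] / penalty if out[tok] > 0 else out[tok] * penalty
    return out

print(apply_repeat_penalty({1: 2.0, 2: -1.0, 3: 0.5}, last_tokens=[1, 2], penalty=1.1))
```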
@ -196,19 +201,19 @@ Top-p sampling, also known as nucleus sampling, is another text generation metho
Example usage: `--top-p 0.95`

### Min-P Sampling

- `--min-p N`: Sets a minimum base probability threshold for token selection (default: 0.1).

The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out.

Example usage: `--min-p 0.05`
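
The rule above amounts to a one-line filter over the candidate probabilities; a small illustrative sketch (not the actual sampler):

```python
def min_p_filter(probs: dict[int, float], p: float) -> dict[int, float]:
    # Keep only tokens whose probability is at least p times the top probability.
    threshold = p * max(probs.values())
    return {tok: pr for tok, pr in probs.items() if pr >= threshold}

# With p=0.05 and a top probability of 0.9, anything below 0.045 is dropped.
print(min_p_filter({1: 0.9, 2: 0.05, 3: 0.04}, p=0.05))  # {1: 0.9, 2: 0.05}
```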
### Tail-Free Sampling (TFS)

- `--tfs N`: Enable tail-free sampling with parameter z (default: 1.0, 1.0 = disabled).

Tail-free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. Similar to Top-P it tries to determine the bulk of the most likely tokens dynamically. But TFS filters out logits based on the second derivative of their probabilities. Adding tokens is stopped after the sum of the second derivatives reaches the parameter z. In short: TFS looks at how quickly the probabilities of the tokens decrease and cuts off the tail of unlikely tokens using the parameter z. Typical values for z are in the range of 0.9 to 0.95. A value of 1.0 would include all tokens and thus disables the effect of TFS.

Example usage: `--tfs 0.95`
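
A rough numerical sketch of the idea (sorted probabilities, absolute second differences, cumulative weight cut at z); the real sampler differs in details such as how the weights are normalized.

```python
def tail_free_keep(probs: list[float], z: float) -> int:
    # probs: candidate probabilities sorted in descending order.
    # Returns how many of the most likely tokens to keep.
    if len(probs) < 3 or z >= 1.0:
        return len(probs)                       # z = 1.0 disables TFS
    d1 = [probs[i] - probs[i + 1] for i in range(len(probs) - 1)]
    d2 = [abs(d1[i] - d1[i + 1]) for i in range(len(d1) - 1)]
    total = sum(d2) or 1.0
    cum = 0.0
    for i, w in enumerate(d2):
        cum += w / total
        if cum > z:
            return i + 1                        # cut the tail once the budget z is spent
    return len(probs)

probs = [0.50, 0.25, 0.12, 0.06, 0.04, 0.02, 0.01]
print(tail_free_keep(probs, z=0.95))            # keeps the 3 most likely tokens here
```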
@ -307,10 +312,8 @@ These options provide extra functionality and customization when running the LLa
- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
- `--verbose-prompt`: Print the prompt before generating text.
- `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
- `-hfr URL --hf-repo URL`: The url to the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable or in an OS-specific local cache.
@ -6,10 +6,10 @@ import re
from copy import copy from copy import copy
from enum import Enum from enum import Enum
from inspect import getdoc, isclass from inspect import getdoc, isclass
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin
from docstring_parser import parse from docstring_parser import parse
from pydantic import BaseModel, Field, create_model from pydantic import BaseModel, create_model
if TYPE_CHECKING: if TYPE_CHECKING:
from types import GenericAlias from types import GenericAlias
@ -17,6 +17,9 @@ else:
# python 3.8 compat # python 3.8 compat
from typing import _GenericAlias as GenericAlias from typing import _GenericAlias as GenericAlias
# TODO: fix this
# pyright: reportAttributeAccessIssue=information
class PydanticDataType(Enum): class PydanticDataType(Enum):
""" """
@ -234,8 +237,9 @@ def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None
# Define the integer part rule # Define the integer part rule
integer_part_rule = ( integer_part_rule = (
"integer-part" + (f"-max{max_digit}" if max_digit is not None else "") + ( "integer-part"
f"-min{min_digit}" if min_digit is not None else "") + (f"-max{max_digit}" if max_digit is not None else "")
+ (f"-min{min_digit}" if min_digit is not None else "")
) )
# Define the fractional part rule based on precision constraints # Define the fractional part rule based on precision constraints
@ -458,7 +462,7 @@ def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[Bas
if not issubclass(model, BaseModel): if not issubclass(model, BaseModel):
# For non-Pydantic classes, generate model_fields from __annotations__ or __init__ # For non-Pydantic classes, generate model_fields from __annotations__ or __init__
if hasattr(model, "__annotations__") and model.__annotations__: if hasattr(model, "__annotations__") and model.__annotations__:
model_fields = {name: (typ, ...) for name, typ in model.__annotations__.items()} model_fields = {name: (typ, ...) for name, typ in model.__annotations__.items()} # pyright: ignore[reportGeneralTypeIssues]
else: else:
init_signature = inspect.signature(model.__init__) init_signature = inspect.signature(model.__init__)
parameters = init_signature.parameters parameters = init_signature.parameters
@ -680,7 +684,7 @@ def generate_markdown_documentation(
str: Generated text documentation. str: Generated text documentation.
""" """
documentation = "" documentation = ""
pyd_models = [(model, True) for model in pydantic_models] pyd_models: list[tuple[type[BaseModel], bool]] = [(model, True) for model in pydantic_models]
for model, add_prefix in pyd_models: for model, add_prefix in pyd_models:
if add_prefix: if add_prefix:
documentation += f"{model_prefix}: {model.__name__}\n" documentation += f"{model_prefix}: {model.__name__}\n"
@ -700,7 +704,7 @@ def generate_markdown_documentation(
# Indenting the fields section # Indenting the fields section
documentation += f" {fields_prefix}:\n" documentation += f" {fields_prefix}:\n"
else: else:
documentation += f" Fields:\n" documentation += f" Fields:\n" # noqa: F541
if isclass(model) and issubclass(model, BaseModel): if isclass(model) and issubclass(model, BaseModel):
for name, field_type in model.__annotations__.items(): for name, field_type in model.__annotations__.items():
# if name == "markdown_code_block": # if name == "markdown_code_block":
@ -778,7 +782,7 @@ def generate_field_markdown(
return field_text return field_text
if field_description != "": if field_description != "":
field_text += f" Description: " + field_description + "\n" field_text += f" Description: {field_description}\n"
# Check for and include field-specific examples if available # Check for and include field-specific examples if available
if hasattr(model, "Config") and hasattr(model.Config, if hasattr(model, "Config") and hasattr(model.Config,
@ -833,7 +837,7 @@ def generate_text_documentation(
str: Generated text documentation. str: Generated text documentation.
""" """
documentation = "" documentation = ""
pyd_models = [(model, True) for model in pydantic_models] pyd_models: list[tuple[type[BaseModel], bool]] = [(model, True) for model in pydantic_models]
for model, add_prefix in pyd_models: for model, add_prefix in pyd_models:
if add_prefix: if add_prefix:
documentation += f"{model_prefix}: {model.__name__}\n" documentation += f"{model_prefix}: {model.__name__}\n"
@ -1164,7 +1168,7 @@ def create_dynamic_model_from_function(func: Callable[..., Any]):
dynamic_fields[param.name] = ( dynamic_fields[param.name] = (
param.annotation if param.annotation != inspect.Parameter.empty else str, default_value) param.annotation if param.annotation != inspect.Parameter.empty else str, default_value)
# Creating the dynamic model # Creating the dynamic model
dynamic_model = create_model(f"{func.__name__}", **dynamic_fields) # type: ignore[call-overload] dynamic_model = create_model(f"{func.__name__}", **dynamic_fields)
for name, param_doc in param_docs: for name, param_doc in param_docs:
dynamic_model.model_fields[name].description = param_doc.description dynamic_model.model_fields[name].description = param_doc.description
@ -1228,9 +1232,6 @@ def map_grammar_names_to_pydantic_model_class(pydantic_model_list):
return output return output
from enum import Enum
def json_schema_to_python_types(schema): def json_schema_to_python_types(schema):
type_map = { type_map = {
"any": Any, "any": Any,
@ -1275,7 +1276,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name:
if items != {}: if items != {}:
array = {"properties": items} array = {"properties": items}
array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items") array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items")
fields[field_name] = (List[array_type], ...) # type: ignore[valid-type] fields[field_name] = (List[array_type], ...)
else: else:
fields[field_name] = (list, ...) fields[field_name] = (list, ...)
elif field_type == "object": elif field_type == "object":
@ -1285,7 +1286,8 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name:
required = field_data.get("enum", []) required = field_data.get("enum", [])
for key, field in fields.items(): for key, field in fields.items():
if key not in required: if key not in required:
fields[key] = (Optional[fields[key][0]], ...) optional_type = fields[key][0]
fields[key] = (Optional[optional_type], ...)
else: else:
field_type = json_schema_to_python_types(field_type) field_type = json_schema_to_python_types(field_type)
fields[field_name] = (field_type, ...) fields[field_name] = (field_type, ...)
@ -1305,6 +1307,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name:
required = dictionary.get("required", []) required = dictionary.get("required", [])
for key, field in fields.items(): for key, field in fields.items():
if key not in required: if key not in required:
fields[key] = (Optional[fields[key][0]], ...) optional_type = fields[key][0]
fields[key] = (Optional[optional_type], ...)
custom_model = create_model(model_name, **fields) custom_model = create_model(model_name, **fields)
return custom_model return custom_model
@ -1,6 +1,7 @@
# Function calling example using pydantic models. # Function calling example using pydantic models.
from __future__ import annotations
import datetime import datetime
import importlib
import json import json
from enum import Enum from enum import Enum
from typing import Optional, Union from typing import Optional, Union
@ -215,9 +216,9 @@ for call in json_data:
if call["function"] == "Calculator": if call["function"] == "Calculator":
print(Calculator(**call["params"]).run()) print(Calculator(**call["params"]).run())
elif call["function"] == "get_current_datetime": elif call["function"] == "get_current_datetime":
print(current_datetime_model(**call["params"]).run()) print(current_datetime_model(**call["params"]).run()) # pyright: ignore[reportAttributeAccessIssue]
elif call["function"] == "get_current_weather": elif call["function"] == "get_current_weather":
print(current_weather_tool_model(**call["params"]).run()) print(current_weather_tool_model(**call["params"]).run()) # pyright: ignore[reportAttributeAccessIssue]
# Should output something like this: # Should output something like this:
# 2024-01-14 13:36:06 # 2024-01-14 13:36:06
# {"location": "London", "temperature": "42", "unit": "celsius"} # {"location": "London", "temperature": "42", "unit": "celsius"}
@ -47,6 +47,9 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", }, { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", }, { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", }, { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", }, { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", }, { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
@ -366,7 +366,8 @@ Notice that each `probs` is an array of length `n_probs`.
"assistant_name": "", "assistant_name": "",
"user_name": "", "user_name": "",
"default_generation_settings": { ... }, "default_generation_settings": { ... },
"total_slots": 1 "total_slots": 1,
"chat_template": ""
} }
``` ```
@ -374,6 +375,7 @@ Notice that each `probs` is an array of length `n_probs`.
- `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
- `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint.
- `total_slots` - the total number of slots for process requests (defined by `--parallel` option)
- `chat_template` - the model's original Jinja2 prompt template
- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming modes are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
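
Since the response now also reports `chat_template`, a client can read the model's template directly. A minimal sketch, assuming the default address `localhost:8080` and the `/props` path served by the `handle_props` handler shown later in this diff:

```python
import requests

# Query the server properties; `chat_template` is empty if the model ships no
# Jinja template in its metadata.
props = requests.get("http://localhost:8080/props").json()
print("slots:", props["total_slots"])
print("chat template:", props["chat_template"] or "<none>")
```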
@ -1,3 +1,5 @@
from __future__ import annotations
import argparse import argparse
import json import json
import os import os
@ -59,10 +61,11 @@ def main(args_in: list[str] | None = None) -> None:
sys.exit(1) sys.exit(1)
# start the benchmark # start the benchmark
iterations = 0
data = {}
try: try:
start_benchmark(args) start_benchmark(args)
iterations = 0
with open("results.github.env", 'w') as github_env: with open("results.github.env", 'w') as github_env:
# parse output # parse output
with open('k6-results.json', 'r') as bench_results: with open('k6-results.json', 'r') as bench_results:
@ -129,7 +132,7 @@ def main(args_in: list[str] | None = None) -> None:
timestamps, metric_values = zip(*values) timestamps, metric_values = zip(*values)
metric_values = [float(value) for value in metric_values] metric_values = [float(value) for value in metric_values]
prometheus_metrics[metric] = metric_values prometheus_metrics[metric] = metric_values
timestamps_dt = [datetime.fromtimestamp(int(ts)) for ts in timestamps] timestamps_dt = [str(datetime.fromtimestamp(int(ts))) for ts in timestamps]
plt.figure(figsize=(16, 10), dpi=80) plt.figure(figsize=(16, 10), dpi=80)
plt.plot(timestamps_dt, metric_values, label=metric) plt.plot(timestamps_dt, metric_values, label=metric)
plt.xticks(rotation=0, fontsize=14, horizontalalignment='center', alpha=.7) plt.xticks(rotation=0, fontsize=14, horizontalalignment='center', alpha=.7)
@ -156,7 +159,7 @@ def main(args_in: list[str] | None = None) -> None:
plt.close() plt.close()
# Mermaid format in case images upload failed # Mermaid format in case images upload failed
with (open(f"{metric}.mermaid", 'w') as mermaid_f): with open(f"{metric}.mermaid", 'w') as mermaid_f:
mermaid = ( mermaid = (
f"""--- f"""---
config: config:
@ -278,7 +281,7 @@ def start_server_background(args):
} }
server_process = subprocess.Popen( server_process = subprocess.Popen(
args, args,
**pkwargs) **pkwargs) # pyright: ignore[reportArgumentType, reportCallIssue]
def server_log(in_stream, out_stream): def server_log(in_stream, out_stream):
for line in iter(in_stream.readline, b''): for line in iter(in_stream.readline, b''):
@ -738,6 +738,8 @@ struct server_context {
slot.ga_n = ga_n; slot.ga_n = ga_n;
slot.ga_w = ga_w; slot.ga_w = ga_w;
slot.sparams = params.sparams;
slot.reset(); slot.reset();
slots.push_back(slot); slots.push_back(slot);
@ -885,7 +887,8 @@ struct server_context {
bool launch_slot_with_task(server_slot & slot, const server_task & task) { bool launch_slot_with_task(server_slot & slot, const server_task & task) {
slot_params default_params; slot_params default_params;
llama_sampling_params default_sparams; // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
llama_sampling_params default_sparams = params.sparams;
auto & data = task.data; auto & data = task.data;
if (data.count("__oaicompat") != 0) { if (data.count("__oaicompat") != 0) {
@ -2606,7 +2609,7 @@ int main(int argc, char ** argv) {
// if a custom chat template is not supplied, we will use the one that comes with the model (if any) // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
if (params.chat_template.empty()) { if (params.chat_template.empty()) {
if (!ctx_server.validate_model_chat_template()) { if (!ctx_server.validate_model_chat_template()) {
LOG_ERROR("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {}); LOG_WARNING("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
params.chat_template = "chatml"; params.chat_template = "chatml";
} }
} }
@ -2968,11 +2971,20 @@ int main(int argc, char ** argv) {
}; };
const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) { const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
std::string template_key = "tokenizer.chat_template", curr_tmpl;
int32_t tlen = llama_model_meta_val_str(ctx_server.model, template_key.c_str(), nullptr, 0);
if (tlen > 0) {
std::vector<char> curr_tmpl_buf(tlen + 1, 0);
if (llama_model_meta_val_str(ctx_server.model, template_key.c_str(), curr_tmpl_buf.data(), curr_tmpl_buf.size()) == tlen) {
curr_tmpl = std::string(curr_tmpl_buf.data(), tlen);
}
}
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = { json data = {
{ "system_prompt", ctx_server.system_prompt.c_str() }, { "system_prompt", ctx_server.system_prompt.c_str() },
{ "default_generation_settings", ctx_server.default_generation_settings_for_props }, { "default_generation_settings", ctx_server.default_generation_settings_for_props },
{ "total_slots", ctx_server.params.n_parallel } { "total_slots", ctx_server.params.n_parallel },
{ "chat_template", curr_tmpl.c_str() }
}; };
res.set_content(data.dump(), "application/json; charset=utf-8"); res.set_content(data.dump(), "application/json; charset=utf-8");
@ -1,5 +1,4 @@
import asyncio import asyncio
import collections
import json import json
import os import os
import re import re
@ -8,19 +7,23 @@ import subprocess
import sys import sys
import threading import threading
import time import time
from collections.abc import Sequence
from contextlib import closing from contextlib import closing
from re import RegexFlag from re import RegexFlag
from typing import Any, Literal, cast
import aiohttp import aiohttp
import numpy as np import numpy as np
import openai import openai
from behave import step from openai.types.chat import ChatCompletionChunk
from behave import step # pyright: ignore[reportAttributeAccessIssue]
from behave.api.async_step import async_run_until_complete from behave.api.async_step import async_run_until_complete
from prometheus_client import parser from prometheus_client import parser
# pyright: reportRedeclaration=false
@step("a server listening on {server_fqdn}:{server_port}") @step("a server listening on {server_fqdn}:{server_port}")
def step_server_config(context, server_fqdn, server_port): def step_server_config(context, server_fqdn: str, server_port: str):
context.server_fqdn = server_fqdn context.server_fqdn = server_fqdn
context.server_port = int(server_port) context.server_port = int(server_port)
context.n_threads = None context.n_threads = None
@ -74,34 +77,34 @@ def step_server_config(context, server_fqdn, server_port):
@step('a model file {hf_file} from HF repo {hf_repo}') @step('a model file {hf_file} from HF repo {hf_repo}')
def step_download_hf_model(context, hf_file, hf_repo): def step_download_hf_model(context, hf_file: str, hf_repo: str):
context.model_hf_repo = hf_repo context.model_hf_repo = hf_repo
context.model_hf_file = hf_file context.model_hf_file = hf_file
context.model_file = os.path.basename(hf_file) context.model_file = os.path.basename(hf_file)
@step('a model file {model_file}') @step('a model file {model_file}')
def step_model_file(context, model_file): def step_model_file(context, model_file: str):
context.model_file = model_file context.model_file = model_file
@step('a model url {model_url}') @step('a model url {model_url}')
def step_model_url(context, model_url): def step_model_url(context, model_url: str):
context.model_url = model_url context.model_url = model_url
@step('a model alias {model_alias}') @step('a model alias {model_alias}')
def step_model_alias(context, model_alias): def step_model_alias(context, model_alias: str):
context.model_alias = model_alias context.model_alias = model_alias
@step('{seed:d} as server seed') @step('{seed:d} as server seed')
def step_seed(context, seed): def step_seed(context, seed: int):
context.server_seed = seed context.server_seed = seed
@step('{ngl:d} GPU offloaded layers') @step('{ngl:d} GPU offloaded layers')
def step_n_gpu_layer(context, ngl): def step_n_gpu_layer(context, ngl: int):
if 'N_GPU_LAYERS' in os.environ: if 'N_GPU_LAYERS' in os.environ:
new_ngl = int(os.environ['N_GPU_LAYERS']) new_ngl = int(os.environ['N_GPU_LAYERS'])
if context.debug: if context.debug:
@ -111,37 +114,37 @@ def step_n_gpu_layer(context, ngl):
@step('{n_threads:d} threads') @step('{n_threads:d} threads')
def step_n_threads(context, n_threads): def step_n_threads(context, n_threads: int):
context.n_thread = n_threads context.n_thread = n_threads
@step('{draft:d} as draft') @step('{draft:d} as draft')
def step_draft(context, draft): def step_draft(context, draft: int):
context.draft = draft context.draft = draft
@step('{n_ctx:d} KV cache size') @step('{n_ctx:d} KV cache size')
def step_n_ctx(context, n_ctx): def step_n_ctx(context, n_ctx: int):
context.n_ctx = n_ctx context.n_ctx = n_ctx
@step('{n_slots:d} slots') @step('{n_slots:d} slots')
def step_n_slots(context, n_slots): def step_n_slots(context, n_slots: int):
context.n_slots = n_slots context.n_slots = n_slots
@step('{n_predict:d} server max tokens to predict') @step('{n_predict:d} server max tokens to predict')
def step_server_n_predict(context, n_predict): def step_server_n_predict(context, n_predict: int):
context.n_server_predict = n_predict context.n_server_predict = n_predict
@step('{slot_save_path} as slot save path') @step('{slot_save_path} as slot save path')
def step_slot_save_path(context, slot_save_path): def step_slot_save_path(context, slot_save_path: str):
context.slot_save_path = slot_save_path context.slot_save_path = slot_save_path
@step('using slot id {id_slot:d}') @step('using slot id {id_slot:d}')
def step_id_slot(context, id_slot): def step_id_slot(context, id_slot: int):
context.id_slot = id_slot context.id_slot = id_slot
@ -191,7 +194,7 @@ def step_start_server(context):
@step("the server is {expecting_status}") @step("the server is {expecting_status}")
@async_run_until_complete @async_run_until_complete
async def step_wait_for_the_server_to_be_started(context, expecting_status): async def step_wait_for_the_server_to_be_started(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str):
match expecting_status: match expecting_status:
case 'healthy': case 'healthy':
await wait_for_health_status(context, context.base_url, 200, 'ok', await wait_for_health_status(context, context.base_url, 200, 'ok',
@ -221,7 +224,7 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status):
@step('all slots are {expected_slot_status_string}') @step('all slots are {expected_slot_status_string}')
@async_run_until_complete @async_run_until_complete
async def step_all_slots_status(context, expected_slot_status_string): async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str):
match expected_slot_status_string: match expected_slot_status_string:
case 'idle': case 'idle':
expected_slot_status = 0 expected_slot_status = 0
@ -237,7 +240,7 @@ async def step_all_slots_status(context, expected_slot_status_string):
@step('a completion request with {api_error} api error') @step('a completion request with {api_error} api error')
@async_run_until_complete @async_run_until_complete
async def step_request_completion(context, api_error): async def step_request_completion(context, api_error: Literal['raised'] | str):
expect_api_error = api_error == 'raised' expect_api_error = api_error == 'raised'
seeds = await completions_seed(context, num_seeds=1) seeds = await completions_seed(context, num_seeds=1)
completion = await request_completion(context.prompts.pop(), completion = await request_completion(context.prompts.pop(),
@ -777,8 +780,8 @@ def step_assert_metric_value(context, metric_name, metric_value):
def step_available_models(context): def step_available_models(context):
# openai client always expects an api_key # openai client always expects an api_key
openai.api_key = context.user_api_key if context.user_api_key is not None else 'nope' openai.api_key = context.user_api_key if context.user_api_key is not None else 'nope'
openai.api_base = f'{context.base_url}/v1' openai.base_url = f'{context.base_url}/v1/'
context.models = openai.Model.list().data context.models = openai.models.list().data
@step('{n_model:d} models are supported') @step('{n_model:d} models are supported')
@ -789,7 +792,7 @@ def step_supported_models(context, n_model):
@step('model {i_model:d} is {param} {preposition} {param_value}') @step('model {i_model:d} is {param} {preposition} {param_value}')
def step_supported_models(context, i_model, param, preposition, param_value): def step_supported_models(context, i_model: int, param: Literal['identified', 'trained'] | str, preposition: str, param_value: str):
assert i_model < len(context.models) assert i_model < len(context.models)
model = context.models[i_model] model = context.models[i_model]
@ -798,7 +801,7 @@ def step_supported_models(context, i_model, param, preposition, param_value):
case 'identified': case 'identified':
value = model.id value = model.id
case 'trained': case 'trained':
value = str(model.meta.n_ctx_train) value = str(model.meta["n_ctx_train"])
case _: case _:
assert False, "param {param} not supported" assert False, "param {param} not supported"
assert param_value == value, f"model param {param} {value} != {param_value}" assert param_value == value, f"model param {param} {value} != {param_value}"
@ -810,6 +813,7 @@ async def concurrent_requests(context, f_completion, *args, **kwargs):
print(f"starting {context.n_prompts} concurrent completion requests...") print(f"starting {context.n_prompts} concurrent completion requests...")
assert context.n_prompts > 0 assert context.n_prompts > 0
seeds = await completions_seed(context) seeds = await completions_seed(context)
assert seeds is not None
for prompt_no in range(context.n_prompts): for prompt_no in range(context.n_prompts):
shifted_args = [context.prompts.pop(), seeds[prompt_no], *args] shifted_args = [context.prompts.pop(), seeds[prompt_no], *args]
context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs))) context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
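
In isolation, the fan-out pattern used by these concurrent steps looks like the sketch below; the names here are illustrative and are not the test suite's own.

```python
import asyncio

async def fake_completion(prompt: str, seed: int) -> str:
    await asyncio.sleep(0.01)                 # stand-in for an HTTP request
    return f"{prompt!r} (seed={seed})"

async def main() -> None:
    prompts = ["a", "b", "c"]
    seeds = [1, 2, 3]
    # Schedule all requests up front, then await them together.
    tasks = [asyncio.create_task(fake_completion(p, s)) for p, s in zip(prompts, seeds)]
    for result in await asyncio.gather(*tasks):
        print(result)

asyncio.run(main())
```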
@ -861,7 +865,7 @@ async def request_completion(prompt,
id_slot=None, id_slot=None,
expect_api_error=None, expect_api_error=None,
user_api_key=None, user_api_key=None,
temperature=None): temperature=None) -> int | dict[str, Any]:
if debug: if debug:
print(f"Sending completion request: {prompt}") print(f"Sending completion request: {prompt}")
origin = "my.super.domain" origin = "my.super.domain"
@ -899,8 +903,8 @@ async def request_completion(prompt,
async def oai_chat_completions(user_prompt, async def oai_chat_completions(user_prompt,
seed, seed,
system_prompt, system_prompt,
base_url, base_url: str,
base_path, base_path: str,
async_client, async_client,
debug=False, debug=False,
temperature=None, temperature=None,
@ -909,7 +913,7 @@ async def oai_chat_completions(user_prompt,
enable_streaming=None, enable_streaming=None,
response_format=None, response_format=None,
user_api_key=None, user_api_key=None,
expect_api_error=None): expect_api_error=None) -> int | dict[str, Any]:
if debug: if debug:
print(f"Sending OAI Chat completions request: {user_prompt}") print(f"Sending OAI Chat completions request: {user_prompt}")
# openai client always expects an api key # openai client always expects an api key
@ -989,32 +993,35 @@ async def oai_chat_completions(user_prompt,
else: else:
try: try:
openai.api_key = user_api_key openai.api_key = user_api_key
openai.api_base = f'{base_url}{base_path}' openai.base_url = f'{base_url}{base_path.removesuffix("chat")}'
chat_completion = openai.Completion.create( assert model is not None
chat_completion = openai.chat.completions.create(
messages=payload['messages'], messages=payload['messages'],
model=model, model=model,
max_tokens=n_predict, max_tokens=n_predict,
stream=enable_streaming, stream=enable_streaming,
response_format=payload.get('response_format'), response_format=payload.get('response_format') or openai.NOT_GIVEN,
seed=seed, seed=seed,
temperature=payload['temperature'] temperature=payload['temperature']
) )
except openai.error.AuthenticationError as e: except openai.AuthenticationError as e:
if expect_api_error is not None and expect_api_error: if expect_api_error is not None and expect_api_error:
return 401 return 401
else: else:
assert False, f'error raised: {e}' assert False, f'error raised: {e}'
if enable_streaming: if enable_streaming:
chat_completion = cast(openai.Stream[ChatCompletionChunk], chat_completion)
for chunk in chat_completion: for chunk in chat_completion:
assert len(chunk.choices) == 1 assert len(chunk.choices) == 1
delta = chunk.choices[0].delta delta = chunk.choices[0].delta
if 'content' in delta: if delta.content is not None:
completion_response['content'] += delta['content'] completion_response['content'] += delta.content
completion_response['timings']['predicted_n'] += 1 completion_response['timings']['predicted_n'] += 1
completion_response['truncated'] = chunk.choices[0].finish_reason != 'stop' completion_response['truncated'] = chunk.choices[0].finish_reason != 'stop'
else: else:
assert len(chat_completion.choices) == 1 assert len(chat_completion.choices) == 1
assert chat_completion.usage is not None
completion_response = { completion_response = {
'content': chat_completion.choices[0].message.content, 'content': chat_completion.choices[0].message.content,
'timings': { 'timings': {
@ -1028,7 +1035,7 @@ async def oai_chat_completions(user_prompt,
return completion_response return completion_response
async def request_embedding(content, seed, base_url=None): async def request_embedding(content, seed, base_url=None) -> list[list[float]]:
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession() as session:
async with session.post(f'{base_url}/embedding', async with session.post(f'{base_url}/embedding',
json={ json={
@ -1041,7 +1048,7 @@ async def request_embedding(content, seed, base_url=None):
async def request_oai_embeddings(input, seed, async def request_oai_embeddings(input, seed,
base_url=None, user_api_key=None, base_url=None, user_api_key=None,
model=None, async_client=False): model=None, async_client=False) -> list[list[float]]:
# openai client always expects an api_key # openai client always expects an api_key
user_api_key = user_api_key if user_api_key is not None else 'nope' user_api_key = user_api_key if user_api_key is not None else 'nope'
if async_client: if async_client:
@ -1063,7 +1070,7 @@ async def request_oai_embeddings(input, seed,
response_json = await response.json() response_json = await response.json()
assert response_json['model'] == model, f"invalid model received: {response_json['model']}" assert response_json['model'] == model, f"invalid model received: {response_json['model']}"
assert response_json['object'] == 'list' assert response_json['object'] == 'list'
if isinstance(input, collections.abc.Sequence): if isinstance(input, Sequence):
embeddings = [] embeddings = []
for an_oai_embeddings in response_json['data']: for an_oai_embeddings in response_json['data']:
embeddings.append(an_oai_embeddings['embedding']) embeddings.append(an_oai_embeddings['embedding'])
@ -1072,19 +1079,14 @@ async def request_oai_embeddings(input, seed,
return embeddings return embeddings
else: else:
openai.api_key = user_api_key openai.api_key = user_api_key
openai.api_base = f'{base_url}/v1' openai.base_url = f'{base_url}/v1/'
oai_embeddings = openai.Embedding.create( assert model is not None
oai_embeddings = openai.embeddings.create(
model=model, model=model,
input=input, input=input,
) )
if isinstance(input, collections.abc.Sequence): return [e.embedding for e in oai_embeddings.data]
embeddings = []
for an_oai_embeddings in oai_embeddings.data:
embeddings.append(an_oai_embeddings.embedding)
else:
embeddings = [oai_embeddings.data.embedding]
return embeddings
def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None): def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None):
@ -1343,7 +1345,7 @@ def start_server_background(context):
} }
context.server_process = subprocess.Popen( context.server_process = subprocess.Popen(
[str(arg) for arg in [context.server_path, *server_args]], [str(arg) for arg in [context.server_path, *server_args]],
**pkwargs) **pkwargs) # pyright: ignore[reportArgumentType, reportCallIssue]
def server_log(in_stream, out_stream): def server_log(in_stream, out_stream):
for line in iter(in_stream.readline, b''): for line in iter(in_stream.readline, b''):
@ -1,6 +1,6 @@
aiohttp~=3.9.3
behave~=1.2.6
huggingface_hub~=0.20.3
numpy~=1.26.4
openai~=1.30.3
prometheus-client~=0.20.0
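
The bump from `openai~=0.25.0` to `openai~=1.30.3` is what drives the `openai.base_url` and `openai.chat.completions.create` changes in the steps above. A minimal sketch of the 1.x client style against a local llama.cpp server (the URL and model name are placeholders):

```python
from openai import OpenAI

# The 1.x client is instantiated explicitly instead of configuring module
# globals such as openai.api_base; the llama.cpp server accepts any api_key.
client = OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")

resp = client.chat.completions.create(
    model="gpt-3.5-turbo",  # ignored by llama.cpp, but required by the client
    messages=[{"role": "user", "content": "Say hello in one word."}],
    max_tokens=8,
)
print(resp.choices[0].message.content)
```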
@ -1,13 +1,15 @@
import asyncio import asyncio
import asyncio.threads
import requests import requests
import numpy as np import numpy as np
n = 8 n = 8
result = [] result = []
async def requests_post_async(*args, **kwargs): async def requests_post_async(*args, **kwargs):
return await asyncio.to_thread(requests.post, *args, **kwargs) return await asyncio.threads.to_thread(requests.post, *args, **kwargs)
async def main(): async def main():
model_url = "http://127.0.0.1:6900" model_url = "http://127.0.0.1:6900"
@ -29,6 +29,7 @@ static void print_usage_information(const char * argv0, FILE * stream) {
fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n"); fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
fprintf(stream, " --stdin read prompt from standard input.\n"); fprintf(stream, " --stdin read prompt from standard input.\n");
fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n"); fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
fprintf(stream, " --no-parse-special do not parse control tokens.\n");
fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n"); fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n");
fprintf(stream, " --show-count print the total number of tokens.\n"); fprintf(stream, " --show-count print the total number of tokens.\n");
} }
@ -195,6 +196,7 @@ int main(int raw_argc, char ** raw_argv) {
// variables where to put any arguments we see. // variables where to put any arguments we see.
bool printing_ids = false; bool printing_ids = false;
bool no_bos = false; bool no_bos = false;
bool no_parse_special = false;
bool disable_logging = false; bool disable_logging = false;
bool show_token_count = false; bool show_token_count = false;
const char * model_path = NULL; const char * model_path = NULL;
@ -229,6 +231,9 @@ int main(int raw_argc, char ** raw_argv) {
else if (arg == "--no-bos") { else if (arg == "--no-bos") {
no_bos = true; no_bos = true;
} }
else if (arg == "--no-parse-special") {
no_parse_special = true;
}
else if (arg == "-p" || arg == "--prompt") { else if (arg == "-p" || arg == "--prompt") {
if (prompt_set) { if (prompt_set) {
fprintf(stderr, "Error: -p or --prompt specified multiple times.\n"); fprintf(stderr, "Error: -p or --prompt specified multiple times.\n");
@ -359,9 +364,10 @@ int main(int raw_argc, char ** raw_argv) {
const bool model_wants_add_bos = llama_should_add_bos_token(model); const bool model_wants_add_bos = llama_should_add_bos_token(model);
const bool add_bos = model_wants_add_bos && !no_bos; const bool add_bos = model_wants_add_bos && !no_bos;
const bool parse_special = !no_parse_special;
std::vector<llama_token> tokens; std::vector<llama_token> tokens;
tokens = ::llama_tokenize(model, prompt, add_bos, true); tokens = ::llama_tokenize(model, prompt, add_bos, parse_special);
if (printing_ids) { if (printing_ids) {
printf("["); printf("[");
@ -66,7 +66,7 @@ class Tensor:
if len(self.ne) == 0: if len(self.ne) == 0:
self.nbytes = 0 self.nbytes = 0
else: else:
self.nbytes = int(np.product(self.ne)) * 4 self.nbytes = int(np.prod(self.ne)) * 4
else: else:
raise ValueError(f"Unhandled data type '{self.dtype}'") raise ValueError(f"Unhandled data type '{self.dtype}'")
@ -99,6 +99,8 @@ async def main():
tasks = [] tasks = []
base_dict = {"FLOAT_TYPE": "float"}
for fp16 in (False, True): for fp16 in (False, True):
# MUL_MAT # MUL_MAT
matmul_shaders(tasks, fp16, False) matmul_shaders(tasks, fp16, False)
@ -106,8 +108,6 @@ async def main():
matmul_shaders(tasks, fp16, True) matmul_shaders(tasks, fp16, True)
for tname in type_names: for tname in type_names:
base_dict = {"FLOAT_TYPE": "float"}
# mul mat vec # mul mat vec
data_a_key = f"DATA_A_{tname.upper()}" data_a_key = f"DATA_A_{tname.upper()}"
shader = f"mul_mat_vec_{tname}.comp" if tname.endswith("_k") else "mul_mat_vec.comp" shader = f"mul_mat_vec_{tname}.comp" if tname.endswith("_k") else "mul_mat_vec.comp"
@ -390,6 +390,9 @@ extern "C" {
GGML_TYPE_F64 = 28, GGML_TYPE_F64 = 28,
GGML_TYPE_IQ1_M = 29, GGML_TYPE_IQ1_M = 29,
GGML_TYPE_BF16 = 30, GGML_TYPE_BF16 = 30,
GGML_TYPE_Q4_0_4_4 = 31,
GGML_TYPE_Q4_0_4_8 = 32,
GGML_TYPE_Q4_0_8_8 = 33,
GGML_TYPE_COUNT, GGML_TYPE_COUNT,
}; };
@ -431,6 +434,9 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
}; };
// available tensor operations: // available tensor operations:
@ -2413,6 +2419,12 @@ extern "C" {
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc); const void * GGML_RESTRICT y, size_t by, int nrc);
typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr,
int64_t k, int64_t bx);
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef struct { typedef struct {
const char * type_name; const char * type_name;
@ -2425,6 +2437,11 @@ extern "C" {
ggml_vec_dot_t vec_dot; ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type; enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously; int64_t nrows; // number of rows to process simultaneously;
int64_t ncols; // number of columns to process simultaneously;
int64_t interleave_blcksize; // interleave elements in blocks of interleave_blcksize;
ggml_from_float_to_mat_t from_float_to_mat;
ggml_gemv_t gemv;
ggml_gemm_t gemm;
} ggml_type_traits_t; } ggml_type_traits_t;
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
ggml/src/ggml-aarch64.c (new file, 2187 lines; diff suppressed because it is too large)
ggml/src/ggml-aarch64.h (new file, 39 lines)
@ -0,0 +1,39 @@
// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
#pragma once
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "ggml.h"
// GGML internal header
#ifdef __cplusplus
extern "C" {
#endif
// Quantization
void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t interleave_blcksize);
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
// GEMV
void ggml_gemv_q4_0_4x4_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_4x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
// GEMM
void ggml_gemm_q4_0_4x4_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
#ifdef __cplusplus
}
#endif
@ -199,6 +199,30 @@ typedef struct {
} block_q8_1; } block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding"); static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
typedef struct {
ggml_half d[4]; // deltas for 4 q4_0 blocks
uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks
} block_q4_0x4;
static_assert(sizeof(block_q4_0x4) == 4 * sizeof(ggml_half) + QK4_0 * 2, "wrong q4_0x4 block size/padding");
typedef struct {
ggml_half d[8]; // deltas for 8 q4_0 blocks
uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks
} block_q4_0x8;
static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_half) + QK4_0 * 4, "wrong q4_0x8 block size/padding");
typedef struct {
ggml_half d[4]; // deltas for 4 q8_0 blocks
int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks
} block_q8_0x4;
static_assert(sizeof(block_q8_0x4) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong q8_0x4 block size/padding");
typedef struct {
ggml_half d[8]; // deltas for 8 q8_0 blocks
int8_t qs[QK8_0 * 8]; // quants for 8 q8_0 blocks
} block_q8_0x8;
static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
//
// Super-block quantization structures
//
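
The four `static_assert`s above pin down the interleaved block layouts. As a quick sanity check, the same sizes can be recomputed by hand, assuming the usual ggml constants `QK4_0 = QK8_0 = 32` and a 2-byte `ggml_half`:

```python
# Sanity-check of the interleaved block sizes, assuming QK4_0 = QK8_0 = 32 and
# sizeof(ggml_half) = 2 (the standard ggml values).
QK4_0 = QK8_0 = 32
HALF = 2

sizes = {
    "block_q4_0x4": 4 * HALF + QK4_0 * 2,   # 4 deltas + packed nibbles for 4 blocks
    "block_q4_0x8": 8 * HALF + QK4_0 * 4,   # 8 deltas + packed nibbles for 8 blocks
    "block_q8_0x4": 4 * HALF + QK8_0 * 4,   # 4 deltas + int8 quants for 4 blocks
    "block_q8_0x8": 8 * HALF + QK8_0 * 8,   # 8 deltas + int8 quants for 8 blocks
}
for name, nbytes in sizes.items():
    print(f"{name}: {nbytes} bytes")        # 72, 144, 136, 272
```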
@ -31,6 +31,7 @@ bool g_mul_mat_q = false;
#include "ggml-cuda/tsembd.cuh" #include "ggml-cuda/tsembd.cuh"
#include "ggml-cuda/unary.cuh" #include "ggml-cuda/unary.cuh"
#include "ggml-cuda/upscale.cuh" #include "ggml-cuda/upscale.cuh"
#include "ggml-cuda/conv-transpose-1d.cuh"
#include <algorithm> #include <algorithm>
#include <array> #include <array>
@ -2265,6 +2266,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_IM2COL: case GGML_OP_IM2COL:
ggml_cuda_op_im2col(ctx, dst); ggml_cuda_op_im2col(ctx, dst);
break; break;
case GGML_OP_CONV_TRANSPOSE_1D:
ggml_cuda_op_conv_transpose_1d(ctx,dst);
break;
case GGML_OP_POOL_2D: case GGML_OP_POOL_2D:
ggml_cuda_op_pool2d(ctx, dst); ggml_cuda_op_pool2d(ctx, dst);
break; break;
@ -2808,6 +2812,15 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
ggml_type src0_type = op->src[0]->type; ggml_type src0_type = op->src[0]->type;
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16; return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
} break; } break;
case GGML_OP_CONV_TRANSPOSE_1D:
{
ggml_type src0_type = op->src[0]->type;
ggml_type src1_type = op->src[1]->type;
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
return true;
}
return false;
} break;
case GGML_OP_NONE: case GGML_OP_NONE:
case GGML_OP_RESHAPE: case GGML_OP_RESHAPE:
case GGML_OP_VIEW: case GGML_OP_VIEW:

View file

@ -0,0 +1,87 @@
#include "conv-transpose-1d.cuh"
static __global__ void conv_transpose_1d_kernel(
const int s0, const int p0, const int d0, const int output_size,
const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3,
const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3,
const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
const float * src0, const float * src1, float * dst) {
int global_index = threadIdx.x + blockIdx.x * blockDim.x;
if (global_index >= output_size) {
return;
}
int out_index = global_index / dst_ne0;
float accumulator = 0;
for (int c = 0; c < src0_ne2; c++) {
int idx = global_index % dst_ne0;
int kernel_offset = (src0_ne0 * src0_ne1 * c) + (out_index * src0_ne0);
int input_offset = src1_ne0 * c;
for (int i = 0; i < src1_ne0; i++) {
if (!(idx >= i*s0 && idx < i*s0 + src0_ne0)) {
continue;
}
int weight_idx = idx - i*s0;
float kernel_weight = src0[kernel_offset + weight_idx];
float input_value = src1[input_offset+i];
accumulator += kernel_weight * input_value;
}
}
dst[global_index] = accumulator;
}
static void conv_transpose_1d_f32_f32_cuda(
const int s0, const int p0, const int d0, const int output_size,
const int src0_ne0, const int src0_ne1, const int src0_ne2, const int src0_ne3,
const int src1_ne0, const int src1_ne1, const int src1_ne2, const int src1_ne3,
const int dst_ne0, const int dst_ne1, const int dst_ne2, const int dst_ne3,
const float * src0, const float * src1, float * dst,
cudaStream_t stream) {
const int num_blocks = (output_size + CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE;
conv_transpose_1d_kernel<<<num_blocks,CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE, 0, stream>>>(
s0,p0,d0,output_size,
src0_ne0, src0_ne1, src0_ne2, src0_ne3,
src1_ne0, src1_ne1, src1_ne2, src1_ne3,
dst_ne0, dst_ne1, dst_ne2, dst_ne3,
src0,src1, dst);
}
void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
const ggml_tensor * src1 = dst->src[1];
const float * src1_d = (const float *)src1->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));
const int32_t * opts = (const int32_t *)dst->op_params;
const int s0 = opts[0];
const int p0 = 0;//opts[3];
const int d0 = 1;//opts[4];
const int64_t kernel_size = ggml_nelements(src0);
const int64_t input_size = ggml_nelements(src1);
const int64_t output_size = ggml_nelements(dst);
conv_transpose_1d_f32_f32_cuda(s0, p0, d0, output_size,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
src0_d, src1_d, dst_d, stream);
}
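As a sanity check on the index math in the kernel above, here is a hedged CPU reference in plain C++ that walks the same global_index -> (channel, position) mapping with p0 = 0 and d0 = 1; the sizes and values are made up for illustration.

    #include <cstdio>
    #include <vector>

    int main() {
        const int s0 = 2, K = 3;                       // stride, kernel width (src0_ne0)
        const int Cout = 1, Cin = 1, L = 4;            // src0_ne1, src0_ne2, src1_ne0
        const int Lout = (L - 1) * s0 + K;             // dst_ne0 with p0 = 0, d0 = 1

        std::vector<float> kernel = {1.f, 2.f, 3.f};      // laid out as [Cin][Cout][K]
        std::vector<float> input  = {1.f, 1.f, 1.f, 1.f}; // laid out as [Cin][L]
        std::vector<float> out(Cout * Lout, 0.f);

        for (int g = 0; g < Cout * Lout; ++g) {        // one "thread" per output element
            const int oc  = g / Lout;                  // out_index in the kernel above
            const int idx = g % Lout;                  // position within the output row
            float acc = 0.f;
            for (int c = 0; c < Cin; ++c)
                for (int i = 0; i < L; ++i)
                    if (idx >= i * s0 && idx < i * s0 + K)
                        acc += kernel[(K * Cout) * c + oc * K + (idx - i * s0)] * input[L * c + i];
            out[g] = acc;
        }
        for (float v : out) std::printf("%g ", v);     // prints: 1 2 4 2 4 2 4 2 3
        std::printf("\n");
        return 0;
    }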

View file

@ -0,0 +1,5 @@
#include "common.cuh"
#define CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE 256
void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
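For reference, the launch shape used by conv_transpose_1d_f32_f32_cuda is one thread per dst element, rounded up to whole blocks of the size defined here; a quick sketch of that ceiling division with a made-up element count:

    #include <cstdio>

    int main() {
        const int block_size  = 256;                  // CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE
        const int output_size = 1000;                 // hypothetical ggml_nelements(dst)
        const int num_blocks  = (output_size + block_size - 1) / block_size;
        std::printf("%d blocks x %d threads = %d threads for %d outputs\n",
                    num_blocks, block_size, num_blocks * block_size, output_size);
        // the kernel's early return (global_index >= output_size) masks the extra 24 threads
        return 0;
    }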

View file

@ -609,6 +609,10 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#endif // defined(__ARM_NEON) && (!defined(__MSC_VER) #endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif // __ARM_FEATURE_SVE
// precomputed f32 table for f16 (256 KB) // precomputed f32 table for f16 (256 KB)
// defined in ggml.c, initialized in ggml_init() // defined in ggml.c, initialized in ggml_init()
extern float ggml_table_f32_f16[1 << 16]; extern float ggml_table_f32_f16[1 << 16];

View file

@ -3815,6 +3815,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
} }
#endif #endif
#if defined(__ARM_FEATURE_SVE) #if defined(__ARM_FEATURE_SVE)
if (svcntb() == QK8_0) {
const svbool_t ptrueh = svptrue_pat_b8(SV_VL16); const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh); const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
@ -3851,7 +3852,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
} }
*s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); *s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
#elif defined(__ARM_NEON) return;
}
#endif
#if defined(__ARM_NEON)
float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv0 = vdupq_n_f32(0.0f);
float32x4_t sumv1 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f);
@ -5423,6 +5427,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
} }
#endif #endif
#if defined(__ARM_FEATURE_SVE) #if defined(__ARM_FEATURE_SVE)
if (svcntb() == QK8_0) {
svfloat32_t sumv0 = svdup_n_f32(0.0f); svfloat32_t sumv0 = svdup_n_f32(0.0f);
svfloat32_t sumv1 = svdup_n_f32(0.0f); svfloat32_t sumv1 = svdup_n_f32(0.0f);
@ -5447,7 +5452,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
} }
*s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); *s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
#elif defined(__ARM_NEON) return;
}
#endif
#if defined(__ARM_NEON)
float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv0 = vdupq_n_f32(0.0f);
float32x4_t sumv1 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f);
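The SVE hunks above wrap the existing 256-bit kernels in a run-time vector-length check instead of asserting: svcntb() reports the SVE register width in bytes, and only when it equals QK8_0 (32 bytes, i.e. 256-bit SVE) is the SVE path taken; otherwise control falls through to the NEON code. A minimal sketch of that dispatch pattern, assuming an AArch64 toolchain with SVE enabled:

    #include <cstdio>
    #if defined(__ARM_FEATURE_SVE)
    #include <arm_sve.h>
    #endif

    // true only when the hardware vector length matches what the hand-written
    // SVE kernels expect (QK8_0 == 32 bytes, i.e. 256-bit SVE registers)
    static bool use_sve_q8_0_path() {
    #if defined(__ARM_FEATURE_SVE)
        return svcntb() == 32;   // svcntb(): SVE vector length in bytes at run time
    #else
        return false;            // built without SVE: always take the NEON/scalar path
    #endif
    }

    int main() {
        std::printf("SVE q8_0 path: %s\n", use_sve_q8_0_path() ? "yes" : "no (fall through)");
        return 0;
    }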
@ -14761,6 +14769,16 @@ static bool validate_fp16(ggml_fp16_t f, size_t i) {
} \ } \
} }
#define VALIDATE_ROW_DATA_DVEC_F16_IMPL(type, data, nb, nr) \
const type * q = (const type *) (data); \
for (size_t i = 0; i < (nb); ++i) { \
for (size_t j = 0; j < (nr); ++j) { \
if (!validate_fp16(q[i].d[j], i)) { \
return false; \
} \
} \
}
bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) { bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) {
if (type < 0 || type >= GGML_TYPE_COUNT) { if (type < 0 || type >= GGML_TYPE_COUNT) {
fprintf(stderr, "%s: invalid type %d\n", __func__, type); fprintf(stderr, "%s: invalid type %d\n", __func__, type);
@ -14978,6 +14996,16 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
{ {
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb); VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
} break; } break;
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
{
VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4, data, nbytes / sizeof(block_q4_0x4), 4);
} break;
case GGML_TYPE_Q4_0_8_8:
{
VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8);
} break;
case GGML_TYPE_I8: case GGML_TYPE_I8:
case GGML_TYPE_I16: case GGML_TYPE_I16:
case GGML_TYPE_I32: case GGML_TYPE_I32:

View file

@ -3658,6 +3658,10 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE); use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
#endif // SYCL_USE_XMX #endif // SYCL_USE_XMX
// mmvq path is faster in the CUDA backend.
if (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda)
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
// KQ single-batch // KQ single-batch
ggml_sycl_mul_mat_vec_p021(ctx, src0, src1, dst); ggml_sycl_mul_mat_vec_p021(ctx, src0, src1, dst);

View file

@ -346,4 +346,10 @@ inline sycl::vec<Tp, n> vec_aligned_load(const Tp* aligned_ptr) {
return *reinterpret_cast<const sycl::vec<Tp, n>*>(aligned_ptr); return *reinterpret_cast<const sycl::vec<Tp, n>*>(aligned_ptr);
} }
// Helper for accessing pointers with no warnings
template <typename Tp, int dim>
static __dpct_inline__ Tp* get_pointer(sycl::local_accessor<Tp, dim> acc) {
return acc.template get_multi_ptr<sycl::access::decorated::no>().get();
}
#endif // GGML_SYCL_COMMON_HPP #endif // GGML_SYCL_COMMON_HPP

View file

@ -158,7 +158,7 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int k,
sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32),
sycl::range<3>(1, 1, 32)), sycl::range<3>(1, 1, 32)),
[=](sycl::nd_item<3> item_ct1) { [=](sycl::nd_item<3> item_ct1) {
dequantize_block_q4_K(vx, y, scale_local_acc.get_pointer(), item_ct1); dequantize_block_q4_K(vx, y, get_pointer(scale_local_acc), item_ct1);
}); });
}); });
} }

View file

@ -1835,10 +1835,10 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q4_0<need_check>( mul_mat_q4_0<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_qs_q4_0_acc_ct1.get_pointer(), get_pointer(tile_x_qs_q4_0_acc_ct1),
tile_x_d_q4_0_acc_ct1.get_pointer(), get_pointer(tile_x_d_q4_0_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -1870,10 +1870,10 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q4_0<need_check>( mul_mat_q4_0<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_qs_q4_0_acc_ct1.get_pointer(), get_pointer(tile_x_qs_q4_0_acc_ct1),
tile_x_d_q4_0_acc_ct1.get_pointer(), get_pointer(tile_x_d_q4_0_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -1950,10 +1950,10 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q4_1<need_check>( mul_mat_q4_1<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_qs_q4_1_acc_ct1.get_pointer(), get_pointer(tile_x_qs_q4_1_acc_ct1),
tile_x_dm_q4_1_acc_ct1.get_pointer(), get_pointer(tile_x_dm_q4_1_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -1985,10 +1985,10 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q4_1<need_check>( mul_mat_q4_1<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_qs_q4_1_acc_ct1.get_pointer(), get_pointer(tile_x_qs_q4_1_acc_ct1),
tile_x_dm_q4_1_acc_ct1.get_pointer(), get_pointer(tile_x_dm_q4_1_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2065,10 +2065,10 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q5_0<need_check>( mul_mat_q5_0<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_q5_0_acc_ct1.get_pointer(), get_pointer(tile_x_ql_q5_0_acc_ct1),
tile_x_d_q5_0_acc_ct1.get_pointer(), get_pointer(tile_x_d_q5_0_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2100,10 +2100,10 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q5_0<need_check>( mul_mat_q5_0<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_q5_0_acc_ct1.get_pointer(), get_pointer(tile_x_ql_q5_0_acc_ct1),
tile_x_d_q5_0_acc_ct1.get_pointer(), get_pointer(tile_x_d_q5_0_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2180,10 +2180,10 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q5_1<need_check>( mul_mat_q5_1<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_q5_1_acc_ct1.get_pointer(), get_pointer(tile_x_ql_q5_1_acc_ct1),
tile_x_dm_q5_1_acc_ct1.get_pointer(), get_pointer(tile_x_dm_q5_1_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2215,10 +2215,10 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q5_1<need_check>( mul_mat_q5_1<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_q5_1_acc_ct1.get_pointer(), get_pointer(tile_x_ql_q5_1_acc_ct1),
tile_x_dm_q5_1_acc_ct1.get_pointer(), get_pointer(tile_x_dm_q5_1_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2295,10 +2295,10 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q8_0<need_check>( mul_mat_q8_0<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_qs_q8_0_acc_ct1.get_pointer(), get_pointer(tile_x_qs_q8_0_acc_ct1),
tile_x_d_q8_0_acc_ct1.get_pointer(), get_pointer(tile_x_d_q8_0_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2330,10 +2330,10 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q8_0<need_check>( mul_mat_q8_0<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_qs_q8_0_acc_ct1.get_pointer(), get_pointer(tile_x_qs_q8_0_acc_ct1),
tile_x_d_q8_0_acc_ct1.get_pointer(), get_pointer(tile_x_d_q8_0_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2412,11 +2412,11 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q2_K<need_check>( mul_mat_q2_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_q2_K_acc_ct1.get_pointer(), get_pointer(tile_x_ql_q2_K_acc_ct1),
tile_x_dm_q2_K_acc_ct1.get_pointer(), get_pointer(tile_x_dm_q2_K_acc_ct1),
tile_x_sc_q2_K_acc_ct1.get_pointer(), get_pointer(tile_x_sc_q2_K_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2450,11 +2450,11 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q2_K<need_check>( mul_mat_q2_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_q2_K_acc_ct1.get_pointer(), get_pointer(tile_x_ql_q2_K_acc_ct1),
tile_x_dm_q2_K_acc_ct1.get_pointer(), get_pointer(tile_x_dm_q2_K_acc_ct1),
tile_x_sc_q2_K_acc_ct1.get_pointer(), get_pointer(tile_x_sc_q2_K_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2537,12 +2537,12 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q3_K<need_check>( mul_mat_q3_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_q3_K_acc_ct1.get_pointer(), get_pointer(tile_x_ql_q3_K_acc_ct1),
tile_x_dm_q3_K_acc_ct1.get_pointer(), get_pointer(tile_x_dm_q3_K_acc_ct1),
tile_x_qh_q3_K_acc_ct1.get_pointer(), get_pointer(tile_x_qh_q3_K_acc_ct1),
tile_x_sc_q3_K_acc_ct1.get_pointer(), get_pointer(tile_x_sc_q3_K_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2578,12 +2578,12 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q3_K<need_check>( mul_mat_q3_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_q3_K_acc_ct1.get_pointer(), get_pointer(tile_x_ql_q3_K_acc_ct1),
tile_x_dm_q3_K_acc_ct1.get_pointer(), get_pointer(tile_x_dm_q3_K_acc_ct1),
tile_x_qh_q3_K_acc_ct1.get_pointer(), get_pointer(tile_x_qh_q3_K_acc_ct1),
tile_x_sc_q3_K_acc_ct1.get_pointer(), get_pointer(tile_x_sc_q3_K_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2663,11 +2663,11 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q4_K<need_check>( mul_mat_q4_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_q4_K_acc_ct1.get_pointer(), get_pointer(tile_x_ql_q4_K_acc_ct1),
tile_x_dm_q4_K_acc_ct1.get_pointer(), get_pointer(tile_x_dm_q4_K_acc_ct1),
tile_x_sc_q4_K_acc_ct1.get_pointer(), get_pointer(tile_x_sc_q4_K_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2701,11 +2701,11 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q4_K<need_check>( mul_mat_q4_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_q4_K_acc_ct1.get_pointer(), get_pointer(tile_x_ql_q4_K_acc_ct1),
tile_x_dm_q4_K_acc_ct1.get_pointer(), get_pointer(tile_x_dm_q4_K_acc_ct1),
tile_x_sc_q4_K_acc_ct1.get_pointer(), get_pointer(tile_x_sc_q4_K_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2784,11 +2784,11 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q5_K<need_check>( mul_mat_q5_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_q5_K_acc_ct1.get_pointer(), get_pointer(tile_x_ql_q5_K_acc_ct1),
tile_x_dm_q5_K_acc_ct1.get_pointer(), get_pointer(tile_x_dm_q5_K_acc_ct1),
tile_x_sc_q5_K_acc_ct1.get_pointer(), get_pointer(tile_x_sc_q5_K_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2822,11 +2822,11 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q5_K<need_check>( mul_mat_q5_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_q5_K_acc_ct1.get_pointer(), get_pointer(tile_x_ql_q5_K_acc_ct1),
tile_x_dm_q5_K_acc_ct1.get_pointer(), get_pointer(tile_x_dm_q5_K_acc_ct1),
tile_x_sc_q5_K_acc_ct1.get_pointer(), get_pointer(tile_x_sc_q5_K_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2905,11 +2905,11 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q6_K<need_check>( mul_mat_q6_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_acc_ct1.get_pointer(), get_pointer(tile_x_ql_acc_ct1),
tile_x_dm_acc_ct1.get_pointer(), get_pointer(tile_x_dm_acc_ct1),
tile_x_sc_acc_ct1.get_pointer(), get_pointer(tile_x_sc_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }
@ -2943,11 +2943,11 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
mul_mat_q6_K<need_check>( mul_mat_q6_K<need_check>(
vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
nrows_dst, item_ct1, nrows_dst, item_ct1,
tile_x_ql_acc_ct1.get_pointer(), get_pointer(tile_x_ql_acc_ct1),
tile_x_dm_acc_ct1.get_pointer(), get_pointer(tile_x_dm_acc_ct1),
tile_x_sc_acc_ct1.get_pointer(), get_pointer(tile_x_sc_acc_ct1),
tile_y_qs_acc_ct1.get_pointer(), get_pointer(tile_y_qs_acc_ct1),
tile_y_ds_acc_ct1.get_pointer()); get_pointer(tile_y_ds_acc_ct1));
}); });
}); });
} }

View file

@ -218,7 +218,7 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols,
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(WARP_SIZE)]] { [[intel::reqd_sub_group_size(WARP_SIZE)]] {
norm_f32(x, dst, ncols, eps, item_ct1, norm_f32(x, dst, ncols, eps, item_ct1,
s_sum_acc_ct1.get_pointer(), work_group_size); get_pointer(s_sum_acc_ct1), work_group_size);
}); });
}); });
} }
@ -265,7 +265,7 @@ static void group_norm_f32_sycl(const float* x, float* dst,
[[intel::reqd_sub_group_size(WARP_SIZE)]] { [[intel::reqd_sub_group_size(WARP_SIZE)]] {
group_norm_f32(x, dst, group_size, ne_elements, group_norm_f32(x, dst, group_size, ne_elements,
eps_ct4, item_ct1, eps_ct4, item_ct1,
s_sum_acc_ct1.get_pointer(), work_group_size); get_pointer(s_sum_acc_ct1), work_group_size);
}); });
}); });
} }
@ -306,7 +306,7 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
[=](sycl::nd_item<3> item_ct1) [=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(WARP_SIZE)]] { [[intel::reqd_sub_group_size(WARP_SIZE)]] {
rms_norm_f32(x, dst, ncols, eps, item_ct1, rms_norm_f32(x, dst, ncols, eps, item_ct1,
s_sum_acc_ct1.get_pointer(), work_group_size); get_pointer(s_sum_acc_ct1), work_group_size);
}); });
}); });
} }

View file

@ -55,7 +55,7 @@ static void rope_norm(
const int i = row*ne0 + i0; const int i = row*ne0 + i0;
const int i2 = row/p_delta_rows; const int i2 = row/p_delta_rows;
const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f); const float theta_base = pos[i2] * sycl::pow(theta_scale, i0 / 2.0f);
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
@ -98,7 +98,7 @@ static void rope_neox(
const int i = row*ne0 + i0/2; const int i = row*ne0 + i0/2;
const int i2 = row/p_delta_rows; const int i2 = row/p_delta_rows;
const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f); const float theta_base = pos[i2] * sycl::pow(theta_scale, i0 / 2.0f);
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;

View file

@ -136,7 +136,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, float *
soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par, soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
nrows_y, scale, max_bias, m0, nrows_y, scale, max_bias, m0,
m1, n_head_log2, item_ct1, m1, n_head_log2, item_ct1,
local_buf_acc.get_pointer()); get_pointer(local_buf_acc));
}); });
}); });
} }

View file

@ -4,7 +4,7 @@
#include "ggml-impl.h" #include "ggml-impl.h"
#include "ggml-quants.h" #include "ggml-quants.h"
#include "ggml.h" #include "ggml.h"
#include "ggml-aarch64.h"
#if defined(_MSC_VER) || defined(__MINGW32__) #if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW #include <malloc.h> // using malloc.h with MSC/MINGW
@ -37,12 +37,12 @@
#include <unistd.h> #include <unistd.h>
#endif #endif
#ifdef __ARM_FEATURE_MATMUL_INT8 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
#undef GGML_USE_LLAMAFILE #undef GGML_USE_LLAMAFILE
#endif #endif
#ifdef GGML_USE_LLAMAFILE #ifdef GGML_USE_LLAMAFILE
#include "sgemm.h" #include <llamafile/sgemm.h>
#endif #endif
#if defined(_MSC_VER) #if defined(_MSC_VER)
@ -700,6 +700,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
#else #else
.nrows = 1, .nrows = 1,
#endif #endif
.from_float_to_mat = quantize_mat_q8_0,
}, },
[GGML_TYPE_Q8_1] = { [GGML_TYPE_Q8_1] = {
.type_name = "q8_1", .type_name = "q8_1",
@ -897,6 +898,54 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
.vec_dot_type = GGML_TYPE_BF16, .vec_dot_type = GGML_TYPE_BF16,
.nrows = 1, .nrows = 1,
},
[GGML_TYPE_Q4_0_4_4] = {
.type_name = "q4_0_4x4",
.blck_size = QK4_0,
.type_size = sizeof(block_q4_0),
.is_quantized = true,
.to_float = NULL,
.from_float = NULL,
.from_float_reference = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
.ncols = 4,
.interleave_blcksize = 4,
.gemv = ggml_gemv_q4_0_4x4_q8_0,
.gemm = ggml_gemm_q4_0_4x4_q8_0,
},
[GGML_TYPE_Q4_0_4_8] = {
.type_name = "q4_0_4x8",
.blck_size = QK4_0,
.type_size = sizeof(block_q4_0),
.is_quantized = true,
.to_float = NULL,
.from_float = NULL,
.from_float_reference = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
.ncols = 4,
.interleave_blcksize = 8,
.gemv = ggml_gemv_q4_0_4x8_q8_0,
.gemm = ggml_gemm_q4_0_4x8_q8_0,
},
[GGML_TYPE_Q4_0_8_8] = {
.type_name = "q4_0_8x8",
.blck_size = QK4_0,
.type_size = sizeof(block_q4_0),
.is_quantized = true,
.to_float = NULL,
.from_float = NULL,
.from_float_reference = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
.ncols = 8,
.interleave_blcksize = 8,
.gemv = ggml_gemv_q4_0_8x8_q8_0,
.gemm = ggml_gemm_q4_0_8x8_q8_0,
} }
}; };
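The three new type_traits entries above mostly carry metadata plus two optional function pointers (gemv, gemm); the matmul code checks those pointers before taking the repacked fast path. The sketch below is only an illustration of that null-checked dispatch pattern with made-up names, not the real ggml API.

    #include <cstdio>

    typedef void (*gemv_fn)(int n);   // stand-in for ggml_gemv_t

    static void q4_0_4x4_gemv(int n) { std::printf("repacked gemv over %d columns\n", n); }

    struct traits { const char * name; int ncols; gemv_fn gemv; };

    static const traits table[] = {
        { "q4_0",     1, nullptr },        // ordinary type: no repacked kernel available
        { "q4_0_4x4", 4, q4_0_4x4_gemv },  // repacked type: dedicated gemv available
    };

    int main() {
        for (const traits & t : table) {
            if (t.gemv) t.gemv(4096);                                      // fast path
            else        std::printf("%s: generic vec_dot path\n", t.name); // fallback
        }
        return 0;
    }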
@ -3208,6 +3257,9 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break; case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break; case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break; case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break;
case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break;
case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break;
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
} }
@ -9467,6 +9519,9 @@ static void ggml_compute_forward_add(
case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
{ {
ggml_compute_forward_add_q_f32(params, dst); ggml_compute_forward_add_q_f32(params, dst);
} break; } break;
@ -9842,6 +9897,9 @@ static void ggml_compute_forward_add1(
case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
{ {
ggml_compute_forward_add1_q_f32(params, dst); ggml_compute_forward_add1_q_f32(params, dst);
} break; } break;
@ -9967,6 +10025,9 @@ static void ggml_compute_forward_acc(
case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
default: default:
{ {
GGML_ASSERT(false); GGML_ASSERT(false);
@ -12180,6 +12241,12 @@ static void ggml_compute_forward_mul_mat(
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
int64_t const vec_dot_num_rows = type_traits[type].nrows; int64_t const vec_dot_num_rows = type_traits[type].nrows;
int64_t const matmul_num_cols = type_traits[type].ncols;
int64_t const interleave_blcksize = type_traits[type].interleave_blcksize;
ggml_from_float_to_mat_t const from_float_to_mat
= type_traits[vec_dot_type].from_float_to_mat;
ggml_gemv_t const gemv = type_traits[type].gemv;
ggml_gemm_t const gemm = type_traits[type].gemm;
GGML_ASSERT(ne0 == ne01); GGML_ASSERT(ne0 == ne01);
GGML_ASSERT(ne1 == ne11); GGML_ASSERT(ne1 == ne11);
@ -12246,7 +12313,16 @@ UseGgmlGemm1:;
for (int64_t i13 = 0; i13 < ne13; ++i13) { for (int64_t i13 = 0; i13 < ne13; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) { for (int64_t i12 = 0; i12 < ne12; ++i12) {
for (int64_t i11 = ith; i11 < ne11; i11 += nth) { int64_t i11_processed = 0;
if ((ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) {
for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
4, ne10, interleave_blcksize);
}
i11_processed = ne11 - ne11 % 4;
}
for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1), (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
ne10); ne10);
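The rewritten conversion loop above quantizes src1 in two phases: full groups of 4 rows go through from_float_to_mat (so the quantized rows come out interleaved for the repacked kernels), and any ne11 % 4 leftover rows use the ordinary per-row from_float_to_vec_dot. A small stand-alone sketch of how those two loops split the work, with made-up ne11 and thread count:

    #include <cstdio>

    int main() {
        const int ne11 = 10, nth = 2;                  // hypothetical rows / threads
        const int i11_processed = ne11 - ne11 % 4;     // rows covered by 4-row batches
        for (int ith = 0; ith < nth; ++ith) {
            for (int i11 = ith * 4; i11 < i11_processed; i11 += nth * 4)
                std::printf("thread %d: rows %d..%d as one interleaved batch\n", ith, i11, i11 + 3);
            for (int i11 = i11_processed + ith; i11 < ne11; i11 += nth)
                std::printf("thread %d: row %d on its own\n", ith, i11);
        }
        // prints: thread 0 -> rows 0..3 and row 8; thread 1 -> rows 4..7 and row 9
        return 0;
    }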
@ -12327,6 +12403,28 @@ UseGgmlGemm2:;
const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1; const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
if ((ggml_n_dims(src0) == 2) && gemv) {
const void * src1_wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
const size_t src1_col_stride = ggml_is_contiguous(src1) || src1->type != vec_dot_type ? ggml_row_size(vec_dot_type, ne10) : nb11;
int64_t src0_start = (ith * ne01) / nth;
int64_t src0_end = ((ith + 1) * ne01) / nth;
src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start;
src0_end = (src0_end % matmul_num_cols) ? src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end;
if (src0_start >= src0_end) return;
// If there are more than three rows in src1, use gemm; otherwise, use gemv.
if (gemm && (ne11 > 3)) {
gemm(ne00, (float *)((char *) dst->data) + src0_start, ne01, (const char *) src0->data + src0_start * nb01,
(const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
}
for (int iter = gemm ? ne11 - ne11 % 4 : 0; iter < ne11; iter++) {
gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01,
(const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1,
src0_end - src0_start);
}
return;
}
// The first chunk comes from our thread_id, the rest will get auto-assigned. // The first chunk comes from our thread_id, the rest will get auto-assigned.
int current_chunk = ith; int current_chunk = ith;
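In the new fast path above, each thread's slice of src0 rows is rounded up to a multiple of matmul_num_cols so that an interleaved group of rows is never split between threads (a thread whose rounded start reaches its rounded end simply returns). A stand-alone sketch of that rounding with made-up sizes:

    #include <cstdio>

    static long round_up(long x, long m) { return (x % m) ? x + m - (x % m) : x; }

    int main() {
        const long ne01 = 96, nth = 5, cols = 8;   // hypothetical rows, threads, interleave width
        for (long ith = 0; ith < nth; ++ith) {
            const long start = round_up((ith * ne01) / nth, cols);
            const long end   = round_up(((ith + 1) * ne01) / nth, cols);
            if (start >= end) { std::printf("thread %ld: nothing to do\n", ith); continue; }
            std::printf("thread %ld: rows [%ld, %ld)\n", ith, start, end);
        }
        // prints [0,24) [24,40) [40,64) [64,80) [80,96): uneven, but always 8-row aligned
        return 0;
    }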
@ -12372,6 +12470,8 @@ static void ggml_compute_forward_mul_mat_id(
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
int64_t const matmul_num_cols = type_traits[type].ncols;
ggml_gemv_t const gemv = type_traits[type].gemv;
// we don't support permuted src0 or src1 // we don't support permuted src0 or src1
GGML_ASSERT(nb00 == ggml_type_size(type)); GGML_ASSERT(nb00 == ggml_type_size(type));
@ -12457,6 +12557,34 @@ static void ggml_compute_forward_mul_mat_id(
const int64_t nr0 = ne01; // src0 rows const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = cne1; // src1 rows const int64_t nr1 = cne1; // src1 rows
if (((ggml_n_dims(src0) - 1) == 2) && gemv) {
int64_t src0_cur_start = (ith * ne01) / nth;
int64_t src0_cur_end = ((ith + 1) * ne01) / nth;
src0_cur_start = (src0_cur_start % matmul_num_cols) ? src0_cur_start + matmul_num_cols - (src0_cur_start % matmul_num_cols): src0_cur_start;
src0_cur_end = (src0_cur_end % matmul_num_cols) ? src0_cur_end + matmul_num_cols - (src0_cur_end % matmul_num_cols): src0_cur_end;
if (src0_cur_start >= src0_cur_end) return;
for (int ir1 = 0; ir1 < nr1; ir1++) {
struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
const int id = row_mapping.i1; // selected expert index
const int64_t i11 = id % ne11;
const int64_t i12 = row_mapping.i2; // row index in src1
const int64_t i1 = id; // selected expert index
const int64_t i2 = i12; // row
const char * src1_col = (const char *) wdata +
(src1_cont || src1->type != vec_dot_type
? (i11 + i12 * ne11) * row_size
: (i11 * nb11 + i12 * nb12));
gemv(ne00, (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
(const char *) src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start);
}
continue;
}
// distribute the thread work across the inner or outer loop based on which one is larger // distribute the thread work across the inner or outer loop based on which one is larger
const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
@ -12758,6 +12886,9 @@ static void ggml_compute_forward_out_prod(
case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
{ {
ggml_compute_forward_out_prod_q_f32(params, dst); ggml_compute_forward_out_prod_q_f32(params, dst);
} break; } break;
@ -12943,6 +13074,9 @@ static void ggml_compute_forward_set(
case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
default: default:
{ {
GGML_ASSERT(false); GGML_ASSERT(false);
@ -13202,6 +13336,9 @@ static void ggml_compute_forward_get_rows(
case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
{ {
ggml_compute_forward_get_rows_q(params, dst); ggml_compute_forward_get_rows_q(params, dst);
} break; } break;
@ -13788,6 +13925,9 @@ static void ggml_compute_forward_clamp(
case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q8_K: case GGML_TYPE_Q8_K:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
case GGML_TYPE_I8: case GGML_TYPE_I8:
case GGML_TYPE_I16: case GGML_TYPE_I16:
case GGML_TYPE_I32: case GGML_TYPE_I32:
@ -20516,6 +20656,9 @@ size_t ggml_quantize_chunk(
case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_F16: case GGML_TYPE_F16:
{ {
size_t elemsize = sizeof(ggml_fp16_t); size_t elemsize = sizeof(ggml_fp16_t);
@ -21862,8 +22005,6 @@ int ggml_cpu_has_neon(void) {
int ggml_cpu_has_sve(void) { int ggml_cpu_has_sve(void) {
#if defined(__ARM_FEATURE_SVE) #if defined(__ARM_FEATURE_SVE)
// TODO: Currently, SVE 256 bit is only supported.
GGML_ASSERT(svcntb() == QK8_0);
return 1; return 1;
#else #else
return 0; return 0;

View file

@ -79,5 +79,4 @@ python -m twine upload dist/*
``` ```
## TODO ## TODO
- [ ] Add tests
- [ ] Include conversion scripts as command line entry points in this package. - [ ] Include conversion scripts as command line entry points in this package.

View file

@ -120,7 +120,6 @@ class Keys:
MIDDLE_ID = "tokenizer.ggml.middle_token_id" MIDDLE_ID = "tokenizer.ggml.middle_token_id"
EOT_ID = "tokenizer.ggml.eot_token_id" EOT_ID = "tokenizer.ggml.eot_token_id"
# #
# recommended mapping of model tensor names for storage in gguf # recommended mapping of model tensor names for storage in gguf
# #
@ -163,6 +162,7 @@ class MODEL_ARCH(IntEnum):
OPENELM = auto() OPENELM = auto()
ARCTIC = auto() ARCTIC = auto()
DEEPSEEK2 = auto() DEEPSEEK2 = auto()
CHATGLM = auto()
BITNET = auto() BITNET = auto()
T5 = auto() T5 = auto()
JAIS = auto() JAIS = auto()
@ -289,6 +289,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.OPENELM: "openelm", MODEL_ARCH.OPENELM: "openelm",
MODEL_ARCH.ARCTIC: "arctic", MODEL_ARCH.ARCTIC: "arctic",
MODEL_ARCH.DEEPSEEK2: "deepseek2", MODEL_ARCH.DEEPSEEK2: "deepseek2",
MODEL_ARCH.CHATGLM: "chatglm",
MODEL_ARCH.BITNET: "bitnet", MODEL_ARCH.BITNET: "bitnet",
MODEL_ARCH.T5: "t5", MODEL_ARCH.T5: "t5",
MODEL_ARCH.JAIS: "jais", MODEL_ARCH.JAIS: "jais",
@ -924,6 +925,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN_SHEXP, MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP, MODEL_TENSOR.FFN_UP_SHEXP,
], ],
MODEL_ARCH.CHATGLM : [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.BITNET: [ MODEL_ARCH.BITNET: [
MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K, MODEL_TENSOR.ATTN_K,
@ -1020,6 +1033,9 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD, MODEL_TENSOR.ATTN_ROT_EMBD,
], ],
MODEL_ARCH.CHATGLM: [
MODEL_TENSOR.ROPE_FREQS,
],
} }
# #

View file

@ -67,7 +67,7 @@ class ReaderTensor(NamedTuple):
class GGUFReader: class GGUFReader:
# I - same as host, S - swapped # I - same as host, S - swapped
byte_order: Literal['I'] | Literal['S'] = 'I' byte_order: Literal['I', 'S'] = 'I'
alignment: int = GGUF_DEFAULT_ALIGNMENT alignment: int = GGUF_DEFAULT_ALIGNMENT
data_offset: int data_offset: int
@ -86,7 +86,7 @@ class GGUFReader:
GGUFValueType.BOOL: np.bool_, GGUFValueType.BOOL: np.bool_,
} }
def __init__(self, path: os.PathLike[str] | str, mode: Literal['r'] | Literal['r+'] | Literal['c'] = 'r'): def __init__(self, path: os.PathLike[str] | str, mode: Literal['r', 'r+', 'c'] = 'r'):
self.data = np.memmap(path, mode = mode) self.data = np.memmap(path, mode = mode)
offs = 0 offs = 0
@ -140,7 +140,7 @@ class GGUFReader:
return self.tensors[idx] return self.tensors[idx]
def _get( def _get(
self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I'] | Literal['S'] | Literal['<'] = None, self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I', 'S', '<'] = None,
) -> npt.NDArray[Any]: ) -> npt.NDArray[Any]:
count = int(count) count = int(count)
itemsize = int(np.empty([], dtype = dtype).itemsize) itemsize = int(np.empty([], dtype = dtype).itemsize)

View file

@ -6,7 +6,6 @@ from typing import Any, Callable
from collections import deque from collections import deque
import numpy as np import numpy as np
from numpy._typing import _Shape
from numpy.typing import DTypeLike from numpy.typing import DTypeLike
@ -16,16 +15,16 @@ logger = logging.getLogger(__name__)
class LazyMeta(ABCMeta): class LazyMeta(ABCMeta):
def __new__(cls, name: str, bases: tuple[type, ...], namespace: dict[str, Any], **kwargs): def __new__(cls, name: str, bases: tuple[type, ...], namespace: dict[str, Any], **kwargs):
def __getattr__(self, __name: str) -> Any: def __getattr__(self, name: str) -> Any:
meta_attr = getattr(self._meta, __name) meta_attr = getattr(self._meta, name)
if callable(meta_attr): if callable(meta_attr):
return type(self)._wrap_fn( return type(self)._wrap_fn(
(lambda s, *args, **kwargs: getattr(s, __name)(*args, **kwargs)), (lambda s, *args, **kwargs: getattr(s, name)(*args, **kwargs)),
use_self=self, use_self=self,
) )
elif isinstance(meta_attr, self._tensor_type): elif isinstance(meta_attr, self._tensor_type):
# e.g. self.T with torch.Tensor should still be wrapped # e.g. self.T with torch.Tensor should still be wrapped
return type(self)._wrap_fn(lambda s: getattr(s, __name))(self) return type(self)._wrap_fn(lambda s: getattr(s, name))(self)
else: else:
# no need to wrap non-tensor properties, # no need to wrap non-tensor properties,
# and they likely don't depend on the actual contents of the tensor # and they likely don't depend on the actual contents of the tensor
@ -141,19 +140,21 @@ class LazyBase(ABC, metaclass=LazyMeta):
res = cls.meta_with_dtype_and_shape(meta_noop, res.shape) res = cls.meta_with_dtype_and_shape(meta_noop, res.shape)
if isinstance(res, cls._tensor_type): if isinstance(res, cls._tensor_type):
def collect_replace(t: LazyBase): class CollectSharedLazy:
if collect_replace.shared_lazy is None:
collect_replace.shared_lazy = t._lazy
else:
collect_replace.shared_lazy.extend(t._lazy)
t._lazy = collect_replace.shared_lazy
# emulating a static variable # emulating a static variable
collect_replace.shared_lazy = None shared_lazy: None | deque[LazyBase] = None
LazyBase._recurse_apply(args, collect_replace) @staticmethod
def collect_replace(t: LazyBase):
if CollectSharedLazy.shared_lazy is None:
CollectSharedLazy.shared_lazy = t._lazy
else:
CollectSharedLazy.shared_lazy.extend(t._lazy)
t._lazy = CollectSharedLazy.shared_lazy
shared_lazy = collect_replace.shared_lazy LazyBase._recurse_apply(args, CollectSharedLazy.collect_replace)
shared_lazy = CollectSharedLazy.shared_lazy
return cls(meta=cls.eager_to_meta(res), lazy=shared_lazy, args=args, func=lambda a: fn(*a, **kwargs)) return cls(meta=cls.eager_to_meta(res), lazy=shared_lazy, args=args, func=lambda a: fn(*a, **kwargs))
else: else:
@ -184,6 +185,7 @@ class LazyBase(ABC, metaclass=LazyMeta):
lt._args = cls._recurse_apply(lt._args, already_eager_to_eager) lt._args = cls._recurse_apply(lt._args, already_eager_to_eager)
lt._data = lt._func(lt._args) lt._data = lt._func(lt._args)
# sanity check # sanity check
assert lt._data is not None
assert lt._data.dtype == lt._meta.dtype assert lt._data.dtype == lt._meta.dtype
assert lt._data.shape == lt._meta.shape assert lt._data.shape == lt._meta.shape
@ -216,7 +218,7 @@ class LazyNumpyTensor(LazyBase):
_tensor_type = np.ndarray _tensor_type = np.ndarray
@classmethod @classmethod
def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: _Shape) -> np.ndarray[Any, Any]: def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
# The initial idea was to use np.nan as the fill value, # The initial idea was to use np.nan as the fill value,
# but non-float types like np.int16 can't use that. # but non-float types like np.int16 can't use that.
# So zero it is. # So zero it is.

View file

@ -24,6 +24,7 @@ class TensorNameMap:
"backbone.embedding", # mamba "backbone.embedding", # mamba
"backbone.embeddings", # mamba-hf "backbone.embeddings", # mamba-hf
"transformer.in_out_embed", # Grok "transformer.in_out_embed", # Grok
"embedding.word_embeddings", # chatglm
"transformer.token_embeddings", # openelm "transformer.token_embeddings", # openelm
"shared", # t5 "shared", # t5
), ),
@ -55,6 +56,7 @@ class TensorNameMap:
"output", # llama-pth bloom internlm2 "output", # llama-pth bloom internlm2
"word_embeddings_for_head", # persimmon "word_embeddings_for_head", # persimmon
"lm_head.linear", # phi2 "lm_head.linear", # phi2
"output_layer", # chatglm
), ),
# Output norm # Output norm
@ -71,12 +73,14 @@ class TensorNameMap:
"model.norm_f", # mamba-qbert "model.norm_f", # mamba-qbert
"backbone.norm_f", # mamba "backbone.norm_f", # mamba
"transformer.rms_norm", # Grok "transformer.rms_norm", # Grok
"encoder.final_layernorm", # chatglm
"transformer.norm", # openelm "transformer.norm", # openelm
), ),
# Rope frequencies # Rope frequencies
MODEL_TENSOR.ROPE_FREQS: ( MODEL_TENSOR.ROPE_FREQS: (
"rope.freqs", # llama-pth "rope.freqs", # llama-pth
"rotary_pos_emb.inv_freq", # chatglm
), ),
} }
@ -101,6 +105,7 @@ class TensorNameMap:
"backbone.layers.{bid}.norm", # mamba "backbone.layers.{bid}.norm", # mamba
"transformer.decoder_layer.{bid}.rms_norm", # Grok "transformer.decoder_layer.{bid}.rms_norm", # Grok
"transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
"encoder.layers.{bid}.input_layernorm", # chatglm
"transformer.layers.{bid}.attn_norm", # openelm "transformer.layers.{bid}.attn_norm", # openelm
), ),
@ -124,6 +129,7 @@ class TensorNameMap:
"transformer.h.{bid}.mixer.Wqkv", # phi2 "transformer.h.{bid}.mixer.Wqkv", # phi2
"encoder.layers.{bid}.attn.Wqkv", # nomic-bert "encoder.layers.{bid}.attn.Wqkv", # nomic-bert
"model.layers.{bid}.self_attn.qkv_proj", # phi3 "model.layers.{bid}.self_attn.qkv_proj", # phi3
"encoder.layers.{bid}.self_attention.query_key_value", # chatglm
"transformer.layers.{bid}.attn.qkv_proj", # openelm "transformer.layers.{bid}.attn.qkv_proj", # openelm
), ),
@ -135,7 +141,7 @@ class TensorNameMap:
"transformer.h.{bid}.attn.q_proj", # gpt-j "transformer.h.{bid}.attn.q_proj", # gpt-j
"model.layers.layers.{bid}.self_attn.q_proj", # plamo "model.layers.layers.{bid}.self_attn.q_proj", # plamo
"model.layers.{bid}.attention.wq", # internlm2 "model.layers.{bid}.attention.wq", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.query" # Grok "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok
), ),
# Attention key # Attention key
@ -147,7 +153,7 @@ class TensorNameMap:
"transformer.h.{bid}.attn.k", # refact "transformer.h.{bid}.attn.k", # refact
"model.layers.layers.{bid}.self_attn.k_proj", # plamo "model.layers.layers.{bid}.self_attn.k_proj", # plamo
"model.layers.{bid}.attention.wk", # internlm2 "model.layers.{bid}.attention.wk", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.key" # Grok "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok
), ),
# Attention value # Attention value
@ -182,6 +188,7 @@ class TensorNameMap:
"encoder.layers.{bid}.attn.out_proj", # nomic-bert "encoder.layers.{bid}.attn.out_proj", # nomic-bert
"transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok
"transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx
"encoder.layers.{bid}.self_attention.dense", # chatglm
"transformer.layers.{bid}.attn.out_proj", # openelm "transformer.layers.{bid}.attn.out_proj", # openelm
), ),
@ -218,6 +225,7 @@ class TensorNameMap:
"h.{bid}.ln_2", # gpt2 "h.{bid}.ln_2", # gpt2
"model.layers.{bid}.ffn_norm", # internlm2 "model.layers.{bid}.ffn_norm", # internlm2
"transformer.decoder_layer.{bid}.rms_norm_2", # Grok "transformer.decoder_layer.{bid}.rms_norm_2", # Grok
"encoder.layers.{bid}.post_attention_layernorm", # chatglm
"transformer.layers.{bid}.ffn_norm", # openelm "transformer.layers.{bid}.ffn_norm", # openelm
), ),
@ -268,6 +276,7 @@ class TensorNameMap:
"model.layers.{bid}.mlp.c_fc", # starcoder2 "model.layers.{bid}.mlp.c_fc", # starcoder2
"encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
"model.layers.{bid}.residual_mlp.w3", # arctic "model.layers.{bid}.residual_mlp.w3", # arctic
"encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm
), ),
MODEL_TENSOR.FFN_UP_EXP: ( MODEL_TENSOR.FFN_UP_EXP: (
@ -337,6 +346,7 @@ class TensorNameMap:
"transformer.layers.{bid}.ffn.proj_2", # openelm "transformer.layers.{bid}.ffn.proj_2", # openelm
"model.layers.{bid}.residual_mlp.w2", # arctic "model.layers.{bid}.residual_mlp.w2", # arctic
"encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2 "encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2
"encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm
), ),
MODEL_TENSOR.FFN_DOWN_EXP: ( MODEL_TENSOR.FFN_DOWN_EXP: (

View file

@ -1,6 +1,6 @@
[tool.poetry] [tool.poetry]
name = "gguf" name = "gguf"
version = "0.9.0" version = "0.9.1"
description = "Read and write ML models in GGUF for GGML" description = "Read and write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"] authors = ["GGML <ggml@ggml.ai>"]
packages = [ packages = [

View file

@ -1,3 +1,5 @@
# pyright: reportUnusedImport=false
from .gguf_convert_endian import main as gguf_convert_endian_entrypoint from .gguf_convert_endian import main as gguf_convert_endian_entrypoint
from .gguf_dump import main as gguf_dump_entrypoint from .gguf_dump import main as gguf_dump_entrypoint
from .gguf_set_metadata import main as gguf_set_metadata_entrypoint from .gguf_set_metadata import main as gguf_set_metadata_entrypoint

91
gguf-py/scripts/gguf_hash.py Executable file
View file

@ -0,0 +1,91 @@
#!/usr/bin/env python3
from __future__ import annotations
import uuid
import hashlib
import logging
import argparse
import os
import sys
from pathlib import Path
from tqdm import tqdm
# Necessary to load the local gguf package
if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
sys.path.insert(0, str(Path(__file__).parent.parent))
from gguf import GGUFReader # noqa: E402
logger = logging.getLogger("gguf-hash")
# UUID_NAMESPACE_LLAMA_CPP = uuid.uuid5(uuid.NAMESPACE_URL, 'en.wikipedia.org/wiki/Llama.cpp')
UUID_NAMESPACE_LLAMA_CPP = uuid.UUID('ef001206-dadc-5f6d-a15f-3359e577d4e5')
# For more information about what field.parts and field.data represent,
# please see the comments in the modify_gguf.py example.
def gguf_hash(reader: GGUFReader, filename: str, disable_progress_bar) -> None:
sha1 = hashlib.sha1()
uuidv5_sha1 = hashlib.sha1()
uuidv5_sha1.update(UUID_NAMESPACE_LLAMA_CPP.bytes)
# Total Weight Calculation For Progress Bar
total_weights = 0
for n, tensor in enumerate(reader.tensors, 1):
# We don't need these
if tensor.name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
continue
# Calculate Tensor Volume
sum_weights_in_tensor = 1
for dim in tensor.shape:
sum_weights_in_tensor *= dim
total_weights += sum_weights_in_tensor
# Hash Progress Bar
bar = tqdm(desc="Hashing", total=total_weights, unit="weights", unit_scale=True, disable=disable_progress_bar)
# Hashing Process
for n, tensor in enumerate(reader.tensors, 1):
# We don't need these
if tensor.name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")):
continue
# Progressbar
sum_weights_in_tensor = 1
for dim in tensor.shape:
sum_weights_in_tensor *= dim
bar.update(sum_weights_in_tensor)
sha1_layer = hashlib.sha1()
sha1_layer.update(tensor.data.data)
sha1.update(tensor.data.data)
uuidv5_sha1.update(tensor.data.data)
print("sha1 {0} {1}:{2}".format(sha1_layer.hexdigest(), filename, tensor.name)) # noqa: NP100
# Flush Hash Progress Bar
bar.close()
# Display Hash Output
print("sha1 {0} {1}".format(sha1.hexdigest(), filename)) # noqa: NP100
print("UUIDv5 {0} {1}".format(uuid.UUID(bytes=uuidv5_sha1.digest()[:16], version=5), filename)) # noqa: NP100
def main() -> None:
parser = argparse.ArgumentParser(description="Dump GGUF file metadata")
parser.add_argument("model", type=str, help="GGUF format model filename")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
parser.add_argument("--progressbar", action="store_true", help="enable progressbar")
args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
reader = GGUFReader(args.model, 'r')
gguf_hash(reader, args.model, not args.progressbar)
if __name__ == '__main__':
main()

View file

@ -1,4 +1,6 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from __future__ import annotations
import logging import logging
import argparse import argparse
import os import os

View file

@ -1,4 +1,4 @@
import gguf # noqa: F401 import gguf # noqa: F401 # pyright: ignore[reportUnusedImport]
# TODO: add tests # TODO: add tests

View file

@ -88,8 +88,10 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_DBRX = 13, LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14, LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
LLAMA_VOCAB_PRE_TYPE_PORO = 15, LLAMA_VOCAB_PRE_TYPE_PORO = 15,
LLAMA_VOCAB_PRE_TYPE_VIKING = 16, LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
LLAMA_VOCAB_PRE_TYPE_JAIS = 17, LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
}; };
// note: these values should be synchronized with ggml_rope // note: these values should be synchronized with ggml_rope
@ -160,6 +162,9 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
}; };

View file

@ -1,3 +1,21 @@
{ {
"extraPaths": ["gguf-py"], "extraPaths": ["gguf-py"],
"pythonVersion": "3.9",
"pythonPlatform": "All",
"reportUnusedImport": "warning",
"reportDuplicateImport": "error",
"reportDeprecated": "warning",
"reportUnnecessaryTypeIgnoreComment": "warning",
"executionEnvironments": [
{
// TODO: make this version override work correctly
"root": "gguf-py",
"pythonVersion": "3.8",
},
{
// uses match expressions in steps.py
"root": "examples/server/tests",
"pythonVersion": "3.10",
},
],
} }

View file

@ -0,0 +1,12 @@
-r ../examples/llava/requirements.txt
-r ../examples/server/bench/requirements.txt
-r ../examples/server/tests/requirements.txt
-r ./requirements-compare-llama-bench.txt
-r ./requirements-pydantic.txt
-r ./requirements-test-tokenizer-random.txt
-r ./requirements-convert_hf_to_gguf.txt
-r ./requirements-convert_hf_to_gguf_update.txt
-r ./requirements-convert_legacy_llama.txt
-r ./requirements-convert_llama_ggml_to_gguf.txt

View file

@ -0,0 +1,2 @@
tabulate~=0.9.0
GitPython~=3.1.43

View file

@ -0,0 +1,2 @@
docstring_parser~=0.15
pydantic~=2.6.3

View file

@ -0,0 +1 @@
cffi~=1.16.0

View file

@ -62,6 +62,12 @@
#include <io.h> #include <io.h>
#endif #endif
#if __cplusplus >= 202000L
#define LU8(x) (const char*)(u8##x)
#else
#define LU8(x) u8##x
#endif
#include <algorithm> #include <algorithm>
#include <array> #include <array>
#include <cassert> #include <cassert>
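The LU8 macro added above exists because C++20 changed the type of u8"..." literals from const char[] to const char8_t[], which no longer converts implicitly to const char *; the cast restores the old behaviour for char*-based APIs. A small independent sketch (it tests against the standard 202002L value rather than the 202000L threshold used in the file):

    #include <cstdio>

    #if __cplusplus >= 202002L
    #define LU8(x) (const char *)(u8##x)   // C++20: u8"" is char8_t[], cast it back
    #else
    #define LU8(x) u8##x                   // pre-C++20: u8"" is already const char[]
    #endif

    int main() {
        std::printf("%s\n", LU8("héllo"));  // same UTF-8 bytes under either standard
        return 0;
    }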
@ -253,6 +259,7 @@ enum llm_arch {
LLM_ARCH_OPENELM, LLM_ARCH_OPENELM,
LLM_ARCH_ARCTIC, LLM_ARCH_ARCTIC,
LLM_ARCH_DEEPSEEK2, LLM_ARCH_DEEPSEEK2,
LLM_ARCH_CHATGLM,
LLM_ARCH_BITNET, LLM_ARCH_BITNET,
LLM_ARCH_T5, LLM_ARCH_T5,
LLM_ARCH_JAIS, LLM_ARCH_JAIS,
@ -296,6 +303,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_OPENELM, "openelm" }, { LLM_ARCH_OPENELM, "openelm" },
{ LLM_ARCH_ARCTIC, "arctic" }, { LLM_ARCH_ARCTIC, "arctic" },
{ LLM_ARCH_DEEPSEEK2, "deepseek2" }, { LLM_ARCH_DEEPSEEK2, "deepseek2" },
{ LLM_ARCH_CHATGLM, "chatglm" },
{ LLM_ARCH_BITNET, "bitnet" }, { LLM_ARCH_BITNET, "bitnet" },
{ LLM_ARCH_T5, "t5" }, { LLM_ARCH_T5, "t5" },
{ LLM_ARCH_JAIS, "jais" }, { LLM_ARCH_JAIS, "jais" },
@ -1229,6 +1237,21 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
}, },
}, },
{
LLM_ARCH_CHATGLM,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
},
},
{ {
LLM_ARCH_BITNET, LLM_ARCH_BITNET,
{ {
@ -2115,9 +2138,11 @@ enum e_model {
MODEL_2_8B, MODEL_2_8B,
MODEL_3B, MODEL_3B,
MODEL_4B, MODEL_4B,
MODEL_6B,
MODEL_6_9B, MODEL_6_9B,
MODEL_7B, MODEL_7B,
MODEL_8B, MODEL_8B,
MODEL_9B,
MODEL_11B, MODEL_11B,
MODEL_12B, MODEL_12B,
MODEL_13B, MODEL_13B,
@ -2143,7 +2168,6 @@ enum e_model {
MODEL_16x12B, MODEL_16x12B,
MODEL_10B_128x3_66B, MODEL_10B_128x3_66B,
MODEL_57B_A14B, MODEL_57B_A14B,
MODEL_9B,
MODEL_27B, MODEL_27B,
}; };
@ -3804,6 +3828,9 @@ struct llama_model_loader {
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break; case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
default: default:
{ {
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@ -4510,6 +4537,9 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
default: return "unknown, may not work"; default: return "unknown, may not work";
} }
@ -4543,9 +4573,11 @@ static const char * llama_model_type_name(e_model type) {
case MODEL_2_8B: return "2.8B"; case MODEL_2_8B: return "2.8B";
case MODEL_3B: return "3B"; case MODEL_3B: return "3B";
case MODEL_4B: return "4B"; case MODEL_4B: return "4B";
case MODEL_6B: return "6B";
case MODEL_6_9B: return "6.9B"; case MODEL_6_9B: return "6.9B";
case MODEL_7B: return "7B"; case MODEL_7B: return "7B";
case MODEL_8B: return "8B"; case MODEL_8B: return "8B";
case MODEL_9B: return "9B";
case MODEL_11B: return "11B"; case MODEL_11B: return "11B";
case MODEL_12B: return "12B"; case MODEL_12B: return "12B";
case MODEL_13B: return "13B"; case MODEL_13B: return "13B";
@ -4571,7 +4603,6 @@ static const char * llama_model_type_name(e_model type) {
case MODEL_16x12B: return "16x12B"; case MODEL_16x12B: return "16x12B";
case MODEL_10B_128x3_66B: return "10B+128x3.66B"; case MODEL_10B_128x3_66B: return "10B+128x3.66B";
case MODEL_57B_A14B: return "57B.A14B"; case MODEL_57B_A14B: return "57B.A14B";
case MODEL_9B: return "9B";
case MODEL_27B: return "27B"; case MODEL_27B: return "27B";
default: return "?B"; default: return "?B";
} }
@ -4678,16 +4709,6 @@ static void llm_load_hparams(
// non-transformer models do not have attention heads // non-transformer models do not have attention heads
if (hparams.n_head() > 0) { if (hparams.n_head() > 0) {
// sanity check for n_rot (optional)
hparams.n_rot = hparams.n_embd / hparams.n_head();
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
if (hparams.n_rot != hparams.n_embd / hparams.n_head()) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head()));
}
}
// gpt-neox n_rot = rotary_pct * (n_embd / n_head) // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
// gpt-j n_rot = rotary_dim // gpt-j n_rot = rotary_dim
@ -4696,6 +4717,17 @@ static void llm_load_hparams(
hparams.n_embd_head_v = hparams.n_embd / hparams.n_head(); hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false); ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
// sanity check for n_rot (optional)
hparams.n_rot = hparams.n_embd_head_k;
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
}
} else { } else {
hparams.n_rot = 0; hparams.n_rot = 0;
hparams.n_embd_head_k = 0; hparams.n_embd_head_k = 0;
@ -5176,6 +5208,15 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN; default: model.type = e_model::MODEL_UNKNOWN;
} }
} break; } break;
case LLM_ARCH_CHATGLM:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
case 28: model.type = e_model::MODEL_6B; break;
case 40: model.type = e_model::MODEL_9B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
case LLM_ARCH_BITNET: case LLM_ARCH_BITNET:
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@ -5309,9 +5350,7 @@ static void llm_load_vocab(
if (merges_keyidx == -1) { if (merges_keyidx == -1) {
throw std::runtime_error("cannot find tokenizer merges in model file\n"); throw std::runtime_error("cannot find tokenizer merges in model file\n");
} }
const int n_merges = gguf_get_arr_n(ctx, merges_keyidx); const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
for (int i = 0; i < n_merges; i++) { for (int i = 0; i < n_merges; i++) {
const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i); const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
if (!OldBPETokenizerMode) if (!OldBPETokenizerMode)
@ -5463,6 +5502,10 @@ static void llm_load_vocab(
tokenizer_pre == "poro-chat") { tokenizer_pre == "poro-chat") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO; vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
vocab.tokenizer_clean_spaces = false; vocab.tokenizer_clean_spaces = false;
} else if (
tokenizer_pre == "chatglm-bpe") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
vocab.special_bos_id = -1;
} else if ( } else if (
tokenizer_pre == "viking") { tokenizer_pre == "viking") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING; vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
@ -5596,7 +5639,6 @@ static void llm_load_vocab(
vocab.special_eot_id = 107; vocab.special_eot_id = 107;
} }
} }
try { try {
vocab.linefeed_id = llama_byte_to_token(vocab, '\n'); vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
} catch (const std::exception & e) { } catch (const std::exception & e) {
@ -7513,6 +7555,36 @@ static bool llm_load_tensors(
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
} }
} break; } break;
case LLM_ARCH_CHATGLM:
{
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
// output
{
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
}
for (int i = 0; i < n_layer; ++i) {
ggml_context * ctx_layer = ctx_for_layer(i);
ggml_context * ctx_split = ctx_for_layer_split(i);
auto & layer = model.layers[i];
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + (hparams.n_embd_head_k << 2)});
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + (hparams.n_embd_head_k << 2)});
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
}
} break;
default: default:
throw std::runtime_error("unknown architecture"); throw std::runtime_error("unknown architecture");
} }
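The fused QKV width n_embd + (n_embd_head_k << 2) used above reflects ChatGLM's multi-query attention: K and V each contribute n_head_kv * n_embd_head_k columns. A hedged arithmetic check, assuming the commonly published ChatGLM3-6B shapes (hidden size 4096, head dim 128, 2 KV groups); treat the concrete numbers as assumptions, not values read from this diff:

#include <cassert>

int main() {
    const int n_embd        = 4096;                        // hidden size (assumed)
    const int n_embd_head_k = 128;                         // per-head dim (assumed)
    const int n_head_kv     = 2;                           // multi-query KV groups (assumed)
    const int n_embd_gqa    = n_head_kv * n_embd_head_k;   // 256 columns for K, 256 for V
    assert(n_embd + (n_embd_head_k << 2) == n_embd + 2 * n_embd_gqa); // both sides are 4608
    return 0;
}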
@ -7737,6 +7809,7 @@ enum llm_ffn_op_type {
LLM_FFN_GELU, LLM_FFN_GELU,
LLM_FFN_RELU, LLM_FFN_RELU,
LLM_FFN_RELU_SQR, LLM_FFN_RELU_SQR,
LLM_FFN_SWIGLU,
}; };
enum llm_ffn_gate_type { enum llm_ffn_gate_type {
@ -7941,6 +8014,19 @@ static struct ggml_tensor * llm_build_ffn(
cur = ggml_sqr(ctx, cur); cur = ggml_sqr(ctx, cur);
cb(cur, "ffn_sqr(relu)", il); cb(cur, "ffn_sqr(relu)", il);
} break; } break;
case LLM_FFN_SWIGLU:
{
// Project to 4h. If using SwiGLU, double the output width; see https://arxiv.org/pdf/2002.05202.pdf
int64_t split_point = cur->ne[0] / 2;
struct ggml_tensor * x0 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], 0));
struct ggml_tensor * x1 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
x0 = ggml_silu(ctx, x0);
cb(x0, "ffn_silu", il);
cur = ggml_mul(ctx, x0, x1);
cb(cur, "ffn_mul", il);
} break;
} }
if (type_gate == LLM_FFN_PAR) { if (type_gate == LLM_FFN_PAR) {
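The new LLM_FFN_SWIGLU case splits the doubled up-projection in half, passes the first half through SiLU and multiplies it with the second half. A self-contained numeric sketch of the same operation on a flat buffer (illustrative only, not ggml code):

#include <cmath>
#include <vector>

// up has width 2*d per row; the result has width d
std::vector<float> swiglu_row(const std::vector<float> & up) {
    const size_t d = up.size() / 2;
    std::vector<float> out(d);
    for (size_t i = 0; i < d; ++i) {
        const float x0 = up[i];         // gated half
        const float x1 = up[d + i];     // linear half
        out[i] = (x0 / (1.0f + std::exp(-x0))) * x1;   // silu(x0) * x1
    }
    return out;
}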
@ -8129,7 +8215,7 @@ static struct ggml_tensor * llm_build_kqv(
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
cb(kq, "kq", il); cb(kq, "kq", il);
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) { if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) {
// for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
// ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847 // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
ggml_mul_mat_set_prec(kq, GGML_PREC_F32); ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
@ -10789,19 +10875,12 @@ struct llm_build_context {
// special-case: the up and gate tensors are merged into a single tensor // special-case: the up and gate tensors are merged into a single tensor
// TODO: support in llm_build_ffn // TODO: support in llm_build_ffn
{ {
struct ggml_tensor* up = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur); cur = llm_build_ffn(ctx0, cur,
cb(up, "ffn_up", il); model.layers[il].ffn_up, NULL, NULL,
NULL, NULL, NULL,
auto g = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), 0)); model.layers[il].ffn_down, NULL, NULL,
auto y = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), up->nb[1] / 2)); NULL,
LLM_FFN_SWIGLU, LLM_FFN_SEQ, cb, il);
y = ggml_mul(ctx0, y, ggml_silu(ctx0, g));
cb(y, "ffn_gate", il);
auto down = ggml_mul_mat(ctx0, model.layers[il].ffn_down, y);
cb(down, "ffn_down", il);
cur = down;
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
} }
@ -11571,7 +11650,7 @@ struct llm_build_context {
Qcur = ggml_rope_ext( Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr, ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
@ -11580,7 +11659,7 @@ struct llm_build_context {
Kcur = ggml_rope_ext( Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr, ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
@ -11684,7 +11763,7 @@ struct llm_build_context {
Qcur = ggml_rope_ext( Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr, ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
@ -11693,7 +11772,7 @@ struct llm_build_context {
Kcur = ggml_rope_ext( Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr, ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
@ -13214,6 +13293,8 @@ struct llm_build_context {
LLM_NORM_RMS, cb, -1); LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1); cb(cur, "result_norm", -1);
} else { } else {
GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first");
struct ggml_tensor * embd_enc = llm_build_inp_embd_enc(); struct ggml_tensor * embd_enc = llm_build_inp_embd_enc();
struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true); struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true);
@ -13493,6 +13574,120 @@ struct llm_build_context {
return gf; return gf;
} }
struct ggml_cgraph * build_chatglm() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
cur = llm_build_norm(ctx0, inpL, hparams,
model.layers[il].attn_norm,
NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
// self-attention
{
struct ggml_tensor * Qcur = nullptr;
struct ggml_tensor * Kcur = nullptr;
struct ggml_tensor * Vcur = nullptr;
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
cb(cur, "wqkv", il);
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
cb(cur, "bqkv", il);
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
//printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
Qcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur_rope", il);
Kcur = ggml_rope_ext(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur_rope", il);
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
// skip computing output for unused tokens
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
// Add the input
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// FF
{
cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm,
NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, NULL, NULL,
NULL, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SWIGLU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
}
inpL = ggml_add(ctx0, cur, ffn_inp);
cb(inpL, "l_out", il);
}
cur = llm_build_norm(ctx0, inpL, hparams,
model.output_norm,
NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
cur = ggml_mul_mat(ctx0, model.output, cur);
cb(cur, "result_output", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
}; };
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) { static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@ -13724,6 +13919,10 @@ static struct ggml_cgraph * llama_build_graph(
{ {
result = llm.build_deepseek2(); result = llm.build_deepseek2();
} break; } break;
case LLM_ARCH_CHATGLM:
{
result = llm.build_chatglm();
} break;
case LLM_ARCH_BITNET: case LLM_ARCH_BITNET:
{ {
result = llm.build_bitnet(); result = llm.build_bitnet();
@ -15560,6 +15759,11 @@ struct llm_tokenizer_bpe {
" ?[^(\\s|.,!?…。,、।۔،)]+", " ?[^(\\s|.,!?…。,、।۔،)]+",
}; };
break; break;
case LLAMA_VOCAB_PRE_TYPE_CHATGLM4:
regex_exprs = {
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_VIKING: case LLAMA_VOCAB_PRE_TYPE_VIKING:
regex_exprs = { regex_exprs = {
"\\p{N}", "\\p{N}",
@ -16488,7 +16692,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
if (add_special) { if (add_special) {
tokenizer.append_bos(output); tokenizer.append_bos(output);
} }
for (const auto & fragment : fragment_buffer) { for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@ -17915,6 +18118,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = GGML_TYPE_IQ3_S; new_type = GGML_TYPE_IQ3_S;
} }
else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
new_type == GGML_TYPE_Q4_0_8_8) {
new_type = GGML_TYPE_Q4_0;
}
} }
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
@ -18227,6 +18434,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break; case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
} }
@ -18537,6 +18747,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
f32_data = (float *) f32_conv_buf.data(); f32_data = (float *) f32_conv_buf.data();
} }
int chunk_size_multiplier = 1;
if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0;
else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
}
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type)); LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
fflush(stdout); fflush(stdout);
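The block above silently falls back to plain Q4_0 whenever a tensor's row count is not a multiple of the interleave factor (8 for Q4_0_8_8, 4 for the Q4_0_4_x variants). The same rule restated as a standalone helper, purely for clarity (not llama.cpp code):

#include "ggml.h"

static ggml_type pick_q4_0_variant(ggml_type requested, int64_t nrows) {
    if (requested == GGML_TYPE_Q4_0_8_8 && nrows % 8 != 0) {
        return GGML_TYPE_Q4_0;   // rows must interleave in groups of 8
    }
    if ((requested == GGML_TYPE_Q4_0_4_4 || requested == GGML_TYPE_Q4_0_4_8) && nrows % 4 != 0) {
        return GGML_TYPE_Q4_0;   // rows must interleave in groups of 4
    }
    return requested;
}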
@ -18549,7 +18767,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
const int64_t nrows = tensor->ne[1]; const int64_t nrows = tensor->ne[1];
static const int64_t min_chunk_size = 32 * 512; static const int64_t min_chunk_size = 32 * 512;
const int64_t chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row); const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
chunk_size_multiplier;
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1]; const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size; const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
@ -19482,6 +19701,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_OLMO: case LLM_ARCH_OLMO:
case LLM_ARCH_ARCTIC: case LLM_ARCH_ARCTIC:
case LLM_ARCH_DEEPSEEK2: case LLM_ARCH_DEEPSEEK2:
case LLM_ARCH_CHATGLM:
return LLAMA_ROPE_TYPE_NORM; return LLAMA_ROPE_TYPE_NORM;
// the pairs of head values are offset by n_rot/2 // the pairs of head values are offset by n_rot/2
@ -21225,7 +21445,6 @@ int32_t llama_tokenize(
bool add_special, bool add_special,
bool parse_special) { bool parse_special) {
auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_special, parse_special); auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_special, parse_special);
if (n_tokens_max < (int) res.size()) { if (n_tokens_max < (int) res.size()) {
// LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
return -((int) res.size()); return -((int) res.size());
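The early return of -((int) res.size()) gives callers a way to size their buffer: a negative result is the required token count. A hedged sketch of that two-pass pattern against the public API (the model pointer is assumed to come from elsewhere):

#include "llama.h"
#include <string>
#include <vector>

std::vector<llama_token> tokenize_all(const llama_model * model, const std::string & text) {
    // first pass: probe with an empty buffer; a negative return is the needed size
    int32_t n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                               nullptr, 0, /*add_special=*/true, /*parse_special=*/false);
    std::vector<llama_token> tokens(n < 0 ? (size_t) -n : (size_t) n);
    // second pass: tokenize into the correctly sized buffer
    n = llama_tokenize(model, text.c_str(), (int32_t) text.size(),
                       tokens.data(), (int32_t) tokens.size(), true, false);
    tokens.resize(n > 0 ? n : 0);
    return tokens;
}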
@ -21650,12 +21869,31 @@ static int32_t llama_chat_apply_template_internal(
if (add_ass) { if (add_ass) {
ss << "<|start_header_id|>assistant<|end_header_id|>\n\n"; ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
} }
} else if (tmpl == "minicpm" || tmpl_contains(u8"<用户>")) { } else if (tmpl == "chatglm3" || tmpl_contains("[gMASK]sop")) {
// chatglm3-6b
ss << "[gMASK]" << "sop";
for (auto message : chat) {
std::string role(message->role);
ss << "<|" << role << "|>" << "\n " << message->content;
}
if (add_ass) {
ss << "<|assistant|>";
}
} else if (tmpl == "chatglm4" || tmpl_contains("[gMASK]<sop>")) {
ss << "[gMASK]" << "<sop>";
for (auto message : chat) {
std::string role(message->role);
ss << "<|" << role << "|>" << "\n" << message->content;
}
if (add_ass) {
ss << "<|assistant|>";
}
} else if (tmpl == "minicpm" || tmpl_contains(LU8("<用户>"))) {
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
for (auto message : chat) { for (auto message : chat) {
std::string role(message->role); std::string role(message->role);
if (role == "user") { if (role == "user") {
ss << u8"<用户>"; ss << LU8("<用户>");
ss << trim(message->content); ss << trim(message->content);
ss << "<AI>"; ss << "<AI>";
} else { } else {
@ -21671,7 +21909,7 @@ static int32_t llama_chat_apply_template_internal(
} else if (role == "user") { } else if (role == "user") {
ss << "User: " << message->content << "\n\n"; ss << "User: " << message->content << "\n\n";
} else if (role == "assistant") { } else if (role == "assistant") {
ss << "Assistant: " << message->content << u8"<end▁of▁sentence>"; ss << "Assistant: " << message->content << LU8("<end▁of▁sentence>");
} }
} }
if (add_ass) { if (add_ass) {
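A hedged usage sketch for the new chatglm4 branch through llama_chat_apply_template; passing the template name directly avoids needing a loaded model, and the initial buffer size is only an illustrative guess:

#include "llama.h"
#include <string>
#include <vector>

std::string render_chatglm4(const std::vector<llama_chat_message> & msgs) {
    std::vector<char> buf(4096);
    int32_t n = llama_chat_apply_template(nullptr, "chatglm4", msgs.data(), msgs.size(),
                                          /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    if (n < 0) {
        return "";                      // template not recognized
    }
    if (n > (int32_t) buf.size()) {     // result did not fit: grow and re-render
        buf.resize(n);
        n = llama_chat_apply_template(nullptr, "chatglm4", msgs.data(), msgs.size(),
                                      true, buf.data(), (int32_t) buf.size());
    }
    return std::string(buf.data(), (size_t) n);
}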

View file

@ -1,3 +1,7 @@
#if defined(_MSC_VER)
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#endif
#include "unicode.h" #include "unicode.h"
#include "unicode-data.h" #include "unicode-data.h"