Clean up and fix native build for ARM (+28 squashed commits)

Squashed commit: [d1f6a4154] bundle library [947ab84b7] undo [0f9aba8d8] test [e9ac93873] test [920438202] test [1c6d98804] Revert "quick test" This reverts commit acf8ec8940. [acf8ec894] quick test [6a9937233] undo [5a263a5bd] test [ddfd82bca] test [0b30e45da] test [c3bfece55] messed up [2a4b37fe0] Revert "test" This reverts commit 80a1fcaeaf. [80a1fcaea] test [e2aa7d944] test [264d80200] test [f5b123173] undo [1ffacc484] test [63c0be926] undo [510e0377e] ofast try fix [4ac199b20] try fix sigill [1bc987ba2] try fix illegal instruction [7697252b1] edit [f87087b28] check gcc ver [e9dfe2cef] try using qemu to do the pyinstaller [b411192db] revert [25b5301e5] try using qemu to do the pyinstaller [58038cddc] try using qemu to do the pyinstaller
2025-09-11 09:34:37 +00:00 · 2024-12-07 00:19:13 +08:00 · 2024-12-07 00:19:13 +08:00 · a11bba5893
commit a11bba5893
parent e9d2332dd8
28 changed files with 69 additions and 10898 deletions
--- a/.github/workflows/kcpp-build-release-arm64.yaml
+++ b/.github/workflows/kcpp-build-release-arm64.yaml
@ -3,7 +3,6 @@ name: Koboldcpp Linux ARM64
 on: workflow_dispatch
 env:
  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
-  NOAVX2: 1

 jobs:
  linux-arm:
@ -15,35 +14,68 @@ jobs:
        with:
          ref: ${{ github.head_ref || github.ref_name }}

-      - name: Build Dependencies
-        id: depends1
+      - name: Install Dependencies
+        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install -y python3 python3-pip python3-dev build-essential \
+          sudo apt-get install -y python3-tk python3-pip python3-dev build-essential \
              libffi-dev libssl-dev libbz2-dev libreadline-dev libsqlite3-dev \
-              crossbuild-essential-arm64 qemu qemu-user qemu-user-static \
-              gcc-aarch64-linux-gnu g++-aarch64-linux-gnu
+              crossbuild-essential-arm64 gcc-aarch64-linux-gnu g++-aarch64-linux-gnu

-      - name: Python Dependencies
-        id: depends2
+      - name: Install New GCC for Cross-Compilation
        run: |
-          pip install customtkinter pyinstaller tk
-
-      - name: Build with ARM NEON Support
-        id: build_binary
-        run: |
-          # Enable cross-compilation for ARM
-          export QEMU_LD_PREFIX=/usr/aarch64-linux-gnu
-          export CC=aarch64-linux-gnu-gcc
-          export CXX=aarch64-linux-gnu-g++
+          sudo apt-get install -y software-properties-common
+          sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
+          sudo apt-get update
+          sudo apt-get install -y gcc-12 g++-12 gcc-12-aarch64-linux-gnu g++-12-aarch64-linux-gnu
+          export CC=/usr/bin/aarch64-linux-gnu-gcc-12
+          export CXX=/usr/bin/aarch64-linux-gnu-g++-12
          export AR=aarch64-linux-gnu-ar
          export UNAME_M=aarch64
          export UNAME_S=Linux
-
+          export PATH=/usr/bin:$PATH
          make LLAMA_PORTABLE=1
          chmod +x './create_ver_file.sh'
          . create_ver_file.sh
-          pyinstaller --noconfirm --onefile --collect-all customtkinter --collect-all psutil --add-data './koboldcpp_default.so:.' --add-data './kcpp_adapters:./kcpp_adapters' --add-data './koboldcpp.py:.' --add-data './klite.embd:.' --add-data './kcpp_docs.embd:.' --add-data './kcpp_sdui.embd:.' --add-data './taesd.embd:.' --add-data './taesd_xl.embd:.' --add-data './rwkv_vocab.embd:.' --add-data './rwkv_world_vocab.embd:.' --version-file './version.txt' --clean --console koboldcpp.py -n "koboldcpp-linux-arm64"
+          mkdir -p dist
+          cp './koboldcpp_default.so' dist
+          ls
+
+      - name: Install QEMU
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y qemu-user-static binfmt-support
+
+      - name: Setup QEMU for ARM64
+        run: |
+          docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
+
+      - name: Build ARM64 PyInstaller
+        run: |
+          # Package inside an emulated arm64 container; '\$' defers expansion to the container's shell.
+          docker run --rm \
+            --platform linux/arm64 \
+            -v "${PWD}:/src" \
+            python:3.9-slim \
+            /bin/bash -c "
+              apt-get update && apt-get install -y build-essential gcc-12 g++-12 && \
+              export LD_LIBRARY_PATH=/usr/lib/gcc/aarch64-linux-gnu/12\${LD_LIBRARY_PATH:+:\$LD_LIBRARY_PATH} && \
+              pip install customtkinter pyinstaller tk && \
+              cd /src && \
+              pyinstaller --noconfirm --onefile --collect-all customtkinter --collect-all psutil \
+              --add-data './koboldcpp_default.so:.' \
+              --add-data './kcpp_adapters:./kcpp_adapters' \
+              --add-data './koboldcpp.py:.' \
+              --add-data './klite.embd:.' \
+              --add-data './kcpp_docs.embd:.' \
+              --add-data './kcpp_sdui.embd:.' \
+              --add-data './taesd.embd:.' \
+              --add-data './taesd_xl.embd:.' \
+              --add-data './rwkv_vocab.embd:.' \
+              --add-data './rwkv_world_vocab.embd:.' \
+              --version-file './version.txt' \
+              --clean --console koboldcpp.py -n 'koboldcpp-linux-arm64'
+            "

      - name: Save artifact
        uses: actions/upload-artifact@v3
--- a/.github/workflows/kcpp-build-release-osx.yaml
+++ b/.github/workflows/kcpp-build-release-osx.yaml
@ -3,7 +3,6 @@ name: Koboldcpp Mac
 on: workflow_dispatch
 env:
  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
-  NOAVX2: 1

 jobs:
  osx:
--- a/9
+++ b/9
@ -312,9 +312,12 @@ ifneq ($(filter aarch64%,$(UNAME_M)),)
 	# Apple M1, M2, etc.
 	# Raspberry Pi 3, 4, Zero 2 (64-bit)
 	ifdef LLAMA_PORTABLE
+		CFLAGS +=
+		CXXFLAGS +=
 	else
-		CFLAGS += -mcpu=native
-		CXXFLAGS += -mcpu=native
+		# SVE support is currently broken, so disable it via LLAMA_NOSVE
+		CFLAGS += -mcpu=native -DLLAMA_NOSVE
+		CXXFLAGS += -mcpu=native -DLLAMA_NOSVE
 	endif
 endif

@ -395,7 +398,7 @@ else
 	ifndef LLAMA_HIPBLAS
 	ifndef LLAMA_VULKAN
 	ifndef LLAMA_METAL
-	NOTIFY_MSG = @echo -e '\nYou did a basic CPU build. For faster speeds, install and link a BLAS library. \nSet LLAMA_VULKAN=1 to compile with Vulkan support. This is just a reminder, not an error.'
+	NOTIFY_MSG = @echo -e '\n***\nYou did a basic CPU build. For faster speeds, consider installing and linking a GPU BLAS library. For example, set LLAMA_VULKAN=1 to compile with Vulkan support. Read the KoboldCpp Wiki for more information. This is just a reminder, not an error.\n***\n'
 	endif
 	endif
 	endif
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@ -1,243 +0,0 @@
-#include "arg.h"
-#include "common.h"
-#include "log.h"
-#include "llama.h"
-
-#include <algorithm>
-#include <cstdio>
-#include <string>
-#include <vector>
-
-static void print_usage(int, char ** argv) {
-    LOG("\nexample usage:\n");
-    LOG("\n    %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
-    LOG("\n");
-}
-
-int main(int argc, char ** argv) {
-    common_params params;
-
-    params.prompt = "Hello my name is";
-    params.n_predict = 32;
-
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
-        return 1;
-    }
-
-    common_init();
-
-    // number of parallel batches
-    int n_parallel = params.n_parallel;
-
-    // total length of the sequences including the prompt
-    int n_predict = params.n_predict;
-
-    // init LLM
-
-    llama_backend_init();
-    llama_numa_init(params.numa);
-
-    // initialize the model
-
-    llama_model_params model_params = common_model_params_to_llama(params);
-
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
-
-    if (model == NULL) {
-        LOG_ERR("%s: error: unable to load model\n" , __func__);
-        return 1;
-    }
-
-    // tokenize the prompt
-
-    std::vector<llama_token> tokens_list;
-    tokens_list = common_tokenize(model, params.prompt, true);
-
-    const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;
-
-    // initialize the context
-
-    llama_context_params ctx_params = common_context_params_to_llama(params);
-
-    ctx_params.n_ctx   = n_kv_req;
-    ctx_params.n_batch = std::max(n_predict, n_parallel);
-
-    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
-
-    auto sparams = llama_sampler_chain_default_params();
-
-    llama_sampler * smpl = llama_sampler_chain_init(sparams);
-
-    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sampling.top_k));
-    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sampling.top_p, params.sampling.min_keep));
-    llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sampling.temp));
-    llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sampling.seed));
-
-    if (ctx == NULL) {
-        LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
-        return 1;
-    }
-
-    const int n_ctx = llama_n_ctx(ctx);
-
-    LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
-
-    // make sure the KV cache is big enough to hold all the prompt and generated tokens
-    if (n_kv_req > n_ctx) {
-        LOG_ERR("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__,  n_kv_req);
-        LOG_ERR("%s:        either reduce n_parallel or increase n_ctx\n", __func__);
-        return 1;
-    }
-
-    // print the prompt token-by-token
-
-    LOG("\n");
-
-    for (auto id : tokens_list) {
-        LOG("%s", common_token_to_piece(ctx, id).c_str());
-    }
-
-    // create a llama_batch
-    // we use this object to submit token data for decoding
-    llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
-
-    std::vector<llama_seq_id> seq_ids(n_parallel, 0);
-    for (int32_t i = 0; i < n_parallel; ++i) {
-        seq_ids[i] = i;
-    }
-
-    // evaluate the initial prompt
-    for (size_t i = 0; i < tokens_list.size(); ++i) {
-        common_batch_add(batch, tokens_list[i], i, seq_ids, false);
-    }
-    GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());
-
-    if (llama_model_has_encoder(model)) {
-        if (llama_encode(ctx, batch)) {
-            LOG_ERR("%s : failed to eval\n", __func__);
-            return 1;
-        }
-
-        llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
-        if (decoder_start_token_id == -1) {
-            decoder_start_token_id = llama_token_bos(model);
-        }
-
-        common_batch_clear(batch);
-        common_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
-    }
-
-    // llama_decode will output logits only for the last token of the prompt
-    batch.logits[batch.n_tokens - 1] = true;
-
-    if (llama_decode(ctx, batch) != 0) {
-        LOG_ERR("%s: llama_decode() failed\n", __func__);
-        return 1;
-    }
-
-    //// assign the system KV cache to all parallel sequences
-    //// this way, the parallel sequences will "reuse" the prompt tokens without having to copy them
-    //for (int32_t i = 1; i < n_parallel; ++i) {
-    //    llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
-    //}
-
-    if (n_parallel > 1) {
-        LOG("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
-    }
-
-    // main loop
-
-    // we will store the parallel decoded sequences in this vector
-    std::vector<std::string> streams(n_parallel);
-
-    // remember the batch index of the last token for each parallel sequence
-    // we need this to determine which logits to sample from
-    std::vector<int32_t> i_batch(n_parallel, batch.n_tokens - 1);
-
-    int n_cur    = batch.n_tokens;
-    int n_decode = 0;
-
-    const auto t_main_start = ggml_time_us();
-
-    while (n_cur <= n_predict) {
-        // prepare the next batch
-        common_batch_clear(batch);
-
-        // sample the next token for each parallel sequence / stream
-        for (int32_t i = 0; i < n_parallel; ++i) {
-            if (i_batch[i] < 0) {
-                // the stream has already finished
-                continue;
-            }
-
-            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
-
-            // is it an end of generation? -> mark the stream as finished
-            if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
-                i_batch[i] = -1;
-                LOG("\n");
-                if (n_parallel > 1) {
-                    LOG_INF("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
-                }
-
-                continue;
-            }
-
-            // if there is only one stream, we print immediately to stdout
-            if (n_parallel == 1) {
-                LOG("%s", common_token_to_piece(ctx, new_token_id).c_str());
-            }
-
-            streams[i] += common_token_to_piece(ctx, new_token_id);
-
-            i_batch[i] = batch.n_tokens;
-
-            // push this new token for next evaluation
-            common_batch_add(batch, new_token_id, n_cur, { i }, true);
-
-            n_decode += 1;
-        }
-
-        // all streams are finished
-        if (batch.n_tokens == 0) {
-            break;
-        }
-
-        n_cur += 1;
-
-        // evaluate the current batch with the transformer model
-        if (llama_decode(ctx, batch)) {
-            LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
-            return 1;
-        }
-    }
-
-    if (n_parallel > 1) {
-        LOG("\n");
-
-        for (int32_t i = 0; i < n_parallel; ++i) {
-            LOG("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
-        }
-    }
-
-    const auto t_main_end = ggml_time_us();
-
-    LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
-            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
-
-    LOG("\n");
-    llama_perf_sampler_print(smpl);
-    llama_perf_context_print(ctx);
-
-    fprintf(stderr, "\n");
-
-    llama_batch_free(batch);
-
-    llama_sampler_free(smpl);
-    llama_free(ctx);
-    llama_free_model(model);
-
-    llama_backend_free();
-
-    return 0;
-}
--- a/examples/convert_legacy_llama.py
+++ b/examples/convert_legacy_llama.py
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@ -1,421 +0,0 @@
-#include "arg.h"
-#include "common.h"
-#include "ggml.h"
-#include "ggml-alloc.h"
-
-#include <map>
-#include <vector>
-#include <string>
-#include <thread>
-#include <fstream>
-
-static bool g_verbose = false;
-
-struct tensor_transformation {
-    struct ggml_tensor * in;
-    struct ggml_tensor * out;
-    bool is_copy;
-};
-
-static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){
-    int id = gguf_find_key(ctx_gguf, key.c_str());
-    return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
-}
-
-static float get_kv_f32(struct gguf_context * ctx_gguf, const std::string & key) {
-    int id = gguf_find_key(ctx_gguf, key.c_str());
-    return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf, id);
-}
-
-static void zeros(std::ofstream & file, size_t n) {
-    char zero = 0;
-    for (size_t i = 0; i < n; ++i) {
-        file.write(&zero, 1);
-    }
-}
-
-static std::string ggml_ne_string(const ggml_tensor * t) {
-    std::string str;
-    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
-        str += std::to_string(t->ne[i]);
-        if (i + 1 < GGML_MAX_DIMS) {
-            str += ", ";
-        }
-    }
-    return str;
-}
-
-static struct gguf_context * load_gguf(std::string & fname, struct ggml_context ** ctx_ggml) {
-    struct gguf_init_params params = {
-        /*.no_alloc = */ true,
-        /*.ctx      = */ ctx_ggml,
-    };
-    struct gguf_context * ctx_gguf = gguf_init_from_file(fname.c_str(), params);
-    if (!ctx_gguf) {
-        throw std::runtime_error("failed to load input GGUF from " + fname);
-    }
-    return ctx_gguf;
-}
-
-struct file_input {
-    struct ggml_context * ctx_meta = nullptr;
-    struct gguf_context * ctx_gguf = nullptr;
-    std::ifstream f_in;
-    std::map<std::string, ggml_tensor *> tensors;
-    float alpha;
-    float scale;
-
-    file_input(std::string & fname, float scale): f_in(fname, std::ios::binary), scale(scale) {
-        if (!f_in.is_open()) {
-            throw std::runtime_error("failed to open input gguf from " + fname);
-        }
-
-        ctx_gguf = load_gguf(fname, &ctx_meta);
-        alpha = get_kv_f32(ctx_gguf, "adapter.lora.alpha");
-        printf("%s: loaded gguf from %s\n", __func__, fname.c_str());
-
-        for (ggml_tensor * cur = ggml_get_first_tensor(ctx_meta); cur; cur = ggml_get_next_tensor(ctx_meta, cur)) {
-            std::string name(cur->name);
-            tensors[name] = cur;
-            if (g_verbose) {
-                printf("%s: %s\n", __func__, cur->name);
-            }
-        }
-    }
-
-    ggml_tensor * get_tensor(std::string name) {
-        if (tensors.find(name) == tensors.end()) {
-            return nullptr;
-        }
-        return tensors[name];
-    }
-
-    void read_tensor_data(std::string name, std::vector<uint8_t> & buf) {
-        if (tensors.find(name) == tensors.end()) {
-            throw std::runtime_error("cannot find tensor with name: " + name);
-        }
-        auto len = ggml_nbytes(tensors[name]);
-        if (buf.size() < len) {
-            buf.resize(len);
-        }
-        auto i_tensor_in = gguf_find_tensor(ctx_gguf, name.c_str()); // idx of tensor in the input file
-        auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in);
-        f_in.seekg(offset);
-        f_in.read((char* )buf.data(), len);
-    }
-
-    ~file_input() {
-        gguf_free(ctx_gguf);
-        ggml_free(ctx_meta);
-    }
-};
-
-struct lora_merge_ctx {
-    // input base model + adapters
-    file_input base_model;
-    std::vector<std::unique_ptr<file_input>> adapters;
-
-    // for computing merged tensor
-    int n_threads;
-    ggml_backend_t backend = nullptr;
-    ggml_gallocr_t allocr = nullptr;
-    std::vector<uint8_t> read_buf;
-
-    // output file
-    struct gguf_context * ctx_out;
-    struct ggml_context * ctx_out_ggml;
-    std::ofstream fout;
-
-    lora_merge_ctx(
-            std::string & base_fname,
-            std::vector<common_lora_adapter_info> & lora_files,
-            std::string & outfile,
-            int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
-        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
-
-        if (gguf_find_key(base_model.ctx_gguf, LLM_KV_SPLIT_COUNT) >= 0) {
-            throw std::runtime_error("split model is not yet supported");
-        }
-
-        for (auto & lora_inp : lora_files) {
-            auto fname = lora_inp.path;
-            auto scale = lora_inp.scale;
-            std::unique_ptr<file_input> adapter(new file_input(fname, scale));
-            check_metadata_lora(adapter.get());
-            adapters.push_back(std::move(adapter));
-        }
-
-        ctx_out = gguf_init_empty();
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead(),
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ true,
-        };
-        ctx_out_ggml = ggml_init(params);
-        backend = ggml_backend_cpu_init();
-        allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
-    }
-
-    void check_metadata_lora(file_input * adapter) {
-        auto general_type = get_kv_str(adapter->ctx_gguf, "general.type");
-        if (general_type != "adapter") {
-            throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
-        }
-
-        auto adapter_type = get_kv_str(adapter->ctx_gguf, "adapter.type");
-        if (adapter_type != "lora") {
-            throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
-        }
-
-        auto general_arch_base = get_kv_str(base_model.ctx_gguf, "general.architecture");
-        auto general_arch_lora = get_kv_str(adapter->ctx_gguf,   "general.architecture");
-        if (general_arch_base != general_arch_lora) {
-            throw std::runtime_error("model arch and LoRA arch mismatch");
-        }
-    }
-
-    ggml_type get_out_tensor_type(struct ggml_tensor * t) {
-        if (t->type == GGML_TYPE_F32) {
-            return GGML_TYPE_F32;
-        } else {
-            return GGML_TYPE_F16;
-        }
-    }
-
-    void run_merge() {
-        // prepare metadata
-        gguf_set_kv(ctx_out, base_model.ctx_gguf);
-        // output is forced to f16 for now
-        gguf_set_val_u32(ctx_out, "general.file_type", LLAMA_FTYPE_MOSTLY_F16);
-
-        // check if all lora adapters have the same tensors
-        // TODO: remove this when we can support merging subset of adapters. Ref: https://github.com/ggerganov/llama.cpp/pull/8607#discussion_r1686027777
-        static const char * err_no_subset_adapter = "Input adapters do not have the same list of tensors. This is not yet supported. Please merge the adapter one-by-one instead of merging all at once.";
-        if (adapters.size() > 1) {
-            for (size_t i = 1; i < adapters.size(); ++i) {
-                if (adapters[0]->tensors.size() != adapters[i]->tensors.size()) {
-                    throw std::runtime_error(err_no_subset_adapter);
-                }
-                for (auto & it : adapters[i]->tensors) {
-                    if (adapters[0]->get_tensor(it.first) == nullptr) {
-                        throw std::runtime_error(err_no_subset_adapter);
-                    }
-                }
-            }
-        }
-
-        // mapping base tensor to out tensor (same shape with base, but different type)
-        std::vector<tensor_transformation> trans;
-        for (auto & it : base_model.tensors) {
-            bool t_a = true;
-            bool t_b = true;
-            for (auto & adapter : adapters) {
-                t_a &= nullptr != adapter->get_tensor(it.first + ".lora_a");
-                t_b &= nullptr != adapter->get_tensor(it.first + ".lora_b");
-            }
-            auto base_tensor = it.second;
-            if (!t_a && !t_b) {
-                // only copy
-                struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
-                ggml_set_name(cpy_tensor, base_tensor->name);
-                trans.push_back({
-                    cpy_tensor,
-                    cpy_tensor,
-                    true,
-                });
-                gguf_add_tensor(ctx_out, cpy_tensor);
-            } else if (t_a && t_b) {
-                // need merging
-                struct ggml_tensor * out_tensor = ggml_new_tensor(
-                    ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne);
-                ggml_set_name(out_tensor, base_tensor->name);
-                trans.push_back({
-                    base_tensor,
-                    out_tensor,
-                    false,
-                });
-                gguf_add_tensor(ctx_out, out_tensor);
-            } else {
-                throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
-            }
-        }
-
-        // placeholder for the meta data
-        {
-            size_t meta_size = gguf_get_meta_size(ctx_out);
-            zeros(fout, meta_size);
-        }
-
-        // process base model tensors
-        size_t n_merged = 0;
-        for (auto & it : trans) {
-            if (!it.is_copy) {
-                merge_tensor(it.in, it.out);
-                n_merged++;
-            } else {
-                copy_tensor(it.in);
-            }
-        }
-
-        // write output metadata
-        {
-            std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
-            gguf_get_meta_data(ctx_out, data.data());
-            fout.seekp(0);
-            fout.write((const char *)data.data(), data.size());
-        }
-
-        printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
-        printf("%s : wrote %ld tensors to output file\n", __func__, trans.size());
-    }
-
-    void copy_tensor(struct ggml_tensor * base) {
-        printf("%s :  %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
-        size_t len = ggml_nbytes(base);
-        base_model.read_tensor_data(base->name, read_buf);
-        fout.write((char* )read_buf.data(), len);
-        zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
-    }
-
-    void merge_tensor(struct ggml_tensor * base, struct ggml_tensor * out) {
-        std::string name_base(base->name);
-        std::string name_lora_a = name_base + ".lora_a";
-        std::string name_lora_b = name_base + ".lora_b";
-
-        printf("%s : %s [%s]\n", __func__, base->name, ggml_ne_string(base).c_str());
-
-        // context for input tensor
-        std::vector<struct ggml_tensor *> inp_a(adapters.size());
-        std::vector<struct ggml_tensor *> inp_b(adapters.size());
-        struct ggml_init_params params {
-            /*.mem_size   =*/ ggml_tensor_overhead()*(2+adapters.size()*2),
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ true,
-        };
-        struct ggml_context * ctx = ggml_init(params);
-
-        // alloc tensors
-        struct ggml_tensor * inp_base = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, base->ne);
-        for (size_t i = 0; i < adapters.size(); ++i) {
-            auto t_a = adapters[i]->get_tensor(name_lora_a);
-            auto t_b = adapters[i]->get_tensor(name_lora_b);
-            // TODO: add support for quantized lora
-            if (ggml_is_quantized(t_a->type) || ggml_is_quantized(t_b->type)) {
-                throw std::runtime_error("quantized LoRA adapters is not supported, please retry with f16 or f32");
-            }
-            inp_a[i] = ggml_dup_tensor(ctx, t_a);
-            inp_b[i] = ggml_dup_tensor(ctx, t_b);
-        }
-        ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
-
-        // load base tensor to backend buffer
-        base_model.read_tensor_data(name_base, read_buf);
-        if (base->type != GGML_TYPE_F32) {
-            // optionally dequantize it
-            printf("%s :   + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
-            auto nels = ggml_nelements(inp_base);
-            const auto * qtype = ggml_get_type_traits(base->type);
-            std::vector<uint8_t> dequant_buf(nels * sizeof(float));
-            qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
-            ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
-        } else {
-            ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
-        }
-
-        // load lora tensors to backend buffer
-        for (size_t i = 0; i < adapters.size(); ++i) {
-            adapters[i]->read_tensor_data(name_lora_a, read_buf);
-            ggml_backend_tensor_set(inp_a[i], read_buf.data(), 0, ggml_nbytes(inp_a[i]));
-            adapters[i]->read_tensor_data(name_lora_b, read_buf);
-            ggml_backend_tensor_set(inp_b[i], read_buf.data(), 0, ggml_nbytes(inp_b[i]));
-        }
-
-        // build graph
-        struct ggml_cgraph * gf;
-        {
-            static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
-            static std::vector<uint8_t> buf(buf_size);
-            struct ggml_init_params params0 = {
-                /*.mem_size   =*/ buf_size,
-                /*.mem_buffer =*/ buf.data(),
-                /*.no_alloc   =*/ true,
-            };
-            struct ggml_context * ctx0 = ggml_init(params0);
-            gf = ggml_new_graph(ctx0);
-            struct ggml_tensor * cur = inp_base;
-            for (size_t i = 0; i < adapters.size(); ++i) {
-                struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)));
-                struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
-                // scale
-                const float alpha = adapters[i]->alpha;
-                const float rank  = (float) inp_b[i]->ne[0];
-                const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale;
-                delta = ggml_scale(ctx0, delta, scale);
-                cur = ggml_add(ctx0, delta, cur);
-                printf("%s :   + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type));
-                printf("%s :     input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]);
-            }
-            cur = ggml_cast(ctx0, cur, out->type);
-            printf("%s :   + output type is %s\n", __func__, ggml_type_name(out->type));
-            ggml_build_forward_expand(gf, cur);
-            ggml_free(ctx0);
-        }
-
-        // compute
-        {
-            ggml_gallocr_alloc_graph(allocr, gf);
-            ggml_backend_cpu_set_n_threads(backend, n_threads);
-            ggml_backend_graph_compute(backend, gf);
-        }
-
-        // write data to output file
-        {
-            auto * result = ggml_graph_node(gf, -1);
-            size_t len = ggml_nbytes(result);
-            if (read_buf.size() < len) {
-                read_buf.resize(len);
-            }
-            ggml_backend_tensor_get(result, read_buf.data(), 0, len);
-            fout.write((char* )read_buf.data(), len);
-            zeros(fout, GGML_PAD(len, GGUF_DEFAULT_ALIGNMENT) - len);
-        }
-
-        ggml_free(ctx);
-        ggml_backend_buffer_free(buffer);
-    }
-
-    ~lora_merge_ctx() {
-        ggml_gallocr_free(allocr);
-        ggml_backend_free(backend);
-        gguf_free(ctx_out);
-        ggml_free(ctx_out_ggml);
-    }
-};
-
-static void print_usage(int, char ** argv) {
-    printf("\nexample usage:\n");
-    printf("\n  %s -m base-model.gguf --lora lora-file.gguf -o merged-model-f16.gguf\n", argv[0]);
-    printf("\nNOTE: output model is F16\n");
-    printf("\n");
-}
-
-int main(int argc, char ** argv) {
-    common_params params;
-
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
-        return 1;
-    }
-
-    g_verbose = (params.verbosity > 1);
-    try {
-        lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
-        ctx.run_merge();
-    } catch (const std::exception & err) {
-        fprintf(stderr, "%s\n", err.what());
-        exit(EXIT_FAILURE);
-    }
-
-    printf("done, output file is %s\n", params.lora_outfile.c_str());
-
-    return 0;
-}
--- a/examples/gbnf-validator/gbnf-validator.cpp
+++ b/examples/gbnf-validator/gbnf-validator.cpp
@ -1,112 +0,0 @@
-#include "unicode.h"
-#include "llama-grammar.h"
-
-#include <cstdio>
-#include <cstdlib>
-#include <sstream>
-#include <fstream>
-#include <string>
-#include <vector>
-
-static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) {
-    const auto cpts = unicode_cpts_from_utf8(input_str);
-
-    const llama_grammar_rules  & rules      = llama_grammar_get_rules (grammar);
-          llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar);
-
-    size_t pos = 0;
-    for (const auto & cpt : cpts) {
-        const llama_grammar_stacks stacks_prev = llama_grammar_get_stacks(grammar); // copy
-
-        llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur);
-
-        if (stacks_cur.empty()) {
-            error_pos = pos;
-            error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'";
-            stacks_cur = stacks_prev;
-            return false;
-        }
-        ++pos;
-    }
-
-    for (const auto & stack : stacks_cur) {
-        if (stack.empty()) {
-            return true;
-        }
-    }
-
-    error_pos = pos;
-    error_msg = "Unexpected end of input";
-    return false;
-}
-
-static void print_error_message(const std::string & input_str, size_t error_pos, const std::string & error_msg) {
-    fprintf(stdout, "Input string is invalid according to the grammar.\n");
-    fprintf(stdout, "Error: %s at position %zu\n", error_msg.c_str(), error_pos);
-    fprintf(stdout, "\n");
-    fprintf(stdout, "Input string:\n");
-    fprintf(stdout, "%s", input_str.substr(0, error_pos).c_str());
-    if (error_pos < input_str.size()) {
-        fprintf(stdout, "\033[1;31m%c", input_str[error_pos]);
-        if (error_pos+1 < input_str.size()) {
-            fprintf(stdout, "\033[0;31m%s", input_str.substr(error_pos+1).c_str());
-        }
-        fprintf(stdout, "\033[0m\n");
-    }
-}
-
-int main(int argc, char** argv) {
-    if (argc != 3) {
-        fprintf(stdout, "Usage: %s <grammar_filename> <input_filename>\n", argv[0]);
-        return 1;
-    }
-
-    const std::string grammar_filename = argv[1];
-    const std::string input_filename = argv[2];
-
-    // Read the GBNF grammar file
-    FILE* grammar_file = fopen(grammar_filename.c_str(), "r");
-    if (!grammar_file) {
-        fprintf(stdout, "Failed to open grammar file: %s\n", grammar_filename.c_str());
-        return 1;
-    }
-
-    std::string grammar_str;
-    {
-        std::ifstream grammar_file(grammar_filename);
-        GGML_ASSERT(grammar_file.is_open() && "Failed to open grammar file");
-        std::stringstream buffer;
-        buffer << grammar_file.rdbuf();
-        grammar_str = buffer.str();
-    }
-
-    llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root");
-    if (grammar == nullptr) {
-        throw std::runtime_error("Failed to initialize llama_grammar");
-    }
-    // Read the input file
-    std::string input_str;
-    {
-        std::ifstream input_file(input_filename);
-        GGML_ASSERT(input_file.is_open() && "Failed to open input file");
-        std::stringstream buffer;
-        buffer << input_file.rdbuf();
-        input_str = buffer.str();
-    }
-
-    // Validate the input string against the grammar
-    size_t error_pos;
-    std::string error_msg;
-    bool is_valid = llama_grammar_validate(grammar, input_str, error_pos, error_msg);
-
-    if (is_valid) {
-        fprintf(stdout, "Input string is valid according to the grammar.\n");
-    } else {
-        print_error_message(input_str, error_pos, error_msg);
-    }
-
-    // Clean up
-    llama_grammar_free_impl(grammar);
-
-    return 0;
-}
--- a/examples/gen-docs/CMakeLists.txt
+++ b/examples/gen-docs/CMakeLists.txt
@ -1,5 +0,0 @@
-set(TARGET llama-gen-docs)
-add_executable(${TARGET} gen-docs.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/gen-docs/gen-docs.cpp
+++ b/examples/gen-docs/gen-docs.cpp
@ -1,83 +0,0 @@
-#include "arg.h"
-#include "common.h"
-
-#include <fstream>
-#include <string>
-
-// Export usage message (-h) to markdown format
-
-static void write_table_header(std::ofstream & file) {
-    file << "| Argument | Explanation |\n";
-    file << "| -------- | ----------- |\n";
-}
-
-static void write_table_entry(std::ofstream & file, const common_arg & opt) {
-    file << "| `";
-    // args
-    for (const auto & arg : opt.args) {
-    if (arg == opt.args.front()) {
-            file << arg;
-            if (opt.args.size() > 1) file << ", ";
-        } else {
-            file << arg << (arg != opt.args.back() ? ", " : "");
-        }
-    }
-    // value hint
-    if (opt.value_hint) {
-        std::string md_value_hint(opt.value_hint);
-        string_replace_all(md_value_hint, "|", "\\|");
-        file << " " << md_value_hint;
-    }
-    if (opt.value_hint_2) {
-        std::string md_value_hint_2(opt.value_hint_2);
-        string_replace_all(md_value_hint_2, "|", "\\|");
-        file << " " << md_value_hint_2;
-    }
-    // help text
-    std::string md_help(opt.help);
-    string_replace_all(md_help, "\n", "<br/>");
-    string_replace_all(md_help, "|", "\\|");
-    file << "` | " << md_help << " |\n";
-}
-
-static void write_table(std::ofstream & file, std::vector<common_arg *> & opts) {
-    write_table_header(file);
-    for (const auto & opt : opts) {
-        write_table_entry(file, *opt);
-    }
-}
-
-static void export_md(std::string fname, llama_example ex) {
-    std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
-
-    common_params params;
-    auto ctx_arg = common_params_parser_init(params, ex);
-
-    std::vector<common_arg *> common_options;
-    std::vector<common_arg *> sparam_options;
-    std::vector<common_arg *> specific_options;
-    for (auto & opt : ctx_arg.options) {
-        // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
-        if (opt.is_sparam) {
-            sparam_options.push_back(&opt);
-        } else if (opt.in_example(ctx_arg.ex)) {
-            specific_options.push_back(&opt);
-        } else {
-            common_options.push_back(&opt);
-        }
-    }
-
-    file << "**Common params**\n\n";
-    write_table(file, common_options);
-    file << "\n\n**Sampling params**\n\n";
-    write_table(file, sparam_options);
-    file << "\n\n**Example-specific params**\n\n";
-    write_table(file, specific_options);
-}
-
-int main(int, char **) {
-    export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN);
-    export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER);
-
-    return 0;
-}
--- a/examples/gguf-hash/CMakeLists.txt
+++ b/examples/gguf-hash/CMakeLists.txt
@ -1,22 +0,0 @@
-set(TARGET llama-gguf-hash)
-add_executable(${TARGET} gguf-hash.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-
-# clibs dependencies
-include_directories(deps/)
-
-add_library(xxhash OBJECT deps/xxhash/xxhash.c deps/xxhash/xxhash.h)
-target_link_libraries(${TARGET} PRIVATE xxhash)
-
-add_library(sha1 OBJECT deps/sha1/sha1.c deps/sha1/sha1.h)
-target_link_libraries(${TARGET} PRIVATE sha1)
-if (NOT MSVC)
-    # disable warnings in 3rd party code
-    target_compile_options(sha1 PRIVATE -w)
-endif()
-
-add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h)
-target_link_libraries(${TARGET} PRIVATE sha256)
-
-target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/gguf-hash/deps/rotate-bits/package.json
+++ b/examples/gguf-hash/deps/rotate-bits/package.json
@ -1,13 +0,0 @@
-{
-  "name": "rotate-bits",
-  "version": "0.1.1",
-  "repo": "jb55/rotate-bits.h",
-  "description": "rotate bits",
-  "keywords": ["rotl", "rotr"],
-  "src": ["rotate-bits.h"],
-  "license": "Public Domain",
-  "development": {
-    "thlorenz/tap.c": "*"
-  }
-}
-
--- a/examples/gguf-hash/deps/rotate-bits/rotate-bits.h
+++ b/examples/gguf-hash/deps/rotate-bits/rotate-bits.h
@ -1,46 +0,0 @@
-
-
-#ifndef __ROTATE_DEFS_H
-#define __ROTATE_DEFS_H
-
-#ifdef _MSC_VER
-
-#include <stdlib.h>
-
-#define ROTL32(v, n) _rotl((v), (n))
-#define ROTL64(v, n) _rotl64((v), (n))
-
-#define ROTR32(v, n) _rotr((v), (n))
-#define ROTR64(v, n) _rotr64((v), (n))
-
-#else
-
-#include <stdint.h>
-
-#define U8V(v) ((uint8_t)(v) & 0xFFU)
-#define U16V(v) ((uint16_t)(v) & 0xFFFFU)
-#define U32V(v) ((uint32_t)(v) & 0xFFFFFFFFU)
-#define U64V(v) ((uint64_t)(v) & 0xFFFFFFFFFFFFFFFFU)
-
-#define ROTL32(v, n) \
-  (U32V((uint32_t)(v) << (n)) | ((uint32_t)(v) >> (32 - (n))))
-
-// tests fail if we don't have this cast...
-#define ROTL64(v, n) \
-  (U64V((uint64_t)(v) << (n)) | ((uint64_t)(v) >> (64 - (n))))
-
-#define ROTR32(v, n) ROTL32(v, 32 - (n))
-#define ROTR64(v, n) ROTL64(v, 64 - (n))
-
-#endif
-
-#define ROTL8(v, n) \
-  (U8V((uint8_t)(v) << (n)) | ((uint8_t)(v) >> (8 - (n))))
-
-#define ROTL16(v, n) \
-  (U16V((uint16_t)(v) << (n)) | ((uint16_t)(v) >> (16 - (n))))
-
-#define ROTR8(v, n) ROTL8(v, 8 - (n))
-#define ROTR16(v, n) ROTL16(v, 16 - (n))
-
-#endif
--- a/examples/gguf-hash/deps/sha1/package.json
+++ b/examples/gguf-hash/deps/sha1/package.json
@ -1,9 +0,0 @@
-{
-  "name": "sha1",
-  "version": "0.0.1",
-  "repo": "clibs/sha1",
-  "description": "sha1 hash algorithm",
-  "keywords": ["sha1", "hash"],
-  "license": "public domain",
-  "src": ["sha1.c", "sha1.h"]
-}
--- a/examples/gguf-hash/deps/sha1/sha1.c
+++ b/examples/gguf-hash/deps/sha1/sha1.c
@ -1,295 +0,0 @@
-/*
-SHA-1 in C
-By Steve Reid <steve@edmweb.com>
-100% Public Domain
-
-Test Vectors (from FIPS PUB 180-1)
-"abc"
-  A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D
-"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
-  84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1
-A million repetitions of "a"
-  34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F
-*/
-
-/* #define LITTLE_ENDIAN * This should be #define'd already, if true. */
-/* #define SHA1HANDSOFF * Copies data before messing with it. */
-
-#define SHA1HANDSOFF
-
-#include <stdio.h>
-#include <string.h>
-
-/* for uint32_t */
-#include <stdint.h>
-
-#include "sha1.h"
-
-
-#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
-
-/* blk0() and blk() perform the initial expand. */
-/* I got the idea of expanding during the round function from SSLeay */
-#if BYTE_ORDER == LITTLE_ENDIAN
-#define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \
-    |(rol(block->l[i],8)&0x00FF00FF))
-#elif BYTE_ORDER == BIG_ENDIAN
-#define blk0(i) block->l[i]
-#else
-#error "Endianness not defined!"
-#endif
-#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \
-    ^block->l[(i+2)&15]^block->l[i&15],1))
-
-/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
-#define R0(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30);
-#define R1(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30);
-#define R2(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30);
-#define R3(v,w,x,y,z,i) z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30);
-#define R4(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30);
-
-
-/* Hash a single 512-bit block. This is the core of the algorithm. */
-
-void SHA1Transform(
-    uint32_t state[5],
-    const unsigned char buffer[64]
-)
-{
-    uint32_t a, b, c, d, e;
-
-    typedef union
-    {
-        unsigned char c[64];
-        uint32_t l[16];
-    } CHAR64LONG16;
-
-#ifdef SHA1HANDSOFF
-    CHAR64LONG16 block[1];      /* use array to appear as a pointer */
-
-    memcpy(block, buffer, 64);
-#else
-    /* The following had better never be used because it causes the
-     * pointer-to-const buffer to be cast into a pointer to non-const.
-     * And the result is written through.  I threw a "const" in, hoping
-     * this will cause a diagnostic.
-     */
-    CHAR64LONG16 *block = (const CHAR64LONG16 *) buffer;
-#endif
-    /* Copy context->state[] to working vars */
-    a = state[0];
-    b = state[1];
-    c = state[2];
-    d = state[3];
-    e = state[4];
-    /* 4 rounds of 20 operations each. Loop unrolled. */
-    R0(a, b, c, d, e, 0);
-    R0(e, a, b, c, d, 1);
-    R0(d, e, a, b, c, 2);
-    R0(c, d, e, a, b, 3);
-    R0(b, c, d, e, a, 4);
-    R0(a, b, c, d, e, 5);
-    R0(e, a, b, c, d, 6);
-    R0(d, e, a, b, c, 7);
-    R0(c, d, e, a, b, 8);
-    R0(b, c, d, e, a, 9);
-    R0(a, b, c, d, e, 10);
-    R0(e, a, b, c, d, 11);
-    R0(d, e, a, b, c, 12);
-    R0(c, d, e, a, b, 13);
-    R0(b, c, d, e, a, 14);
-    R0(a, b, c, d, e, 15);
-    R1(e, a, b, c, d, 16);
-    R1(d, e, a, b, c, 17);
-    R1(c, d, e, a, b, 18);
-    R1(b, c, d, e, a, 19);
-    R2(a, b, c, d, e, 20);
-    R2(e, a, b, c, d, 21);
-    R2(d, e, a, b, c, 22);
-    R2(c, d, e, a, b, 23);
-    R2(b, c, d, e, a, 24);
-    R2(a, b, c, d, e, 25);
-    R2(e, a, b, c, d, 26);
-    R2(d, e, a, b, c, 27);
-    R2(c, d, e, a, b, 28);
-    R2(b, c, d, e, a, 29);
-    R2(a, b, c, d, e, 30);
-    R2(e, a, b, c, d, 31);
-    R2(d, e, a, b, c, 32);
-    R2(c, d, e, a, b, 33);
-    R2(b, c, d, e, a, 34);
-    R2(a, b, c, d, e, 35);
-    R2(e, a, b, c, d, 36);
-    R2(d, e, a, b, c, 37);
-    R2(c, d, e, a, b, 38);
-    R2(b, c, d, e, a, 39);
-    R3(a, b, c, d, e, 40);
-    R3(e, a, b, c, d, 41);
-    R3(d, e, a, b, c, 42);
-    R3(c, d, e, a, b, 43);
-    R3(b, c, d, e, a, 44);
-    R3(a, b, c, d, e, 45);
-    R3(e, a, b, c, d, 46);
-    R3(d, e, a, b, c, 47);
-    R3(c, d, e, a, b, 48);
-    R3(b, c, d, e, a, 49);
-    R3(a, b, c, d, e, 50);
-    R3(e, a, b, c, d, 51);
-    R3(d, e, a, b, c, 52);
-    R3(c, d, e, a, b, 53);
-    R3(b, c, d, e, a, 54);
-    R3(a, b, c, d, e, 55);
-    R3(e, a, b, c, d, 56);
-    R3(d, e, a, b, c, 57);
-    R3(c, d, e, a, b, 58);
-    R3(b, c, d, e, a, 59);
-    R4(a, b, c, d, e, 60);
-    R4(e, a, b, c, d, 61);
-    R4(d, e, a, b, c, 62);
-    R4(c, d, e, a, b, 63);
-    R4(b, c, d, e, a, 64);
-    R4(a, b, c, d, e, 65);
-    R4(e, a, b, c, d, 66);
-    R4(d, e, a, b, c, 67);
-    R4(c, d, e, a, b, 68);
-    R4(b, c, d, e, a, 69);
-    R4(a, b, c, d, e, 70);
-    R4(e, a, b, c, d, 71);
-    R4(d, e, a, b, c, 72);
-    R4(c, d, e, a, b, 73);
-    R4(b, c, d, e, a, 74);
-    R4(a, b, c, d, e, 75);
-    R4(e, a, b, c, d, 76);
-    R4(d, e, a, b, c, 77);
-    R4(c, d, e, a, b, 78);
-    R4(b, c, d, e, a, 79);
-    /* Add the working vars back into context.state[] */
-    state[0] += a;
-    state[1] += b;
-    state[2] += c;
-    state[3] += d;
-    state[4] += e;
-    /* Wipe variables */
-    a = b = c = d = e = 0;
-#ifdef SHA1HANDSOFF
-    memset(block, '\0', sizeof(block));
-#endif
-}
-
-
-/* SHA1Init - Initialize new context */
-
-void SHA1Init(
-    SHA1_CTX * context
-)
-{
-    /* SHA1 initialization constants */
-    context->state[0] = 0x67452301;
-    context->state[1] = 0xEFCDAB89;
-    context->state[2] = 0x98BADCFE;
-    context->state[3] = 0x10325476;
-    context->state[4] = 0xC3D2E1F0;
-    context->count[0] = context->count[1] = 0;
-}
-
-
-/* Run your data through this. */
-
-void SHA1Update(
-    SHA1_CTX * context,
-    const unsigned char *data,
-    uint32_t len
-)
-{
-    uint32_t i;
-
-    uint32_t j;
-
-    j = context->count[0];
-    if ((context->count[0] += len << 3) < j)
-        context->count[1]++;
-    context->count[1] += (len >> 29);
-    j = (j >> 3) & 63;
-    if ((j + len) > 63)
-    {
-        memcpy(&context->buffer[j], data, (i = 64 - j));
-        SHA1Transform(context->state, context->buffer);
-        for (; i + 63 < len; i += 64)
-        {
-            SHA1Transform(context->state, &data[i]);
-        }
-        j = 0;
-    }
-    else
-        i = 0;
-    memcpy(&context->buffer[j], &data[i], len - i);
-}
-
-
-/* Add padding and return the message digest. */
-
-void SHA1Final(
-    unsigned char digest[20],
-    SHA1_CTX * context
-)
-{
-    unsigned i;
-
-    unsigned char finalcount[8];
-
-    unsigned char c;
-
-#if 0    /* untested "improvement" by DHR */
-    /* Convert context->count to a sequence of bytes
-     * in finalcount.  Second element first, but
-     * big-endian order within element.
-     * But we do it all backwards.
-     */
-    unsigned char *fcp = &finalcount[8];
-
-    for (i = 0; i < 2; i++)
-    {
-        uint32_t t = context->count[i];
-
-        int j;
-
-        for (j = 0; j < 4; t >>= 8, j++)
-            *--fcp = (unsigned char) t}
-#else
-    for (i = 0; i < 8; i++)
-    {
-        finalcount[i] = (unsigned char) ((context->count[(i >= 4 ? 0 : 1)] >> ((3 - (i & 3)) * 8)) & 255);      /* Endian independent */
-    }
-#endif
-    c = 0200;
-    SHA1Update(context, &c, 1);
-    while ((context->count[0] & 504) != 448)
-    {
-        c = 0000;
-        SHA1Update(context, &c, 1);
-    }
-    SHA1Update(context, finalcount, 8); /* Should cause a SHA1Transform() */
-    for (i = 0; i < 20; i++)
-    {
-        digest[i] = (unsigned char)
-            ((context->state[i >> 2] >> ((3 - (i & 3)) * 8)) & 255);
-    }
-    /* Wipe variables */
-    memset(context, '\0', sizeof(*context));
-    memset(&finalcount, '\0', sizeof(finalcount));
-}
-
-void SHA1(
-    char *hash_out,
-    const char *str,
-    uint32_t len)
-{
-    SHA1_CTX ctx;
-    unsigned int ii;
-
-    SHA1Init(&ctx);
-    for (ii=0; ii<len; ii+=1)
-        SHA1Update(&ctx, (const unsigned char*)str + ii, 1);
-    SHA1Final((unsigned char *)hash_out, &ctx);
-}
-
--- a/examples/gguf-hash/deps/sha1/sha1.h
+++ b/examples/gguf-hash/deps/sha1/sha1.h
@ -1,52 +0,0 @@
-#ifndef SHA1_H
-#define SHA1_H
-
-/*
-   SHA-1 in C
-   By Steve Reid <steve@edmweb.com>
-   100% Public Domain
- */
-
-#include "stdint.h"
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-typedef struct
-{
-    uint32_t state[5];
-    uint32_t count[2];
-    unsigned char buffer[64];
-} SHA1_CTX;
-
-void SHA1Transform(
-    uint32_t state[5],
-    const unsigned char buffer[64]
-    );
-
-void SHA1Init(
-    SHA1_CTX * context
-    );
-
-void SHA1Update(
-    SHA1_CTX * context,
-    const unsigned char *data,
-    uint32_t len
-    );
-
-void SHA1Final(
-    unsigned char digest[20],
-    SHA1_CTX * context
-    );
-
-void SHA1(
-    char *hash_out,
-    const char *str,
-    uint32_t len);
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif /* SHA1_H */
--- a/examples/gguf-hash/deps/sha256/package.json
+++ b/examples/gguf-hash/deps/sha256/package.json
@ -1,15 +0,0 @@
-{
-  "name": "sha256",
-  "version": "0.0.2",
-  "repo": "jb55/sha256.c",
-  "description": "sha256 in c",
-  "keywords": ["sha256", "sha2"],
-  "src": ["sha256.c", "sha256.h"],
-  "dependencies": {
-    "jb55/rotate-bits.h": "0.1.1"
-  },
-  "development": {
-    "thlorenz/tap.c": "*"
-  }
-}
-
--- a/examples/gguf-hash/deps/sha256/sha256.c
+++ b/examples/gguf-hash/deps/sha256/sha256.c
@ -1,221 +0,0 @@
-/* Crypto/Sha256.c -- SHA-256 Hash
-2010-06-11 : Igor Pavlov : Public domain
-This code is based on public domain code from Wei Dai's Crypto++ library. */
-
-#include "rotate-bits/rotate-bits.h"
-#include "sha256.h"
-
-/* define it for speed optimization */
-#define _SHA256_UNROLL
-#define _SHA256_UNROLL2
-
-void
-sha256_init(sha256_t *p)
-{
-  p->state[0] = 0x6a09e667;
-  p->state[1] = 0xbb67ae85;
-  p->state[2] = 0x3c6ef372;
-  p->state[3] = 0xa54ff53a;
-  p->state[4] = 0x510e527f;
-  p->state[5] = 0x9b05688c;
-  p->state[6] = 0x1f83d9ab;
-  p->state[7] = 0x5be0cd19;
-  p->count = 0;
-}
-
-#define S0(x) (ROTR32(x, 2) ^ ROTR32(x,13) ^ ROTR32(x, 22))
-#define S1(x) (ROTR32(x, 6) ^ ROTR32(x,11) ^ ROTR32(x, 25))
-#define s0(x) (ROTR32(x, 7) ^ ROTR32(x,18) ^ (x >> 3))
-#define s1(x) (ROTR32(x,17) ^ ROTR32(x,19) ^ (x >> 10))
-
-#define blk0(i) (W[i] = data[i])
-#define blk2(i) (W[i&15] += s1(W[(i-2)&15]) + W[(i-7)&15] + s0(W[(i-15)&15]))
-
-#define Ch(x,y,z) (z^(x&(y^z)))
-#define Maj(x,y,z) ((x&y)|(z&(x|y)))
-
-#define a(i) T[(0-(i))&7]
-#define b(i) T[(1-(i))&7]
-#define c(i) T[(2-(i))&7]
-#define d(i) T[(3-(i))&7]
-#define e(i) T[(4-(i))&7]
-#define f(i) T[(5-(i))&7]
-#define g(i) T[(6-(i))&7]
-#define h(i) T[(7-(i))&7]
-
-
-#ifdef _SHA256_UNROLL2
-
-#define R(a,b,c,d,e,f,g,h, i) h += S1(e) + Ch(e,f,g) + K[i+j] + (j?blk2(i):blk0(i));\
-  d += h; h += S0(a) + Maj(a, b, c)
-
-#define RX_8(i) \
-  R(a,b,c,d,e,f,g,h, i); \
-  R(h,a,b,c,d,e,f,g, (i+1)); \
-  R(g,h,a,b,c,d,e,f, (i+2)); \
-  R(f,g,h,a,b,c,d,e, (i+3)); \
-  R(e,f,g,h,a,b,c,d, (i+4)); \
-  R(d,e,f,g,h,a,b,c, (i+5)); \
-  R(c,d,e,f,g,h,a,b, (i+6)); \
-  R(b,c,d,e,f,g,h,a, (i+7))
-
-#else
-
-#define R(i) h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[i+j] + (j?blk2(i):blk0(i));\
-  d(i) += h(i); h(i) += S0(a(i)) + Maj(a(i), b(i), c(i))
-
-#ifdef _SHA256_UNROLL
-
-#define RX_8(i) R(i+0); R(i+1); R(i+2); R(i+3); R(i+4); R(i+5); R(i+6); R(i+7);
-
-#endif
-
-#endif
-
-static const uint32_t K[64] = {
-  0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
-  0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
-  0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
-  0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
-  0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
-  0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
-  0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
-  0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
-  0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
-  0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
-  0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
-  0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
-  0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
-  0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
-  0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
-  0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-};
-
-static void
-sha256_transform(uint32_t *state, const uint32_t *data)
-{
-  uint32_t W[16] = {0};
-  unsigned j;
-  #ifdef _SHA256_UNROLL2
-  uint32_t a,b,c,d,e,f,g,h;
-  a = state[0];
-  b = state[1];
-  c = state[2];
-  d = state[3];
-  e = state[4];
-  f = state[5];
-  g = state[6];
-  h = state[7];
-  #else
-  uint32_t T[8];
-  for (j = 0; j < 8; j++)
-    T[j] = state[j];
-  #endif
-
-  for (j = 0; j < 64; j += 16)
-  {
-    #if defined(_SHA256_UNROLL) || defined(_SHA256_UNROLL2)
-    RX_8(0); RX_8(8);
-    #else
-    unsigned i;
-    for (i = 0; i < 16; i++) { R(i); }
-    #endif
-  }
-
-  #ifdef _SHA256_UNROLL2
-  state[0] += a;
-  state[1] += b;
-  state[2] += c;
-  state[3] += d;
-  state[4] += e;
-  state[5] += f;
-  state[6] += g;
-  state[7] += h;
-  #else
-  for (j = 0; j < 8; j++)
-    state[j] += T[j];
-  #endif
-
-  /* Wipe variables */
-  /* memset(W, 0, sizeof(W)); */
-  /* memset(T, 0, sizeof(T)); */
-}
-
-#undef S0
-#undef S1
-#undef s0
-#undef s1
-
-static void
-sha256_write_byte_block(sha256_t *p)
-{
-  uint32_t data32[16];
-  unsigned i;
-  for (i = 0; i < 16; i++)
-    data32[i] =
-      ((uint32_t)(p->buffer[i * 4    ]) << 24) +
-      ((uint32_t)(p->buffer[i * 4 + 1]) << 16) +
-      ((uint32_t)(p->buffer[i * 4 + 2]) <<  8) +
-      ((uint32_t)(p->buffer[i * 4 + 3]));
-  sha256_transform(p->state, data32);
-}
-
-
-void
-sha256_hash(unsigned char *buf, const unsigned char *data, size_t size)
-{
-  sha256_t hash;
-  sha256_init(&hash);
-  sha256_update(&hash, data, size);
-  sha256_final(&hash, buf);
-}
-
-
-void
-sha256_update(sha256_t *p, const unsigned char *data, size_t size)
-{
-  uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
-  while (size > 0)
-  {
-    p->buffer[curBufferPos++] = *data++;
-    p->count++;
-    size--;
-    if (curBufferPos == 64)
-    {
-      curBufferPos = 0;
-      sha256_write_byte_block(p);
-    }
-  }
-}
-
-
-void
-sha256_final(sha256_t *p, unsigned char *digest)
-{
-  uint64_t lenInBits = (p->count << 3);
-  uint32_t curBufferPos = (uint32_t)p->count & 0x3F;
-  unsigned i;
-  p->buffer[curBufferPos++] = 0x80;
-  while (curBufferPos != (64 - 8))
-  {
-    curBufferPos &= 0x3F;
-    if (curBufferPos == 0)
-      sha256_write_byte_block(p);
-    p->buffer[curBufferPos++] = 0;
-  }
-  for (i = 0; i < 8; i++)
-  {
-    p->buffer[curBufferPos++] = (unsigned char)(lenInBits >> 56);
-    lenInBits <<= 8;
-  }
-  sha256_write_byte_block(p);
-
-  for (i = 0; i < 8; i++)
-  {
-    *digest++ = (unsigned char)(p->state[i] >> 24);
-    *digest++ = (unsigned char)(p->state[i] >> 16);
-    *digest++ = (unsigned char)(p->state[i] >> 8);
-    *digest++ = (unsigned char)(p->state[i]);
-  }
-  sha256_init(p);
-}
--- a/examples/gguf-hash/deps/sha256/sha256.h
+++ b/examples/gguf-hash/deps/sha256/sha256.h
@ -1,24 +0,0 @@
-/* Sha256.h -- SHA-256 Hash
-2010-06-11 : Igor Pavlov : Public domain */
-
-#ifndef __CRYPTO_SHA256_H
-#define __CRYPTO_SHA256_H
-
-#include <stdlib.h>
-#include <stdint.h>
-
-#define SHA256_DIGEST_SIZE 32
-
-typedef struct sha256_t
-{
-  uint32_t state[8];
-  uint64_t count;
-  unsigned char buffer[64];
-} sha256_t;
-
-void sha256_init(sha256_t *p);
-void sha256_update(sha256_t *p, const unsigned char *data, size_t size);
-void sha256_final(sha256_t *p, unsigned char *digest);
-void sha256_hash(unsigned char *buf, const unsigned char *data, size_t size);
-
-#endif
--- a/examples/gguf-hash/deps/xxhash/clib.json
+++ b/examples/gguf-hash/deps/xxhash/clib.json
@ -1,12 +0,0 @@
-{
-  "name": "xxhash",
-  "version": "0.8.2",
-  "repo": "Cyan4973/xxhash",
-  "description": "Extremely fast non-cryptographic hash algorithm",
-  "keywords": ["xxhash", "hashing"],
-  "license": "BSD-2-Clause",
-  "src": [
-    "xxhash.c",
-    "xxhash.h"
-  ]
-}
--- a/examples/gguf-hash/deps/xxhash/xxhash.c
+++ b/examples/gguf-hash/deps/xxhash/xxhash.c
@ -1,42 +0,0 @@
-/*
- * xxHash - Extremely Fast Hash algorithm
- * Copyright (C) 2012-2023 Yann Collet
- *
- * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- *
- *    * Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *    * Redistributions in binary form must reproduce the above
- *      copyright notice, this list of conditions and the following disclaimer
- *      in the documentation and/or other materials provided with the
- *      distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * You can contact the author at:
- *   - xxHash homepage: https://www.xxhash.com
- *   - xxHash source repository: https://github.com/Cyan4973/xxHash
- */
-
-/*
- * xxhash.c instantiates functions defined in xxhash.h
- */
-
-#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */
-#define XXH_IMPLEMENTATION      /* access definitions */
-
-#include "xxhash.h"
--- a/examples/gguf-hash/deps/xxhash/xxhash.h
+++ b/examples/gguf-hash/deps/xxhash/xxhash.h
--- a/examples/gguf-hash/gguf-hash.cpp
+++ b/examples/gguf-hash/gguf-hash.cpp
@ -1,693 +0,0 @@
-#include "ggml.h"
-
-#include <cstdlib>   /* abort() */
-#include <cstddef>
-#include <cstdio>
-#include <string>
-#include <stdexcept>
-#include <algorithm>
-#include <cstring>
-
-#include <sstream>
-#include <fstream>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include "xxhash/xxhash.h"
-#include "sha1/sha1.h"
-#include "sha256/sha256.h"
-
-#ifdef __cplusplus
-}
-#endif
-
-
-// uuid.uuid5(uuid.NAMESPACE_URL, 'en.wikipedia.org/wiki/Llama.cpp')
-#define UUID_NAMESPACE_LLAMA_CPP "ef001206-dadc-5f6d-a15f-3359e577d4e5"
-#define UUID_NAMESPACE_LLAMA_CPP_HEX 0xef, 0x00, 0x12, 0x06, 0xda, 0xdc, 0x5f, 0x6d, 0xa1, 0x5f, 0x33, 0x59, 0xe5, 0x77, 0xd4, 0xe5
-
-
-#define HASH_TYPE_SHA256_STR "sha256"
-#define HASH_TYPE_SHA1_STR   "sha1"
-#define HASH_TYPE_XXH64_STR  "xxh64"
-#define HASH_TYPE_UUID_STR   "uuid"
-
-
-typedef enum {
-    HASH_EXIT_SUCCESS = 0, // All hash has been generated or validated
-    HASH_EXIT_FAILURE = 1, // Generic Failure
-    HASH_EXIT_MISMATCH = 2, // Hash mismatched during validation
-    HASH_EXIT_MANIFEST_MISSING_ENTRY = 3, // Hash attempted validation but missing entry in manifest
-    HASH_EXIT_MANIFEST_UNKNOWN_HASH = 4, // Manifest is present, but we do not know any hash format within it
-    HASH_EXIT_MANIFEST_FILE_ERROR = 5 // Manifest is either missing or not a known format
-} hash_exit_code_t;
-
-
-typedef enum {
-    HASH_MANIFEST_NOT_FOUND,
-    HASH_MANIFEST_MISMATCH,
-    HASH_MANIFEST_OK,
-} hash_manifest_result_t;
-
-
-struct hash_params {
-    std::string input;
-    bool xxh64 = false;
-    bool sha1 = false;
-    bool sha256 = false;
-    bool uuid = false;
-
-    bool no_layer = false;
-
-    bool manifest_is_usable = false;
-    std::string manifest_file;
-};
-
-struct manifest_check_params {
-    bool xxh64 = false;
-    bool sha1 = false;
-    bool sha256 = false;
-    bool uuid = false;
-};
-
-static char const * hash_manifest_result_to_str(hash_manifest_result_t value) {
-    switch (value) {
-        case HASH_MANIFEST_NOT_FOUND: return "Not Found";
-        case HASH_MANIFEST_MISMATCH: return "Mismatch";
-        case HASH_MANIFEST_OK: return "Ok";
-    }
-    return "?";
-}
-
-static char const * hash_exit_code_to_str(hash_exit_code_t value) {
-    switch (value) {
-        case HASH_EXIT_SUCCESS: return "Success";
-        case HASH_EXIT_FAILURE: return "Failure";
-        case HASH_EXIT_MISMATCH: return "Mismatch";
-        case HASH_EXIT_MANIFEST_MISSING_ENTRY: return "Manifest Missing Entry";
-        case HASH_EXIT_MANIFEST_UNKNOWN_HASH: return "Manifest Unknown Hash";
-        case HASH_EXIT_MANIFEST_FILE_ERROR: return "Manifest File Error";
-    }
-    return "?";
-}
-
-static void hash_print_usage(const char * executable) {
-    const hash_params default_params;
-    printf("\n");
-    printf("usage: %s [options] GGUF_IN\n", executable);
-    printf("\n");
-    printf("Hash a GGUF file");
-    printf("\n");
-    printf("options:\n");
-    printf("  -h, --help              show this help message and exit\n");
-    printf("      --xxh64             use xxh64 hash\n");
-    printf("      --sha1              use sha1 hash\n");
-    printf("      --sha256            use sha256 hash\n");
-    printf("      --all               use all hash\n");
-    printf("      --no-layer          exclude per layer hash\n");
-    printf("      --uuid              generate UUIDv5 ID\n");
-    printf("  -c, --check <manifest>  verify against a manifest\n");
-    printf("\n");
-}
-
-static void hash_params_parse_ex(int argc, const char ** argv, hash_params & params) {
-    std::string arg;
-    bool invalid_param = false;
-    const std::string arg_prefix = "--";
-
-    int arg_idx = 1;
-    for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
-        arg = argv[arg_idx];
-        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
-            std::replace(arg.begin(), arg.end(), '_', '-');
-        }
-
-        bool arg_found = false;
-        if (arg == "-h" || arg == "--help") {
-            hash_print_usage(argv[0]);
-            exit(0);
-        }
-
-        if (arg == "--xxh64") {
-            arg_found = true;
-            params.xxh64 = true;
-        }
-
-        if (arg == "--sha1") {
-            arg_found = true;
-            params.sha1 = true;
-        }
-
-        if (arg == "--uuid") {
-            arg_found = true;
-            params.uuid = true;
-        }
-
-        if (arg == "--sha256") {
-            arg_found = true;
-            params.sha256 = true;
-        }
-
-        if (arg == "--all") {
-            arg_found = true;
-            params.sha256 = true;
-            params.sha1 = true;
-            params.xxh64 = true;
-        }
-
-        if (arg == "--no-layer") {
-            arg_found = true;
-            params.no_layer = true;
-        }
-
-        if (arg == "-c" || arg == "--check") {
-            if (++arg_idx >= argc) {
-                invalid_param = true;
-                break;
-            }
-            arg_found = true;
-            params.manifest_file = argv[arg_idx];
-        }
-
-        if (!arg_found) {
-            throw std::invalid_argument("error: unknown argument: " + arg);
-        }
-    }
-
-    if (invalid_param) {
-        throw std::invalid_argument("error: invalid parameter for argument:" + arg);
-    }
-
-    if (argc - arg_idx < 1) {
-        throw std::invalid_argument("error: bad arguments");
-    }
-
-    params.input = argv[arg_idx++];
-}
-
-static bool hash_params_parse(int argc, const char ** argv, hash_params & params) {
-    bool result = true;
-    try {
-        hash_params_parse_ex(argc, argv, params);
-    }
-    catch (const std::invalid_argument & ex) {
-        fprintf(stderr, "%s\n", ex.what());
-        hash_print_usage(argv[0]);
-        exit(EXIT_FAILURE);
-    }
-    return result;
-}
-
-static bool manifest_type(const std::string & manifest_file, manifest_check_params & manifest_check) {
-    if (manifest_file.empty()) {
-        return false;
-    }
-
-    std::ifstream file(manifest_file);
-    if (!file.is_open()) {
-        return false;
-    }
-
-    std::string manifest_entry_line;
-    while (getline(file, manifest_entry_line)) {
-        // hash_type_str hash_str tensor_name
-        // e.g. 'xxh64     f66e9cd66a4396a0  test.gguf:tensor_0'
-        std::istringstream line_stream(manifest_entry_line);
-        std::string file_hash_type;
-        if (line_stream >> file_hash_type) {
-            if (file_hash_type == HASH_TYPE_SHA256_STR) {
-                manifest_check.sha256 = true;
-            } else if (file_hash_type == HASH_TYPE_SHA1_STR) {
-                manifest_check.sha1 = true;
-            } else if (file_hash_type == HASH_TYPE_XXH64_STR) {
-                manifest_check.xxh64 = true;
-            } else if (file_hash_type == HASH_TYPE_UUID_STR) {
-                manifest_check.uuid = true;
-            }
-        }
-    }
-
-    return true;
-}
-
-static hash_manifest_result_t manifest_verify(const std::string& manifest_file, const std::string& hash_type_str, const std::string& hash_str, const std::string& tensor_name) {
-    if (manifest_file.empty()) {
-        return HASH_MANIFEST_NOT_FOUND;
-    }
-
-    std::ifstream file(manifest_file);
-    if (!file.is_open()) {
-        return HASH_MANIFEST_NOT_FOUND;
-    }
-
-    std::string manifest_entry_line;
-    while (getline(file, manifest_entry_line)) {
-        std::istringstream line_stream(manifest_entry_line);
-        std::string file_hash_type;
-        std::string file_hash;
-        std::string file_tensor_name;
-        if (line_stream >> file_hash_type >> file_hash >> file_tensor_name) {
-            // Line parsed. Check hash validity
-
-            if (file_hash_type != hash_type_str) {
-                continue;
-            }
-
-            if (file_tensor_name != tensor_name) {
-                continue;
-            }
-
-            return (file_hash == hash_str) ? HASH_MANIFEST_OK : HASH_MANIFEST_MISMATCH;
-        }
-    }
-
-    return HASH_MANIFEST_NOT_FOUND;
-}
-
-static void generate_uuidv5(const unsigned char sha1_digest[20], unsigned char uuid[16]) {
-    // Ref: https://www.rfc-editor.org/rfc/rfc9562.html#section-5.5
-    // Assumes that digest was processed correctly with the expected namespace
-    for (int i = 0; i < 16; i++) {
-        uuid[i] = sha1_digest[i];
-    }
-
-    // Set bits corresponding to UUID ver 5
-    uuid[ 6] &= ~(0xF << 4);
-    uuid[ 6] |= (5 << 4);
-
-    // Set bits corresponding to UUID variant 0b10XX
-    uuid[ 8] &= ~(0xc << 4);
-    uuid[ 8] |= (0x8 << 4);
-}
-
-static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
-    const std::string & fname = hash_params.input;
-    struct ggml_context * ctx_data = NULL;
-
-    struct gguf_init_params params = {
-        /*.no_alloc = */ false,
-        /*.ctx      = */ &ctx_data,
-    };
-
-    // xxh64 init
-    XXH64_state_t* xxh64_model_hash_state = NULL;
-    if (hash_params.xxh64) {
-        xxh64_model_hash_state = XXH64_createState();
-        if (xxh64_model_hash_state==NULL) {
-            abort();
-        }
-
-        XXH64_hash_t const seed = 0;
-        if (XXH64_reset(xxh64_model_hash_state, seed) == XXH_ERROR) {
-            abort();
-        }
-    }
-
-    // sha1 init
-    SHA1_CTX sha1_model_hash_ctx;
-    if (hash_params.sha1) {
-        SHA1Init(&sha1_model_hash_ctx);
-    }
-
-    // sha256 init
-    sha256_t sha256_model_hash_ctx;
-    if (hash_params.sha256) {
-        sha256_init(&sha256_model_hash_ctx);
-    }
-
-    // sha1 for uuid init
-    SHA1_CTX sha1_for_uuid_ctx;
-    if (hash_params.uuid) {
-        unsigned char const uuidv5_namespace[] = {UUID_NAMESPACE_LLAMA_CPP_HEX};
-        SHA1Init(&sha1_for_uuid_ctx);
-        SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)uuidv5_namespace, sizeof(uuidv5_namespace));
-    }
-
-    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
-    const int n_tensors = gguf_get_n_tensors(ctx);
-    bool tensor_layer_in_manifest = false;
-    bool model_in_manifest = false;
-    bool tensor_layer_has_mismatch = false;
-    bool model_has_mismatch = false;
-    for (int i = 0; i < n_tensors; ++i) {
-        const char * name = gguf_get_tensor_name(ctx, i);
-        struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
-        auto n_bytes = ggml_nbytes(cur);
-        auto *raw_data = cur->data;
-        const std::string tensor_layer_name = fname + ":" + name;
-
-        if (hash_params.xxh64) {
-
-            if (!hash_params.no_layer) {
-                // Per Layer Hash
-                XXH64_hash_t hash = XXH64(raw_data, n_bytes, 0);
-
-                char hex_result[17];
-                for (int  offset = 0; offset < 8; offset++) {
-                    unsigned int shift_bits_by = (8 * (8 - offset - 1));
-                    snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
-                }
-
-                if (hash_params.manifest_is_usable) {
-                    hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name);
-
-                    switch (verify_result) {
-                        case HASH_MANIFEST_NOT_FOUND:
-                            break;
-                        case HASH_MANIFEST_MISMATCH:
-                            tensor_layer_in_manifest = true;
-                            tensor_layer_has_mismatch = true;
-                            break;
-                        case HASH_MANIFEST_OK:
-                            tensor_layer_in_manifest = true;
-                            break;
-                    }
-
-                    printf("%-8s  %-s  %s  -  %s\n", HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
-                } else {
-                    printf("%-8s  %-s  %s\n", HASH_TYPE_XXH64_STR, hex_result, tensor_layer_name.c_str());
-                }
-            }
-
-            // Overall Model Hash
-            if (XXH64_update(xxh64_model_hash_state, raw_data, n_bytes) == XXH_ERROR) abort();
-        }
-
-        if (hash_params.sha1) {
-
-            if (!hash_params.no_layer) {
-                // Per Layer Hash
-                char result[21]; // sha1 outputs 20 bytes
-                SHA1( result, (const char *)raw_data, n_bytes);
-
-                char hex_result[41] = {0};
-                for (int  offset = 0; offset < 20; offset++) {
-                    snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
-                }
-
-                if (hash_params.manifest_is_usable) {
-                    hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name);
-
-                    switch (verify_result) {
-                        case HASH_MANIFEST_NOT_FOUND:
-                            break;
-                        case HASH_MANIFEST_MISMATCH:
-                            tensor_layer_in_manifest = true;
-                            tensor_layer_has_mismatch = true;
-                            break;
-                        case HASH_MANIFEST_OK:
-                            tensor_layer_in_manifest = true;
-                            break;
-                    }
-
-                    printf("%-8s  %-s  %s  -  %s\n", HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
-                } else {
-                    printf("%-8s  %-s  %s\n", HASH_TYPE_SHA1_STR, hex_result, tensor_layer_name.c_str());
-                }
-            }
-
-            // Overall Model Hash
-            SHA1Update( &sha1_model_hash_ctx, (unsigned char const *)raw_data, n_bytes);
-        }
-
-        if (hash_params.sha256) {
-
-            if (!hash_params.no_layer) {
-                // Per Layer Hash
-                unsigned char result[SHA256_DIGEST_SIZE]; // sha256 outputs 32 bytes
-                sha256_hash((unsigned char*) result, (const unsigned char *)raw_data, n_bytes);
-
-                char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
-                for (int  offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
-                    snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
-                }
-
-                if (hash_params.manifest_is_usable) {
-                    hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name);
-
-                    switch (verify_result) {
-                        case HASH_MANIFEST_NOT_FOUND:
-                            break;
-                        case HASH_MANIFEST_MISMATCH:
-                            tensor_layer_in_manifest = true;
-                            tensor_layer_has_mismatch = true;
-                            break;
-                        case HASH_MANIFEST_OK:
-                            tensor_layer_in_manifest = true;
-                            break;
-                    }
-
-                    printf("%-8s  %-s  %s  -  %s\n", HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name.c_str(), hash_manifest_result_to_str(verify_result));
-                } else {
-                    printf("%-8s  %-s  %s\n", HASH_TYPE_SHA256_STR, hex_result, tensor_layer_name.c_str());
-                }
-            }
-
-            // Overall Model Hash
-            sha256_update( &sha256_model_hash_ctx, (unsigned char const *)raw_data, n_bytes);
-        }
-
-        if (hash_params.uuid) {
-            SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)raw_data, n_bytes);
-        }
-    }
-
-    if (hash_params.xxh64) {
-        XXH64_hash_t const hash = XXH64_digest(xxh64_model_hash_state);
-
-        char hex_result[17];
-        for (int  offset = 0; offset < 8; offset++) {
-            unsigned int shift_bits_by = (8 * (8 - offset - 1));
-            snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", (unsigned char) (hash >> shift_bits_by)&0xff);
-        }
-
-        if (hash_params.manifest_is_usable) {
-            hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_XXH64_STR, hex_result, fname);
-
-            switch (verify_result) {
-                case HASH_MANIFEST_NOT_FOUND:
-                    break;
-                case HASH_MANIFEST_MISMATCH:
-                    model_in_manifest = true;
-                    model_has_mismatch = true;
-                    break;
-                case HASH_MANIFEST_OK:
-                    model_in_manifest = true;
-                    break;
-            }
-
-            printf("%-8s  %-s  %s  -  %s\n", HASH_TYPE_XXH64_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
-        } else {
-            printf("%-8s  %-s  %s\n", HASH_TYPE_XXH64_STR, hex_result, fname.c_str());
-        }
-    }
-
-    if (hash_params.sha1) {
-        unsigned char result[21];
-        SHA1Final(result, &sha1_model_hash_ctx);
-
-        char hex_result[41];
-        for (int  offset = 0; offset < 20; offset++) {
-            snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
-        }
-
-        if (hash_params.manifest_is_usable) {
-            hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA1_STR, hex_result, fname);
-
-            switch (verify_result) {
-                case HASH_MANIFEST_NOT_FOUND:
-                    break;
-                case HASH_MANIFEST_MISMATCH:
-                    model_in_manifest = true;
-                    model_has_mismatch = true;
-                    break;
-                case HASH_MANIFEST_OK:
-                    model_in_manifest = true;
-                    break;
-            }
-
-            printf("%-8s  %-s  %s  -  %s\n", HASH_TYPE_SHA1_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
-        } else {
-            printf("%-8s  %-s  %s\n", HASH_TYPE_SHA1_STR, hex_result, fname.c_str());
-        }
-    }
-
-    if (hash_params.sha256) {
-        unsigned char result[SHA256_DIGEST_SIZE]; // sha256 outputs 32 bytes
-        sha256_final( &sha256_model_hash_ctx,  result);
-
-        char hex_result[SHA256_DIGEST_SIZE * 2 + 1] = {0};
-        for (int  offset = 0; offset < SHA256_DIGEST_SIZE; offset++) {
-            snprintf( ( hex_result + (2*offset)), sizeof(hex_result) - (2*offset), "%02x", result[offset]&0xff);
-        }
-
-        if (hash_params.manifest_is_usable) {
-            hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, hex_result, fname);
-
-            switch (verify_result) {
-                case HASH_MANIFEST_NOT_FOUND:
-                    break;
-                case HASH_MANIFEST_MISMATCH:
-                    model_in_manifest = true;
-                    model_has_mismatch = true;
-                    break;
-                case HASH_MANIFEST_OK:
-                    model_in_manifest = true;
-                    break;
-            }
-
-            printf("%-8s  %-s  %s  -  %s\n", HASH_TYPE_SHA256_STR, hex_result, fname.c_str(), hash_manifest_result_to_str(verify_result));
-        } else {
-            printf("%-8s  %-s  %s\n", HASH_TYPE_SHA256_STR, hex_result, fname.c_str());
-        }
-    }
-
-    if (hash_params.uuid) {
-        unsigned char result[21];
-        SHA1Final(result, &sha1_for_uuid_ctx);
-
-        unsigned char uuid[16];
-        generate_uuidv5(result, uuid);
-
-        char string_buffer[37] = {0};
-        snprintf(string_buffer, sizeof(string_buffer), "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
-            uuid[0], uuid[1], uuid[2], uuid[3],
-            uuid[4], uuid[5], uuid[6], uuid[7],
-            uuid[8], uuid[9], uuid[10], uuid[11],
-            uuid[12], uuid[13], uuid[14], uuid[15]);
-
-        if (hash_params.manifest_is_usable) {
-            hash_manifest_result_t verify_result = manifest_verify(hash_params.manifest_file, HASH_TYPE_SHA256_STR, string_buffer, fname);
-
-            switch (verify_result) {
-                case HASH_MANIFEST_NOT_FOUND:
-                    break;
-                case HASH_MANIFEST_MISMATCH:
-                    model_in_manifest = true;
-                    model_has_mismatch = true;
-                    break;
-                case HASH_MANIFEST_OK:
-                    model_in_manifest = true;
-                    break;
-            }
-
-            printf("%-8s  %-s  %s  -  %s\n", HASH_TYPE_UUID_STR, string_buffer, fname.c_str(), hash_manifest_result_to_str(verify_result));
-        } else {
-            printf("%-8s  %-s  %s\n", HASH_TYPE_UUID_STR, string_buffer, fname.c_str());
-        }
-    }
-
-
-    ggml_free(ctx_data);
-    gguf_free(ctx);
-
-
-    if (hash_params.manifest_is_usable) {
-        // In hash verification mode
-
-        if (!model_in_manifest) {
-            // model missing in manifest?
-
-            // Check tensor layer...
-            if (!tensor_layer_in_manifest) {
-                // Still missing? Maybe we are reading the wrong manifest.
-                return HASH_EXIT_MANIFEST_MISSING_ENTRY;
-            }
-
-            if (tensor_layer_has_mismatch) {
-                // Per tensor check found error
-                return HASH_EXIT_FAILURE;
-            }
-
-            // All per tensor layer checks passed? Sounds good enough.
-            return HASH_EXIT_SUCCESS;
-        }
-
-        // Overall model check passed, but let's check per layer just in case
-        // If missing, we don't care too much as the overall model checked
-        if (tensor_layer_in_manifest && tensor_layer_has_mismatch) {
-            return HASH_EXIT_FAILURE;
-        }
-
-        if (model_has_mismatch) {
-            // model has failed hash somewhere in the model
-            return HASH_EXIT_FAILURE;
-        }
-
-        // All checks appears to be fine
-        return HASH_EXIT_SUCCESS;
-    }
-
-    // In hash generation mode
-    return HASH_EXIT_SUCCESS;
-}
-
-int main(int argc, const char ** argv) {
-    hash_params params;
-    manifest_check_params manifest_check;
-    hash_params_parse(argc, argv, params);
-
-    if (!params.manifest_file.empty()) {
-        if (!manifest_type(params.manifest_file, manifest_check)) {
-            printf("ERROR cannot open manifest %s", params.manifest_file.c_str());
-            return HASH_EXIT_MANIFEST_FILE_ERROR;
-        }
-
-        if (!manifest_check.sha256 && !manifest_check.sha1 && !manifest_check.xxh64 && !manifest_check.uuid) {
-            printf("ERROR manifest does not have any known hash format in %s", params.manifest_file.c_str());
-            return HASH_EXIT_MANIFEST_UNKNOWN_HASH;
-        }
-
-        printf("manifest  %s", params.manifest_file.c_str());
-
-        if (manifest_check.sha256) {
-            printf("  sha256");
-        }
-
-        if (manifest_check.sha1) {
-            printf("  sha1");
-        }
-
-        if (manifest_check.xxh64) {
-            printf("  xxh64");
-        }
-
-        if (manifest_check.uuid) {
-            printf("  uuid");
-        }
-
-        printf("\n");
-
-        // Autoselect the highest security hash if manifest is provided but
-        // the user has not specifically defined the hash they care about
-        if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
-            // User has not selected a specific value, pick most secure hash
-            if (manifest_check.sha256) {
-                params.sha256 = true;
-            } else if (manifest_check.sha1) {
-                params.sha1 = true;
-            } else if (manifest_check.xxh64) {
-                params.xxh64 = true;
-            } else if (manifest_check.uuid) {
-                params.uuid = true;
-            }
-        }
-
-        params.manifest_is_usable = true;
-    }
-
-    // By default if no swich argument provided, assume xxh64
-    if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
-        params.xxh64 = true;
-    }
-
-    hash_exit_code_t exit_code = gguf_hash(params);
-
-    if (params.manifest_is_usable) {
-        printf("\nVerification results for %s - %s\n", params.manifest_file.c_str(), hash_exit_code_to_str(exit_code));
-    }
-
-    return exit_code;
-}
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
@ -741,7 +741,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
    UNUSED(blocklen);

 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE)
+#if defined(__ARM_FEATURE_SVE) && ! defined(LLAMA_NOSVE)
    if (ggml_cpu_has_sve() && ggml_cpu_get_sve_cnt() == QK8_0) {
        const void * b_ptr = vx;
        const void * a_ptr = vy;
@ -2081,7 +2081,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
    UNUSED(blocklen);

 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! defined(LLAMA_NOSVE)
    if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
        const void * b_ptr = vx;
        const void * a_ptr = vy;
--- a/ggml/src/ggml-cpu/ggml-cpu-impl.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h
@ -59,7 +59,7 @@ struct ggml_compute_params {
 #endif
 #endif

-#if defined(__ARM_FEATURE_SVE)
+#if defined(__ARM_FEATURE_SVE) && ! defined(LLAMA_NOSVE)
 #include <arm_sve.h>
 #include <sys/prctl.h>
 #endif
--- a/ggml/src/ggml-cpu/ggml-cpu-quants.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c
@ -1829,7 +1829,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
    int ib = 0;
    float sumf = 0;

-#if defined(__ARM_FEATURE_SVE)
+#if defined(__ARM_FEATURE_SVE) && ! defined(LLAMA_NOSVE)
    svfloat32_t sumv0 = svdup_n_f32(0.0f);
    svfloat32_t sumv1 = svdup_n_f32(0.0f);

@ -3419,7 +3419,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
    int ib = 0;
    float sumf = 0;

-#if defined(__ARM_FEATURE_SVE)
+#if defined(__ARM_FEATURE_SVE) && ! defined(LLAMA_NOSVE)
    svfloat32_t sumv0 = svdup_n_f32(0.0f);
    svfloat32_t sumv1 = svdup_n_f32(0.0f);

--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@ -40,7 +40,7 @@
 #include <omp.h>
 #endif

-#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
+#if (defined(__ARM_FEATURE_SVE) && ! defined(LLAMA_NOSVE)) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef GGML_USE_LLAMAFILE
 #endif

@ -2442,7 +2442,7 @@ static void ggml_init_arm_arch_features(void) {
    ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
    ggml_arm_arch_features.has_sve  = !!(hwcap & HWCAP_SVE);

-#if defined(__ARM_FEATURE_SVE)
+#if defined(__ARM_FEATURE_SVE) && ! defined(LLAMA_NOSVE)
    ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
 #endif
 #elif defined(__APPLE__)
@ -2479,7 +2479,7 @@ static void ggml_init_arm_arch_features(void) {
    ggml_arm_arch_features.has_i8mm = 0;
 #endif

-#if defined(__ARM_FEATURE_SVE)
+#if defined(__ARM_FEATURE_SVE) && ! defined(LLAMA_NOSVE)
    ggml_arm_arch_features.has_sve = 1;
    ggml_arm_arch_features.sve_cnt = 16;
 #else
@ -13946,7 +13946,7 @@ int ggml_cpu_has_dotprod(void) {
 }

 int ggml_cpu_has_sve(void) {
-#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
+#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE) && ! defined(LLAMA_NOSVE)
    return ggml_arm_arch_features.has_sve;
 #else
    return 0;
@ -13962,7 +13962,7 @@ int ggml_cpu_has_matmul_int8(void) {
 }

 int ggml_cpu_get_sve_cnt(void) {
-#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
+#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE) && ! defined(LLAMA_NOSVE)
    return ggml_arm_arch_features.sve_cnt;
 #else
    return 0;
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@ -10,7 +10,7 @@
 #include <stdint.h>
 #include <string.h>

-#ifdef __ARM_FEATURE_SVE
+#if defined(__ARM_FEATURE_SVE) && ! defined(LLAMA_NOSVE)
 #include <arm_sve.h>
 #endif // __ARM_FEATURE_SVE

--- a/koboldcpp.py
+++ b/koboldcpp.py
@ -4344,7 +4344,7 @@ def main(launch_args,start_server=True):
            show_gui()
        except Exception as ex:
            exitcounter = 999
-            ermsg = "Reason: " + str(ex) + "\nFile selection GUI unsupported.\ncustomtkinter python module required!\n\nPlease check command line options with --help"
+            ermsg = "Reason: " + str(ex) + "\nFile selection GUI unsupported.\ncustomtkinter python module required!\n\nYou must use the command line instead, e.g. python ./koboldcpp.py --help"
            show_gui_msgbox("Warning, GUI failed to start",ermsg)
            if args.skiplauncher:
                print("Note: In order to use --skiplauncher, you need to specify a model with --model")