Merge branch 'master' into concedo_experimental

# Conflicts: # .devops/nix/package.nix # .github/workflows/build.yml # .gitignore # CMakeLists.txt # Makefile # README.md # ci/run.sh # flake.lock # flake.nix # scripts/get-flags.mk # scripts/get-wikitext-2.sh # scripts/sync-ggml.last # tests/CMakeLists.txt # tests/test-backend-ops.cpp # tests/test-grammar-parser.cpp # tests/test-llama-grammar.cpp
2025-09-10 17:14:36 +00:00 · 2024-02-20 16:30:21 +08:00 · 2024-02-20 16:30:21 +08:00 · f0a662112b
commit f0a662112b
parent 6181b46eef c0a8c6db37
34 changed files with 2394 additions and 753 deletions
--- a/README-sycl.md
+++ b/README-sycl.md
@ -272,7 +272,7 @@ Please install [Visual Studio](https://visualstudio.microsoft.com/) which impact
 a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
-Recommend to install to default folder: **/opt/intel/oneapi**.
+Recommend to install to default folder: **C:\Program Files (x86)\Intel\oneAPI**.
 Following guide uses the default folder as example. If you use other folder, please modify the following guide info with your folder.
--- a/common/common.cpp
+++ b/common/common.cpp
@ -1705,6 +1705,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    }
    fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
    fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
    fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
@ -1742,7 +1743,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
    fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
-    fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
+    fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
    fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
    fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
    fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
@ -1751,7 +1752,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
    fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
-    fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
+    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
    fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
    fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
    fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
@ -1802,7 +1803,8 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
            if (cs_curr[j] < 0) { continue; }
            if (seqs.find(cs_curr[j]) == seqs.end()) {
                if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-                seqs[cs_curr[j]] = seqs.size();
+                const size_t sz = seqs.size();
                seqs[cs_curr[j]] = sz;
            }
        }
        if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -121,7 +121,7 @@ static void sampler_queue(
                   struct llama_context * ctx_main,
            const llama_sampling_params & params,
                 llama_token_data_array & cur_p,
-                                 size_t & min_keep) {
+                                 size_t   min_keep) {
    const float         temp              = params.temp;
    const float         dynatemp_range    = params.dynatemp_range;
    const float         dynatemp_exponent = params.dynatemp_exponent;
@ -249,7 +249,7 @@ static llama_token llama_sampling_sample_impl(
            id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
        } else {
            // temperature sampling
-            size_t min_keep = std::max(1, params.n_probs);
+            size_t min_keep = std::max(1, params.min_keep);
            sampler_queue(ctx_main, params, cur_p, min_keep);
--- a/common/sampling.h
+++ b/common/sampling.h
@ -22,6 +22,7 @@ enum class llama_sampler_type : char {
 typedef struct llama_sampling_params {
    int32_t     n_prev                = 64;       // number of previous tokens to remember
    int32_t     n_probs               = 0;        // if greater than 0, output the probabilities of top n_probs tokens.
    int32_t     min_keep              = 0;        // 0 = disabled, otherwise samplers should return at least min_keep tokens
    int32_t     top_k                 = 40;       // <= 0 to use vocab size
    float       top_p                 = 0.95f;    // 1.0 = disabled
    float       min_p                 = 0.05f;    // 0.0 = disabled
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@ -1533,16 +1533,17 @@ int main(int argc, char ** argv) {
        int n_past = 0;
-        ggml_cgraph gf = {};
+        struct ggml_cgraph * gf = NULL;
        gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);
        get_example_targets_batch(ctx0, 64*ex+0,  tokens_input, targets);
-        struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch);
+        struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, gf, tokens_input, n_tokens, n_past, n_batch);
        // struct ggml_tensor * e = cross_entropy_loss(ctx0, targets, logits);
        struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);
-        ggml_build_forward_expand(&gf, e);
+        ggml_build_forward_expand(gf, e);
-        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
+        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
        float error_before_opt = ggml_get_f32_1d(e, 0);
@ -1552,8 +1553,8 @@ int main(int argc, char ** argv) {
        opt_params_lbfgs.lbfgs.n_iter = 16;
        ggml_opt(ctx0, opt_params_lbfgs, e);
        //
-        ggml_build_forward_expand(&gf, e);
+        ggml_build_forward_expand(gf, e);
-        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
+        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
        float error_after_opt = ggml_get_f32_1d(e, 0);
@ -1600,13 +1601,14 @@ int main(int argc, char ** argv) {
            };
            struct ggml_context * ctx0 = ggml_init(params);
-            ggml_cgraph gf = {};
+            struct ggml_cgraph * gf = NULL;
            gf = ggml_new_graph_custom(ctx0, LLAMA_TRAIN_MAX_NODES, true);
            int n_past = 0;
-            struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);
+            struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past);
-            ggml_build_forward_expand(&gf, logits);
+            ggml_build_forward_expand(gf, logits);
-            ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
+            ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
            struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
            struct ggml_tensor * probs        = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@ -159,7 +159,7 @@ int main(int argc, char ** argv) {
    }
    LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %d, n_threads_batch = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
+    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
    LOG_TEE("\n");
    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@ -92,7 +92,7 @@ int main(int argc, char ** argv) {
    const int n_ctx    = llama_n_ctx(ctx);
-    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %d, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
+    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
    // make sure the KV cache is big enough to hold all the prompt and generated tokens
    if (n_kv_req > n_ctx) {
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@ -325,14 +325,14 @@ struct train_params {
 };
 static void print_params(struct my_llama_hparams * params) {
-    printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
+    printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
-    printf("%s: n_ctx:   %d\n", __func__, params->n_ctx);
+    printf("%s: n_ctx:   %u\n", __func__, params->n_ctx);
-    printf("%s: n_embd:  %d\n", __func__, params->n_embd);
+    printf("%s: n_embd:  %u\n", __func__, params->n_embd);
-    printf("%s: n_mult:  %d\n", __func__, params->n_mult);
+    printf("%s: n_mult:  %u\n", __func__, params->n_mult);
-    printf("%s: n_head:  %d\n", __func__, params->n_head);
+    printf("%s: n_head:  %u\n", __func__, params->n_head);
-    printf("%s: n_ff:    %d\n", __func__, params->n_ff);
+    printf("%s: n_ff:    %u\n", __func__, params->n_ff);
-    printf("%s: n_layer: %d\n", __func__, params->n_layer);
+    printf("%s: n_layer: %u\n", __func__, params->n_layer);
-    printf("%s: n_rot:   %d\n", __func__, params->n_rot);
+    printf("%s: n_rot:   %u\n", __func__, params->n_rot);
 }
 static void init_model(struct my_llama_model * model) {
@ -350,25 +350,25 @@ static void init_model(struct my_llama_model * model) {
    model->train_tokens = 0;
    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-    printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
+    printf("[%s:GG] Allocating [%u] x [%u] = [%u] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
    model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-    printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__,n_embd);
+    printf("[%s:GG] Allocating [%u] float space for model->norm\n",__func__,n_embd);
    model->output         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
    // printing the per-layer allocations here so we dont print in the for loop.
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wq for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wk for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wv for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wo for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] float space for layer.ffn_norm for [%d] layers\n",__func__,n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] float space for layer.ffn_norm for [%u] layers\n",__func__,n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w1 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w2 for [%u] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
+    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w3 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
    ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
    ggml_set_name(model->norm,           "norm.weight");
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@ -7,8 +7,6 @@
 #include <string>
 #include <thread>
 static const size_t tensor_alignment = 32;
 struct lora_info {
    std::string filename;
    float scale;
--- a/examples/json-schema-to-grammar.py
+++ b/examples/json-schema-to-grammar.py
@ -87,7 +87,21 @@ class SchemaConverter:
        elif schema_type == 'array' and 'items' in schema:
            # TODO `prefixItems` keyword
            item_rule_name = self.visit(schema['items'], f'{name}{"-" if name else ""}item')
-            rule = f'"[" space ({item_rule_name} ("," space {item_rule_name})*)? "]" space'
+            list_item_operator = f'("," space {item_rule_name})'
            successive_items = ""
            min_items = schema.get("minItems", 0)
            if min_items > 0:
               first_item = f"({item_rule_name})"
               successive_items = list_item_operator * (min_items - 1)
               min_items -= 1
            else:
               first_item = f"({item_rule_name})?"
            max_items = schema.get("maxItems")
            if max_items is not None and max_items > min_items:
                successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
            else:
                successive_items += list_item_operator + "*"
            rule = f'"[" space {first_item} {successive_items} "]" space'
            return self._add_rule(rule_name, rule)
        else:
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@ -53,7 +53,7 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-pa
 5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
 ```sh
-python ./convert.py ../llava-v1.5-7b
+python ./convert.py ../llava-v1.5-7b --skip-unknown
 ```
 Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory.
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@ -616,9 +616,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            KQ = ggml_soft_max_inplace(ctx0, KQ);
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
            KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size);
-            KQV = ggml_cont(ctx0, ggml_permute(ctx0, KQV, 0, 2, 1, 3));
+            KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-            cur = ggml_cpy(ctx0, KQV, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size));
+            cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size);
        }
        // attention output
--- a/examples/llava/llava-surgery.py
+++ b/examples/llava/llava-surgery.py
@ -19,19 +19,12 @@ mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_project
 projector = {name: checkpoint[name].float() for name in mm_tensors}
 torch.save(projector, f"{args.model}/llava.projector")
 # remove these tensors from the checkpoint and save it again
 for name in mm_tensors:
    del checkpoint[name]
 # BakLLaVA models contain CLIP tensors in it
 clip_tensors = [k for k, v in checkpoint.items() if k.startswith("model.vision_tower")]
 if len(clip_tensors) > 0:
    clip = {name.replace("vision_tower.vision_tower.", ""): checkpoint[name].float() for name in clip_tensors}
    torch.save(clip, f"{args.model}/llava.clip")
    # remove these tensors
    for name in clip_tensors:
        del checkpoint[name]
    # added tokens should be removed to be able to convert Mistral models
    if os.path.exists(f"{args.model}/added_tokens.json"):
@ -39,7 +32,6 @@ if len(clip_tensors) > 0:
            f.write("{}\n")
 torch.save(checkpoint, path)
 print("Done!")
 print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@ -310,7 +310,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens
 }
 static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
-    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
+    // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
    // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
    // Output: `perplexity: 13.5106 [114/114]`
    // BOS tokens will be added for each chunk before eval
@ -448,7 +448,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
        return perplexity_v2(ctx, params);
    }
-    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
+    // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
    // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
    // Output: `perplexity: 13.5106 [114/114]`
    // BOS tokens will be added for each chunk before eval
@ -1624,7 +1624,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    uint32_t n_ctx;
    in.read((char *)&n_ctx, sizeof(n_ctx));
    if (n_ctx > llama_n_ctx(ctx)) {
-        fprintf(stderr, "%s: %s has been computed with %d, while the current context is %d. Increase it with -c and retry\n",
+        fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
                __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
    }
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@ -24,6 +24,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
    { "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization",            },
    { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization",            },
    { "IQ1_S",  LLAMA_FTYPE_MOSTLY_IQ1_S,  " 1.56 bpw quantization",            },
    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
    { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization",            },
@ -288,9 +289,10 @@ int main(int argc, char ** argv) {
        }
    }
-    if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) && imatrix_data.empty()) {
+    if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
         params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && imatrix_data.empty()) {
        fprintf(stderr, "\n===============================================================================================\n");
-        fprintf(stderr, "Please do not use IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
+        fprintf(stderr, "Please do not use IQ1_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
        fprintf(stderr, "===============================================================================================\n\n\n");
        return 1;
    }
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -39,6 +39,8 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
 - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
 - `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
 - `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
 - `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
 - `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
 ## Build
@ -132,9 +134,11 @@ node index.js
 ## API Endpoints
 - **GET** `/health`: Returns the current state of the server:
-  - `{"status": "loading model"}` if the model is still being loaded.
+  - 503 -> `{"status": "loading model"}` if the model is still being loaded.
-  - `{"status": "error"}` if the model failed to load.
+  - 500 -> `{"status": "error"}` if the model failed to load.
-  - `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.
+  - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below.
  - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available.
  - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slot are currently available.
 - **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
@ -196,6 +200,8 @@ node index.js
    `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
    `min_keep`: If greater than 0, force samplers to return N possible tokens at minimum (default: 0)
    `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
    `slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)
@ -379,6 +385,69 @@ Notice that each `probs` is an array of length `n_probs`.
    }'
    ```
 - **GET** `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`.
 ### Result JSON
 ```json
 [
    {
        "dynatemp_exponent": 1.0,
        "dynatemp_range": 0.0,
        "frequency_penalty": 0.0,
        "grammar": "",
        "id": 0,
        "ignore_eos": false,
        "logit_bias": [],
        "min_p": 0.05000000074505806,
        "mirostat": 0,
        "mirostat_eta": 0.10000000149011612,
        "mirostat_tau": 5.0,
        "model": "llama-2-7b-32k-instruct.Q2_K.gguf",
        "n_ctx": 2048,
        "n_keep": 0,
        "n_predict": 100000,
        "n_probs": 0,
        "next_token": {
            "has_next_token": true,
            "n_remain": -1,
            "num_tokens_predicted": 0,
            "stopped_eos": false,
            "stopped_limit": false,
            "stopped_word": false,
            "stopping_word": ""
        },
        "penalize_nl": true,
        "penalty_prompt_tokens": [],
        "presence_penalty": 0.0,
        "prompt": "Say hello to llama.cpp",
        "repeat_last_n": 64,
        "repeat_penalty": 1.100000023841858,
        "samplers": [
            "top_k",
            "tfs_z",
            "typical_p",
            "top_p",
            "min_p",
            "temperature"
        ],
        "seed": 42,
        "state": 1,
        "stop": [
            "\n"
        ],
        "stream": false,
        "task_id": 0,
        "temperature": 0.0,
        "tfs_z": 1.0,
        "top_k": 40,
        "top_p": 0.949999988079071,
        "typical_p": 1.0,
        "use_penalty_prompt_tokens": false
    }
 ]
 ```
 ## More examples
 ### Change system prompt on runtime
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@ -234,6 +234,7 @@
      mirostat_eta: 0.1, // learning rate
      grammar: '',
      n_probs: 0, // no completion_probabilities,
      min_keep: 0, // min probs from each sampler,
      image_data: [],
      cache_prompt: true,
      api_key: ''
@ -791,6 +792,9 @@
            <fieldset>
              ${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
            </fieldset>
            <fieldset>
              ${IntField({ label: "Min Probabilities from each Sampler", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })}
            </fieldset>
            <fieldset>
              <label for="api_key">API Key</label>
              <input type="text" name="api_key" value="${params.value.api_key}" placeholder="Enter API key" oninput=${updateParams} />
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -29,6 +29,7 @@
 #include <chrono>
 #include <condition_variable>
 #include <atomic>
 #include <signal.h>
 using json = nlohmann::json;
@ -41,6 +42,7 @@ struct server_params
    int32_t port = 8080;
    int32_t read_timeout = 600;
    int32_t write_timeout = 600;
    bool slots_endpoint = true;
 };
 bool server_verbose = false;
@ -159,6 +161,7 @@ struct llama_client_slot
    int32_t n_decoded   = 0;
    int32_t n_remaining = -1;
    int32_t i_batch     = -1;
    int32_t n_predict   = -1;
    int32_t num_prompt_tokens           = 0;
    int32_t num_prompt_tokens_processed = 0;
@ -410,6 +413,7 @@ struct llama_server_context
            slot.id = i;
            slot.n_ctx = n_ctx_slot;
            slot.n_predict = params.n_predict;
            LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
@ -545,6 +549,16 @@ struct llama_server_context
        slot->params.seed               = json_value(data, "seed",              default_params.seed);
        slot->sparams.grammar           = json_value(data, "grammar",           default_sparams.grammar);
        slot->sparams.n_probs           = json_value(data, "n_probs",           default_sparams.n_probs);
        slot->sparams.min_keep          = json_value(data, "min_keep",          default_sparams.min_keep);
        if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
            // Might be better to reject the request with a 400 ?
            LOG_WARNING("Max tokens to predict exceeds server configuration", {
                {"params.n_predict", slot->params.n_predict},
                {"slot.n_predict", slot->n_predict},
            });
            slot->params.n_predict = slot->n_predict;
        }
        // infill
        if (data.count("input_prefix") != 0)
@ -1053,6 +1067,7 @@ struct llama_server_context
        return json {
            {"n_ctx",             slot.n_ctx},
            {"n_predict",         slot.n_predict},
            {"model",             params.model_alias},
            {"seed",              slot.params.seed},
            {"temperature",       slot.sparams.temp},
@ -1080,6 +1095,7 @@ struct llama_server_context
            {"stream",            slot.params.stream},
            {"logit_bias",        slot.sparams.logit_bias},
            {"n_probs",           slot.sparams.n_probs},
            {"min_keep",          slot.sparams.min_keep},
            {"grammar",           slot.sparams.grammar},
            {"samplers",          samplers_sequence}
        };
@ -1914,14 +1930,16 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    printf("                            set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
    printf("  --mmproj MMPROJ_FILE      path to a multimodal projector file for LLaVA.\n");
    printf("  --log-disable             disables logging to a file.\n");
    printf("  --slots-endpoint-disable  disables slots monitoring endpoint.\n");
    printf("\n");
    printf("  -n, --n-predict           maximum tokens to predict (default: %d)\n", params.n_predict);
    printf("  --override-kv KEY=TYPE:VALUE\n");
    printf("                            advanced option to override model metadata by key. may be specified multiple times.\n");
    printf("                            types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
    printf("  -gan N, --grp-attn-n N    set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
    printf("  -gaw N, --grp-attn-w N    set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
    printf("  --chat-template FORMAT_NAME");
-    printf("                            set chat template, possible valus is: llama2, chatml (default %s)", sparams.chat_template.c_str());
+    printf("                            set chat template, possible value is: llama2, chatml (default %s)", sparams.chat_template.c_str());
    printf("\n");
 }
@ -2361,6 +2379,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            log_set_target(stdout);
            LOG_INFO("logging to file is disabled.", {});
        }
        else if (arg == "--slots-endpoint-disable")
        {
            sparams.slots_endpoint = false;
        }
        else if (arg == "--chat-template")
        {
            if (++i >= argc)
@ -2512,6 +2534,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
    }
 }
 std::function<void(int)> shutdown_handler;
 inline void signal_handler(int signal) { shutdown_handler(signal); }
 int main(int argc, char **argv)
 {
 #if SERVER_VERBOSE != 1
@ -2558,13 +2583,40 @@ int main(int argc, char **argv)
        res.set_header("Access-Control-Allow-Headers", "*");
    });
-    svr.Get("/health", [&](const httplib::Request&, httplib::Response& res) {
+    svr.Get("/health", [&](const httplib::Request& req, httplib::Response& res) {
        server_state current_state = state.load();
        switch(current_state) {
-            case SERVER_STATE_READY:
+            case SERVER_STATE_READY: {
-                res.set_content(R"({"status": "ok"})", "application/json");
+                int available_slots  = 0;
-                res.status = 200; // HTTP OK
+                int processing_slots = 0;
                for (llama_client_slot &slot: llama.slots) {
                    if (slot.available()) {
                        available_slots++;
                    } else {
                        processing_slots++;
                    }
                }
                if (available_slots > 0) {
                    json health = {
                            {"status",           "ok"},
                            {"slots_idle",       available_slots},
                            {"slots_processing", processing_slots}};
                    res.set_content(health.dump(), "application/json");
                    res.status = 200; // HTTP OK
                } else {
                    json health = {
                            {"status",           "no slot available"},
                            {"slots_idle",       available_slots},
                            {"slots_processing", processing_slots}};
                    res.set_content(health.dump(), "application/json");
                    if (req.has_param("fail_on_no_slot")) {
                        res.status = 503; // HTTP Service Unavailable
                    } else {
                        res.status = 200; // HTTP OK
                    }
                }
                break;
            }
            case SERVER_STATE_LOADING_MODEL:
                res.set_content(R"({"status": "loading model"})", "application/json");
                res.status = 503; // HTTP Service Unavailable
@ -2576,6 +2628,32 @@ int main(int argc, char **argv)
        }
    });
    if (sparams.slots_endpoint) {
        svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) {
            json slots;
            for (llama_client_slot & slot : llama.slots) {
                json slot_data = llama.get_formated_generation(slot);
                slot_data["id"] = slot.id;
                slot_data["task_id"] = slot.task_id;
                slot_data["state"] = slot.state;
                slot_data["prompt"] = slot.prompt;
                slot_data["next_token"] = {
                        {"has_next_token", slot.has_next_token},
                        {"n_remain", slot.n_remaining},
                        {"num_tokens_predicted", slot.n_decoded},
                        {"stopped_eos", slot.stopped_eos},
                        {"stopped_word", slot.stopped_word},
                        {"stopped_limit", slot.stopped_limit},
                        {"stopping_word", slot.stopping_word},
                };
                slots.push_back(slot_data);
            }
            res.set_content(slots.dump(), "application/json");
            res.status = 200; // HTTP OK
        });
    }
    svr.set_logger(log_server_request);
    svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
@ -3129,8 +3207,25 @@ int main(int argc, char **argv)
        std::placeholders::_2,
        std::placeholders::_3
    ));
    llama.queue_tasks.start_loop();
    shutdown_handler = [&](int) {
        llama.queue_tasks.terminate();
    };
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
    struct sigaction sigint_action;
    sigint_action.sa_handler = signal_handler;
    sigemptyset (&sigint_action.sa_mask);
    sigint_action.sa_flags = 0;
    sigaction(SIGINT, &sigint_action, NULL);
 #elif defined (_WIN32)
    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
    };
    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif
    llama.queue_tasks.start_loop();
    svr.stop();
    t.join();
    llama_backend_free();
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@ -220,6 +220,7 @@ inline std::string format_chatml(std::vector<json> messages)
 struct llama_server_queue {
    int id = 0;
    std::mutex mutex_tasks;
    bool running;
    // queues
    std::vector<task_server> queue_tasks;
    std::vector<task_server> queue_tasks_deferred;
@ -278,9 +279,18 @@ struct llama_server_queue {
        queue_tasks_deferred.clear();
    }
-    // Start the main loop. This call is blocking
+    // end the start_loop routine
-    [[noreturn]]
+    void terminate() {
        {
            std::unique_lock<std::mutex> lock(mutex_tasks);
            running = false;
        }
        condition_tasks.notify_all();
    }
    // Start the main loop.
    void start_loop() {
        running = true;
        while (true) {
            // new task arrived
            LOG_VERBOSE("have new task", {});
@ -324,8 +334,12 @@ struct llama_server_queue {
            {
                std::unique_lock<std::mutex> lock(mutex_tasks);
                if (queue_tasks.empty()) {
                    if (!running) {
                        LOG_VERBOSE("ending start_loop", {});
                        return;
                    }
                    condition_tasks.wait(lock, [&]{
-                        return !queue_tasks.empty();
+                        return (!queue_tasks.empty() || !running);
                    });
                }
            }
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@ -111,13 +111,13 @@ static const char * LLM_TENSOR_FFN_DOWN      = "blk.%d.ffn_down";
 static const char * LLM_TENSOR_FFN_UP        = "blk.%d.ffn_up";
 static void print_params(struct my_llama_hparams * params) {
-    printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
+    printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
-    printf("%s: n_ctx:   %d\n", __func__, params->n_ctx);
+    printf("%s: n_ctx:   %u\n", __func__, params->n_ctx);
-    printf("%s: n_embd:  %d\n", __func__, params->n_embd);
+    printf("%s: n_embd:  %u\n", __func__, params->n_embd);
-    printf("%s: n_head:  %d\n", __func__, params->n_head);
+    printf("%s: n_head:  %u\n", __func__, params->n_head);
-    printf("%s: n_ff:    %d\n", __func__, params->n_ff);
+    printf("%s: n_ff:    %u\n", __func__, params->n_ff);
-    printf("%s: n_layer: %d\n", __func__, params->n_layer);
+    printf("%s: n_layer: %u\n", __func__, params->n_layer);
-    printf("%s: n_rot:   %d\n", __func__, params->n_rot);
+    printf("%s: n_rot:   %u\n", __func__, params->n_rot);
 }
 static void set_param_model(struct my_llama_model * model) {
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@ -377,6 +377,9 @@ struct ggml_gallocr {
    struct node_alloc * node_allocs; // [n_nodes]
    int n_nodes;
    struct tensor_alloc * leaf_allocs; // [n_leafs]
    int n_leafs;
 };
 ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
@ -427,6 +430,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
    free(galloc->buffers);
    free(galloc->buf_tallocs);
    free(galloc->node_allocs);
    free(galloc->leaf_allocs);
    free(galloc);
 }
@ -464,7 +468,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
            for (int i = 0; i < GGML_MAX_SRC; i++) {
                struct ggml_tensor * parent = node->src[i];
                if (parent == NULL) {
-                    break;
+                    continue;
                }
                // if the node's data is external, then we cannot re-use it
@ -544,22 +548,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
    memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
    memset(galloc->hash_values,   0, galloc->hash_set.size * sizeof(struct hash_node));
    // allocate all graph inputs first to avoid overwriting them
    for (int i = 0; i < graph->n_nodes; i++) {
        if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) {
            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
        }
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            if (graph->nodes[i]->src[j] == NULL) {
                break;
            }
            if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) {
                ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i));
            }
        }
    }
    // count number of children and views
    // allocate all graph inputs and leafs first to avoid overwriting them
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
@ -568,14 +558,37 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
            ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
        }
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
+        if (node->flags & GGML_TENSOR_FLAG_INPUT) {
-            struct ggml_tensor * parent = node->src[j];
+            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
            if (parent == NULL) {
                break;
            }
            ggml_gallocr_hash_get(galloc, parent)->n_children += 1;
        }
-   }
+
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
                continue;
            }
            ggml_gallocr_hash_get(galloc, src)->n_children += 1;
            // allocate explicit inputs and leafs
            if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
                ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
            }
        }
    }
    // allocate the remaining leafs that are unused on the graph
    // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
    for (int i = 0; i < graph->n_leafs; i++) {
        struct ggml_tensor * leaf = graph->leafs[i];
        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
        if (hn->n_children == 0) {
            assert(!hn->allocated);
            // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
            ggml_gallocr_allocate_node(galloc, leaf, 0);
        }
    }
    // allocate tensors
    for (int i = 0; i < graph->n_nodes; i++) {
@ -586,7 +599,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * parent = node->src[j];
            if (parent == NULL) {
-                break;
+                continue;
            }
            ggml_gallocr_allocate_node(galloc, parent, buffer_id);
        }
@ -598,7 +611,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * parent = node->src[j];
            if (parent == NULL) {
-                break;
+                continue;
            }
            AT_PRINTF("%s", parent->name);
            if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
@ -611,7 +624,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * parent = node->src[j];
            if (parent == NULL) {
-                break;
+                continue;
            }
            struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
            p_hn->n_children -= 1;
@ -696,6 +709,18 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
            }
        }
    }
    if (galloc->n_leafs < graph->n_leafs) {
        free(galloc->leaf_allocs);
        galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
        GGML_ASSERT(galloc->leaf_allocs != NULL);
    }
    galloc->n_leafs = graph->n_leafs;
    for (int i = 0; i < graph->n_leafs; i++) {
        struct ggml_tensor * leaf = graph->leafs[i];
        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
        galloc->leaf_allocs[i].offset = hn->offset;
        galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
    }
    // reallocate buffers if needed
    for (int i = 0; i < galloc->n_buffers; i++) {
@ -722,8 +747,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
    return ggml_gallocr_reserve_n(galloc, graph, NULL);
 }
-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * node_alloc, struct tensor_alloc * tensor_alloc) {
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) {
-    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
+    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
    if (node->view_src != NULL) {
        if (node->buffer == NULL) {
@ -732,29 +757,20 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
                // this tensor was allocated without ggml-backend
                return;
            }
-            ggml_backend_view_init(galloc->buffers[node_alloc->buffer_id], node);
+            ggml_backend_view_init(galloc->buffers[buffer_id], node);
        }
    } else {
        if (node->data == NULL) {
            assert(tensor_alloc->offset != SIZE_MAX);
-            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
+            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
-            void * base = ggml_backend_buffer_get_base(galloc->buffers[node_alloc->buffer_id]);
+            void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
            void * addr = (char *)base + tensor_alloc->offset;
-            ggml_backend_tensor_alloc(galloc->buffers[node_alloc->buffer_id], node, addr);
+            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr);
        } else {
            if (node->buffer == NULL) {
                // this tensor was allocated without ggml-backend
                return;
            }
 #ifndef NDEBUG
            size_t offset =
                (char *)node->data -
                (char *)ggml_backend_buffer_get_base(node->buffer);
            size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node);
            assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset);
            assert(tensor_alloc->offset == SIZE_MAX || size <= tensor_alloc->size_max);
 #endif
        }
    }
 }
@ -773,6 +789,13 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
        return true;
    }
    if (galloc->n_leafs != graph->n_leafs) {
 #ifndef NDEBUG
        fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
 #endif
        return true;
    }
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        struct node_alloc * node_alloc = &galloc->node_allocs[i];
@ -787,7 +810,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
-                break;
+                continue;
            }
            if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
 #ifndef NDEBUG
@ -827,17 +850,24 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
    }
    // allocate the graph tensors from the previous assignments
    // nodes
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        struct node_alloc * node_alloc = &galloc->node_allocs[i];
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
-                break;
+                continue;
            }
-            ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]);
+            ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
        }
-        ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst);
+        ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
    }
    // leafs
    for (int i = 0; i < graph->n_leafs; i++) {
        struct ggml_tensor * leaf = graph->leafs[i];
        struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
        ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
    }
    return true;
--- a/ggml-backend.c
+++ b/ggml-backend.c
@ -756,7 +756,7 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    switch (op->op) {
        case GGML_OP_CPY:
-            return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS; // missing type_traits.from_float
+            return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS && op->type != GGML_TYPE_IQ1_S; // missing type_traits.from_float
        case GGML_OP_MUL_MAT:
            return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
        default:
@ -1006,6 +1006,7 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, gg
        }
    }
    GGML_ASSERT(false && "tensor buffer type not supported by any backend");
    return -1; // silence warning
 }
 #if 0
@ -1040,7 +1041,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        const struct ggml_tensor * src = tensor->src[i];
        if (src == NULL) {
-            break;
+            continue;
        }
        if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
            int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer);
@ -1087,7 +1088,7 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
-                break;
+                continue;
            }
            ggml_backend_t src_backend = tensor_backend(src);
            fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
@ -1143,7 +1144,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
-                break;
+                continue;
            }
            if (tensor_backend_id(src) == -1) {
                tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
@ -1255,7 +1256,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
-                break;
+                continue;
            }
            int src_backend_id = tensor_backend_id(src);
            if (src_backend_id == -1) {
@ -1314,7 +1315,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
            for (int j = 0; j < GGML_MAX_SRC; j++) {
                struct ggml_tensor * src = node->src[j];
                if (src == NULL) {
-                    break;
+                    continue;
                }
                int src_backend_id = tensor_backend_id(src);
                assert(src_backend_id != -1); // all inputs should be assigned by now
@ -1361,7 +1362,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (src == NULL) {
-                break;
+                continue;
            }
            ggml_backend_t src_backend = tensor_backend(src);
            if (src_backend != tensor_backend /* && src_backend != NULL */) {
@ -1667,7 +1668,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        struct ggml_tensor * s = src->src[i];
        if (s == NULL) {
-            break;
+            continue;
        }
        dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
    }
@ -1696,7 +1697,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        struct ggml_tensor * s = src->src[i];
        if (s == NULL) {
-            break;
+            continue;
        }
        graph_copy_init_tensor(hash_set, node_copies, node_init, s);
    }
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -54,6 +54,8 @@
 #define cudaDeviceProp hipDeviceProp_t
 #define cudaDeviceSynchronize hipDeviceSynchronize
 #define cudaError_t hipError_t
 #define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
 #define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
 #define cudaEventCreateWithFlags hipEventCreateWithFlags
 #define cudaEventDisableTiming hipEventDisableTiming
 #define cudaEventRecord hipEventRecord
@ -517,6 +519,15 @@ typedef struct {
 } block_iq3_xxs;
 static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
 #define QR1_S 8
 #define QI1_S (QK_K / (4*QR1_S))
 typedef struct {
    half d;
    uint8_t qs[QK_K/8];
    uint8_t scales[QK_K/16];
 } block_iq1_s;
 static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
 #define WARP_SIZE 32
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
@ -643,18 +654,18 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
    return a;
 }
-static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
+//static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-#pragma unroll
+//#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
+//    for (int mask = 16; mask > 0; mask >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
+//        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
-    }
+//    }
-    return a;
+//    return a;
-#else
+//#else
-    (void) a;
+//    (void) a;
-    NO_DEVICE_CODE;
+//    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-}
+//}
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
@ -664,18 +675,18 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
    return x;
 }
-static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
+//static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
+//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
-#pragma unroll
+//#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
+//    for (int mask = 16; mask > 0; mask >>= 1) {
-        x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+//        x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
-    }
+//    }
-    return x;
+//    return x;
-#else
+//#else
-    (void) x;
+//    (void) x;
-    NO_DEVICE_CODE;
+//    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
+//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
-}
+//}
 static __device__ __forceinline__ float op_repeat(const float a, const float b) {
    return b;
@ -1682,6 +1693,137 @@ static const __device__ uint32_t iq3xxs_grid[256] = {
    0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
 };
 static const __device__ uint64_t iq1s_grid[512] = {
    0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
    0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
    0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
    0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
    0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
    0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
    0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
    0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
    0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
    0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
    0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
    0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
    0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
    0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
    0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
    0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
    0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
    0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
    0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
    0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
    0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
    0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
    0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
    0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
    0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
    0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
    0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
    0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
    0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
    0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
    0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
    0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
    0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
    0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
    0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
    0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
    0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
    0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
    0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
    0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
    0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
    0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
    0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
    0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
    0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
    0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
    0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
    0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
    0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
    0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
    0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
    0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
    0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
    0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
    0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
    0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
    0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
    0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
    0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
    0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
    0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
    0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
    0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
    0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
    0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
    0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
    0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
    0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
    0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
    0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
    0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
    0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
    0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
    0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
    0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
    0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
    0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
    0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
    0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
    0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
    0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
    0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
    0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
    0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
    0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
    0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
    0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
    0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
    0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
    0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
    0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
    0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
    0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
    0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
    0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
    0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
    0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
    0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
    0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
    0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
    0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
    0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
    0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
    0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
    0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
    0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
    0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
    0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
    0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
    0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
    0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
    0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
    0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
    0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
    0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
    0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
    0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
    0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
    0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
    0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
    0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
    0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
    0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
    0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
    0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
    0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
    0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
    0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
 };
 static const __device__ uint8_t ksigns_iq2xs[128] = {
      0, 129, 130,   3, 132,   5,   6, 135, 136,   9,  10, 139,  12, 141, 142,  15,
    144,  17,  18, 147,  20, 149, 150,  23,  24, 153, 154,  27, 156,  29,  30, 159,
@ -1824,6 +1966,29 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
 }
 template<typename dst_t>
 static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
    const int i   = blockIdx.x;
    const block_iq1_s * x = (const block_iq1_s  *) vx;
    const int tid = threadIdx.x;
 #if QK_K == 256
    const int il = tid/8; // 0...3
    const int ib = tid%8; // 0...7
    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
    const int i8 = 4*ib+il;
    uint8_t h = x[i].scales[i8/2] >> 4*(i8%2);
    const int8_t * grid = (const int8_t *)(iq1s_grid + (x[i].qs[i8] | ((h & 8) << 5)));
    const float d = (float)x[i].d * (2*(h & 7) + 1);
    for (int j = 0; j < 8; ++j) y[j] = d * grid[j];
 #else
    assert(false);
 #endif
 }
 static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
@ -4479,10 +4644,12 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
    const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
 #else
    (void) ksigns64;
    assert(false);
    return 0.f;
 #endif
 #else
    (void) ksigns64;
    assert(false);
    return 0.f;
 #endif
@ -4523,6 +4690,49 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
 #endif
 }
 static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 #if QK_K == 256
    const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
    const int ib32 = iqs;
    int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0;
    const uint8_t h1 = bq1->scales[2*ib32+0];
    const uint8_t h2 = bq1->scales[2*ib32+1];
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    const int * q8 = (const int *)bq8_1[ib32].qs;
    const int * grid1 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5)));
    const int * grid2 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1)));
    const int * grid3 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5)));
    const int * grid4 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1)));
    for (int j = 0; j < 2; ++j) {
        sumi1 = __dp4a(q8[j+0], grid1[j], sumi1);
        sumi2 = __dp4a(q8[j+2], grid2[j], sumi2);
        sumi3 = __dp4a(q8[j+4], grid3[j], sumi3);
        sumi4 = __dp4a(q8[j+6], grid4[j], sumi4);
    }
 #else
    const int8_t   * q8 = bq8_1[ib32].qs;
    const int8_t * grid1 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5)));
    const int8_t * grid2 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1)));
    const int8_t * grid3 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5)));
    const int8_t * grid4 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1)));
    for (int j = 0; j < 8; ++j) {
        sumi1 += q8[j+ 0] * grid1[j];
        sumi2 += q8[j+ 8] * grid2[j];
        sumi3 += q8[j+16] * grid3[j];
        sumi4 += q8[j+24] * grid4[j];
    }
 #endif
    const float d = (float)bq1->d * __low2float(bq8_1[ib32].ds);
    return d * (sumi1 * (2*(h1 & 7) + 1) + sumi2 * (2*((h1 >> 4) & 7) + 1) +
                sumi3 * (2*(h2 & 7) + 1) + sumi4 * (2*((h2 >> 4) & 7) + 1));
 #else
    assert(false);
    return 0.f;
 #endif
 }
 template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
              allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
 static __device__ __forceinline__ void mul_mat_q(
@ -5957,149 +6167,31 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
 }
 template <bool vals_smem, int ncols_template, int block_size_template, bool need_check>
 static __global__ void soft_max_f16(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
    const int ncols_data = ncols_template == 0 ? ncols_par : ncols_template;
    const int ncols_smem = GGML_PAD(ncols_data, 2*WARP_SIZE)/2;
    const int tid  = threadIdx.x;
    const int rowx = blockIdx.x;
    const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
    const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
    const int warp_id = threadIdx.x / WARP_SIZE;
    const int lane_id = threadIdx.x % WARP_SIZE;
    extern __shared__ half data_soft_max_f16[];
    half  * buf_iw = data_soft_max_f16 + 0; // shared memory buffer for inter-warp communication
    // (shared memory) buffer to cache values between iterations:
    half2 * vals = vals_smem ? (half2 *) (buf_iw + WARP_SIZE) : (half2 *) (dst + rowx*ncols_data);
    // if the buffer is larger than max. shared memory per block, use dst as temp. buffer instead
    // in that case col_smem == col_data must be enforced to avoid race conditions
    half2 max_val = make_half2(-INFINITY, -INFINITY);
 #pragma unroll
    for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
        const int col_data = 2*col0 + 2*WARP_SIZE*warp_id + lane_id;
        const int col_smem = vals_smem ? col0 + tid : col_data;
        const int ix = rowx*ncols_data + col_data;
        const int iy = rowy*ncols_data + col_data;
        half2 val;
        if (need_check && col_data + 0 >= ncols_data) {
            val.x = -INFINITY;
        } else {
            val.x = x[ix + 0]*scale + (y ? y[iy + 0] : 0.0f);
        }
        if (need_check && col_data + WARP_SIZE >= ncols_data) {
            val.y = -INFINITY;
        } else {
            val.y = x[ix + WARP_SIZE]*scale + (y ? y[iy + WARP_SIZE] : 0.0f);
        }
        if (!need_check || col_smem < (vals_smem ? ncols_smem : ncols_data)) {
            vals[col_smem] = val;
        }
        max_val = __hmax2(max_val, val);
    }
    // find the max value in the block
    max_val = warp_reduce_max(max_val);
    if (block_size > WARP_SIZE) {
        if (warp_id == 0) {
            buf_iw[lane_id] = -INFINITY;
        }
        __syncthreads();
        if (lane_id == 0) {
            buf_iw[warp_id] = __hmax(max_val.x, max_val.y);
        }
        __syncthreads();
        max_val = __half2half2(buf_iw[lane_id]);
        max_val = warp_reduce_max(max_val);
    } else {
        max_val = __half2half2(__hmax(max_val.x, max_val.y));
    }
    half2 tmp = make_half2(0.0f, 0.0f); // partial sums
 #pragma unroll
    for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
        const int col_smem = vals_smem ? col0 + tid : 2*col0 + 2*warp_id*WARP_SIZE + lane_id;
        if (ncols_template == 0 && col_smem >= (vals_smem ? ncols_smem : ncols_data)) {
            break;
        }
        const half2 val = h2exp(vals[col_smem] - max_val);
        tmp += val;
        vals[col_smem] = val;
    }
    // find the sum of exps in the block
    tmp = warp_reduce_sum(tmp);
    if (block_size > WARP_SIZE) {
        if (warp_id == 0) {
            buf_iw[lane_id] = 0.0f;
        }
        __syncthreads();
        if (lane_id == 0) {
            buf_iw[warp_id] = tmp.x + tmp.y;
        }
        __syncthreads();
        tmp = __half2half2(buf_iw[lane_id]);
        tmp = warp_reduce_sum(tmp);
    } else {
        tmp = __half2half2(tmp.x + tmp.y);
    }
    const half2 inv_sum = make_half2(1.0f, 1.0f) / tmp;
 #pragma unroll
    for (int col0 = 0; col0 < ncols_smem; col0 += block_size) {
        const int col_data = 2*col0 + 2*WARP_SIZE*warp_id + lane_id;
        const int col_smem = vals_smem ? col0 + tid : col_data;
        const int idst = rowx*ncols_data + col_data;
        const half2 result = vals[col_smem] * inv_sum;
        if (need_check && col_data + 0 >= ncols_data) {
            return;
        }
        dst[idst] = result.x;
        if (need_check && col_data + WARP_SIZE >= ncols_data) {
            return;
        }
        dst[idst + WARP_SIZE] = result.y;
    }
 #else
    (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
    NO_DEVICE_CODE;
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 }
 template <bool vals_smem, int ncols_template, int block_size_template>
-static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
+static __global__ void soft_max_f32(const float * x, const float * mask, const float * pos, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
    const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
    const int tid  = threadIdx.x;
    const int rowx = blockIdx.x;
-    const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
+    const int rowy = rowx % nrows_y; // broadcast the mask in the row dimension
    const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
    const int warp_id = threadIdx.x / WARP_SIZE;
    const int lane_id = threadIdx.x % WARP_SIZE;
    float slope = 0.0f;
    // ALiBi
    if (max_bias > 0.0f) {
        const int h = rowx/nrows_y; // head index
        const float base = h < n_head_log2 ? m0 : m1;
        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
        slope = powf(base, exp);
    }
    extern __shared__ float data_soft_max_f32[];
    float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
    // shared memory buffer to cache values between iterations:
@ -6118,7 +6210,8 @@ static __global__ void soft_max_f32(const float * x, const float * y, float * ds
        const int ix = rowx*ncols + col;
        const int iy = rowy*ncols + col;
-        const float val = x[ix]*scale + (y ? y[iy] : 0.0f);
+        const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
        vals[col] = val;
        max_val = max(max_val, val);
    }
@ -6679,6 +6772,12 @@ static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k,
    dequantize_block_iq3_xxs<<<nb, 32, 0, stream>>>(vx, y);
 }
 template<typename dst_t>
 static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
    const int nb = k / QK_K;
    dequantize_block_iq1_s<<<nb, 32, 0, stream>>>(vx, y);
 }
 template <typename src_t, typename dst_t>
 static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
@ -6718,6 +6817,8 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
            return dequantize_row_iq2_xs_cuda;
        case GGML_TYPE_IQ3_XXS:
            return dequantize_row_iq3_xxs_cuda;
        case GGML_TYPE_IQ1_S:
            return dequantize_row_iq1_s_cuda;
        case GGML_TYPE_F32:
            return convert_unary_cuda<float>;
        default:
@ -6753,6 +6854,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
            return dequantize_row_iq2_xs_cuda;
        case GGML_TYPE_IQ3_XXS:
            return dequantize_row_iq3_xxs_cuda;
        case GGML_TYPE_IQ1_S:
            return dequantize_row_iq1_s_cuda;
        case GGML_TYPE_F16:
            return convert_unary_cuda<half>;
        default:
@ -7590,89 +7693,53 @@ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols
    diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
 }
-static void soft_max_f16_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
+static void soft_max_f32_cuda(const float * x, const float * mask, const float * pos, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) {
    int nth = WARP_SIZE;
    while (nth < ncols_x/2 && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
    const dim3 block_dims(nth,     1, 1);
    const dim3 block_nums(nrows_x, 1, 1);
    const size_t shmem = (GGML_PAD(ncols_x, 2*WARP_SIZE) + WARP_SIZE)*sizeof(half);
    static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
    if (shmem <= g_device_caps[g_main_device].smpb) {
        switch (ncols_x) {
            case 32:
                soft_max_f16<true, 32, 32, true><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
                break;
            case 64:
                soft_max_f16<true, 64, 32, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
                break;
            case 128:
                soft_max_f16<true, 128, 64, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
                break;
            case 256:
                soft_max_f16<true, 256, 128, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
                break;
            case 512:
                soft_max_f16<true, 512, 256, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
                break;
            case 1024:
                soft_max_f16<true, 1024, 512, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
                break;
            case 2048:
                soft_max_f16<true, 2048, 1024, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
                break;
            case 4096:
                soft_max_f16<true, 4096, 1024, false><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
                break;
            default:
                soft_max_f16<true, 0, 0, true><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
                break;
        }
    } else {
        const size_t shmem_low = WARP_SIZE*sizeof(half);
        soft_max_f16<false, 0, 0, true><<<block_nums, block_dims, shmem_low, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
    }
 }
 static void soft_max_f32_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
    int nth = WARP_SIZE;
    while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
    const dim3 block_dims(nth,     1, 1);
    const dim3 block_nums(nrows_x, 1, 1);
    const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
    static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
    const uint32_t n_head_kv   = nrows_x/nrows_y;
    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
    if (shmem < g_device_caps[g_main_device].smpb) {
        switch (ncols_x) {
            case 32:
-                soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+                soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 64:
-                soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+                soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 128:
-                soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+                soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 256:
-                soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+                soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 512:
-                soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+                soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 1024:
-                soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+                soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 2048:
-                soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+                soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            case 4096:
-                soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+                soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
            default:
-                soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+                soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
                break;
        }
    } else {
        const size_t shmem_low = WARP_SIZE*sizeof(float);
-        soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, y, dst, ncols_x, nrows_y, scale);
+        soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
    }
 }
@ -8523,6 +8590,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ1_S:
            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
        default:
            GGML_ASSERT(false);
@ -8546,6 +8614,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUD
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ1_S:
            return max_compute_capability >= CC_VOLTA ? 128 : 64;
        case GGML_TYPE_Q6_K:
            return 64;
@ -8643,6 +8712,10 @@ static void ggml_cuda_op_mul_mat_vec_q(
            mul_mat_vec_q_cuda<QK_K, QI3_XXS, block_iq3_xxs, 1, vec_dot_iq3_xxs_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        case GGML_TYPE_IQ1_S:
            mul_mat_vec_q_cuda<QK_K, QI1_S, block_iq1_s, 1, vec_dot_iq1_s_q8_1>
                (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
            break;
        default:
            GGML_ASSERT(false);
            break;
@ -9082,30 +9155,36 @@ static void ggml_cuda_op_soft_max(
    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
-    const int64_t ne00 = src0->ne[0];
+    const int64_t ne00    = src0->ne[0];
    const int64_t nrows_x = ggml_nrows(src0);
-    const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
+    const int64_t nrows_y = src0->ne[1];
-    float scale = 1.0f;
+    float scale    = 1.0f;
-    memcpy(&scale, dst->op_params, sizeof(float));
+    float max_bias = 0.0f;
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION >= CUDART_HMAX
+    memcpy(&scale,    (float *) dst->op_params + 0, sizeof(float));
-#ifdef GGML_CUDA_F16
+    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
    const bool use_f16_soft_max = true;
 #else
    const bool use_f16_soft_max = false;
 #endif // GGML_CUDA_F16
 #else
    const bool use_f16_soft_max = false;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && CUDART_VERSION >= CUDART_HMAX
-    if (use_f16_soft_max) {
+    // positions tensor
-        soft_max_f16_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
+    float * src2_dd = nullptr;
-    } else {
+    cuda_pool_alloc<float> src2_f;
-        soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
+
    ggml_tensor * src2 = dst->src[2];
    const bool use_src2 = src2 != nullptr;
    if (use_src2) {
        const bool src2_on_device = src2->backend == GGML_BACKEND_GPU;
        if (src2_on_device) {
            ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
            src2_dd = (float *) src2_extra->data_device[g_main_device];
        } else {
            src2_dd = src2_f.alloc(ggml_nelements(src2));
            CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
        }
    }
-    (void) dst;
+    soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream);
 }
 static void ggml_cuda_op_scale(
@ -9240,9 +9319,15 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
            CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
            if (can_access_peer) {
                if (enable_peer_access) {
-                    CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
+                    cudaError_t err = cudaDeviceEnablePeerAccess(id_other, 0);
                    if (err != cudaErrorPeerAccessAlreadyEnabled) {
                        CUDA_CHECK(err);
                    }
                } else {
-                    CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
+                    cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
                    if (err != cudaErrorPeerAccessNotEnabled) {
                        CUDA_CHECK(err);
                    }
                }
            }
        }
@ -10920,10 +11005,10 @@ GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backe
    UNUSED(buffer);
 }
-// unused at the moment
+static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
-//static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
+    return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
-//    return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
+    UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
-//}
+}
 GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
@ -11311,7 +11396,7 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            if (node->src[j] != nullptr) {
                assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT);
-                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
                assert(node->src[j]->extra != nullptr);
            }
        }
@ -11359,7 +11444,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                    return false;
                }
                ggml_type a_type = a->type;
-                if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS) {
+                if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ1_S) {
                    if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
                        return false;
                    }
--- a/ggml-metal.m
+++ b/ggml-metal.m
@ -61,6 +61,7 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS,
    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,
    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS,
    GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S,
    GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,
    GGML_METAL_KERNEL_TYPE_RMS_NORM,
    GGML_METAL_KERNEL_TYPE_GROUP_NORM,
@ -83,6 +84,7 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,
  //GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,
    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32,
@ -101,6 +103,7 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,
@ -116,6 +119,7 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,
@ -131,6 +135,7 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,
    GGML_METAL_KERNEL_TYPE_ROPE_F32,
    GGML_METAL_KERNEL_TYPE_ROPE_F16,
    GGML_METAL_KERNEL_TYPE_ALIBI_F32,
@ -176,7 +181,7 @@ struct ggml_metal_context {
 // MSL code
 // TODO: move the contents here when ready
 //       for now it is easier to work in a separate file
-//static NSString * const msl_library_source = @"see metal.metal";
+// static NSString * const msl_library_source = @"see metal.metal";
 // Here to assist with NSBundle Path Hack
@interface GGMLMetalClass : NSObject
@ -272,6 +277,14 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
                return NULL;
            }
        } else {
 #if GGML_METAL_EMBED_LIBRARY
            GGML_METAL_LOG_INFO("%s: using embedded metal library\n", __func__);
            extern const char ggml_metallib_start[];
            extern const char ggml_metallib_end[];
            NSString * src  = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
 #else
            GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
            NSString * sourcePath;
@ -294,6 +307,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
                GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
                return NULL;
            }
 #endif
            @autoreleasepool {
                // dictionary of preprocessor macros
@ -433,6 +447,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS,          get_rows_iq2_xxs,       true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,           get_rows_iq2_xs,        true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS,          get_rows_iq3_xxs,       true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S,            get_rows_iq1_s,         true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,              get_rows_i32,           true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM,                  rms_norm,               ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM,                group_norm,             ctx->support_simdgroup_reduction);
@ -455,6 +470,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32,        mul_mv_iq2_xxs_f32,     ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,         mul_mv_iq2_xs_f32,      ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32,        mul_mv_iq3_xxs_f32,     ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32,          mul_mv_iq1_s_f32,       ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,         mul_mv_id_f32_f32,      ctx->support_simdgroup_reduction);
      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,         mul_mv_id_f16_f16,      ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32,         mul_mv_id_f16_f32,      ctx->support_simdgroup_reduction);
@ -473,6 +489,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32,     mul_mv_id_iq2_xxs_f32,  ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,      mul_mv_id_iq2_xs_f32,   ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32,     mul_mv_id_iq3_xxs_f32,  ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32,       mul_mv_id_iq1_s_f32,    ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,            mul_mm_f32_f32,         ctx->support_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,            mul_mm_f16_f32,         ctx->support_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,           mul_mm_q4_0_f32,        ctx->support_simdgroup_mm);
@ -488,6 +505,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32,        mul_mm_iq2_xxs_f32,     ctx->support_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,         mul_mm_iq2_xs_f32,      ctx->support_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32,        mul_mm_iq3_xxs_f32,     ctx->support_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32,          mul_mm_iq1_s_f32,       ctx->support_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,         mul_mm_id_f32_f32,      ctx->support_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,         mul_mm_id_f16_f32,      ctx->support_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,        mul_mm_id_q4_0_f32,     ctx->support_simdgroup_mm);
@ -503,6 +521,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,     mul_mm_id_iq2_xxs_f32,  ctx->support_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,      mul_mm_id_iq2_xs_f32,   ctx->support_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,     mul_mm_id_iq3_xxs_f32,  ctx->support_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,       mul_mm_id_iq1_s_f32,    ctx->support_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32,                  rope_f32,               true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16,                  rope_f16,               true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32,                 alibi_f32,              true);
@ -728,6 +747,7 @@ static bool ggml_metal_graph_compute(
        size_t offs_src0 = 0;
        size_t offs_src1 = 0;
        size_t offs_src2 = 0;
        size_t offs_dst  = 0;
        id<MTLCommandBuffer> command_buffer  = command_buffers[cb_idx];
@ -746,6 +766,7 @@ static bool ggml_metal_graph_compute(
            struct ggml_tensor * src0 = gf->nodes[i]->src[0];
            struct ggml_tensor * src1 = gf->nodes[i]->src[1];
            struct ggml_tensor * src2 = gf->nodes[i]->src[2];
            struct ggml_tensor * dst  = gf->nodes[i];
            switch (dst->op) {
@ -807,6 +828,7 @@ static bool ggml_metal_graph_compute(
            id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(src0, &offs_src0) : nil;
            id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(src1, &offs_src1) : nil;
            id<MTLBuffer> id_src2 = src2 ? ggml_metal_get_buffer(src2, &offs_src2) : nil;
            id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(dst,  &offs_dst)  : nil;
            //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
@ -1188,7 +1210,16 @@ static bool ggml_metal_graph_compute(
                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline;
                        }
-                        const float scale = ((float *) dst->op_params)[0];
+                        const float scale    = ((float *) dst->op_params)[0];
                        const float max_bias = ((float *) dst->op_params)[1];
                        const int64_t nrows_x = ggml_nrows(src0);
                        const int64_t nrows_y = src0->ne[1];
                        const uint32_t n_head_kv   = nrows_x/nrows_y;
                        const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
                        const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
                        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
                        [encoder setComputePipelineState:pipeline];
                        [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
@ -1197,11 +1228,20 @@ static bool ggml_metal_graph_compute(
                        } else {
                            [encoder setBuffer:id_src0 offset:offs_src0   atIndex:1];
                        }
-                        [encoder setBuffer:id_dst  offset:offs_dst    atIndex:2];
+                        if (id_src2) {
-                        [encoder setBytes:&ne00  length:sizeof(ne00)  atIndex:3];
+                            [encoder setBuffer:id_src2 offset:offs_src2   atIndex:2];
-                        [encoder setBytes:&ne01  length:sizeof(ne01)  atIndex:4];
+                        } else {
-                        [encoder setBytes:&ne02  length:sizeof(ne02)  atIndex:5];
+                            [encoder setBuffer:id_src0 offset:offs_src0   atIndex:2];
-                        [encoder setBytes:&scale length:sizeof(scale) atIndex:6];
+                        }
                        [encoder setBuffer:id_dst   offset:offs_dst          atIndex:3];
                        [encoder setBytes:&ne00     length:sizeof(ne00)      atIndex:4];
                        [encoder setBytes:&ne01     length:sizeof(ne01)      atIndex:5];
                        [encoder setBytes:&ne02     length:sizeof(ne02)      atIndex:6];
                        [encoder setBytes:&scale    length:sizeof(scale)     atIndex:7];
                        [encoder setBytes:&max_bias length:sizeof(max_bias)  atIndex:8];
                        [encoder setBytes:&m0       length:sizeof(m0)        atIndex:9];
                        [encoder setBytes:&m1       length:sizeof(m1)        atIndex:10];
                        [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:11];
                        [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
                        [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
@ -1297,6 +1337,7 @@ static bool ggml_metal_graph_compute(
                                case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break;
                                case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
                                case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break;
                                case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32  ].pipeline; break;
                                default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
                            }
@ -1431,6 +1472,12 @@ static bool ggml_metal_graph_compute(
                                        nth1 = 16;
                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32].pipeline;
                                    } break;
                                case GGML_TYPE_IQ1_S:
                                    {
                                        nth0 = 4;
                                        nth1 = 16;
                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32].pipeline;
                                    } break;
                                default:
                                    {
                                        GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
@ -1465,7 +1512,7 @@ static bool ggml_metal_graph_compute(
                            if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
                                src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 ||
-                                src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) {
+                                src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_IQ1_S) { // || src0t == GGML_TYPE_Q4_K) {
                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            }
                            else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
@ -1514,8 +1561,6 @@ static bool ggml_metal_graph_compute(
                        // max size of the src1ids array in the kernel stack
                        GGML_ASSERT(ne11 <= 512);
                        struct ggml_tensor * src2 = gf->nodes[i]->src[2];
                        const int64_t  ne20 = src2 ? src2->ne[0] : 0;
                        const int64_t  ne21 = src2 ? src2->ne[1] : 0;
                        const int64_t  ne22 = src2 ? src2->ne[2] : 0;
@ -1573,6 +1618,7 @@ static bool ggml_metal_graph_compute(
                                case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break;
                                case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
                                case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break;
                                case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32  ].pipeline; break;
                                default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
                            }
@ -1710,6 +1756,12 @@ static bool ggml_metal_graph_compute(
                                        nth1 = 16;
                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32].pipeline;
                                    } break;
                                case GGML_TYPE_IQ1_S:
                                    {
                                        nth0 = 4;
                                        nth1 = 16;
                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32].pipeline;
                                    } break;
                                default:
                                    {
                                        GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
@ -1760,7 +1812,7 @@ static bool ggml_metal_graph_compute(
                            if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 ||
                                src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 ||
-                                src2t == GGML_TYPE_Q2_K) { // || src2t == GGML_TYPE_Q4_K) {
+                                src2t == GGML_TYPE_Q2_K || src2t == GGML_TYPE_IQ1_S) { // || src2t == GGML_TYPE_Q4_K) {
                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            }
                            else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) {
@ -1814,6 +1866,7 @@ static bool ggml_metal_graph_compute(
                            case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break;
                            case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break;
                            case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break;
                            case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S  ].pipeline; break;
                            case GGML_TYPE_I32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32    ].pipeline; break;
                            default: GGML_ASSERT(false && "not implemented");
                        }
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@ -351,12 +351,17 @@ kernel void kernel_sum_rows(
 kernel void kernel_soft_max(
        device const float * src0,
        device const float * src1,
        device const float * src2,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant     float & scale,
-        threadgroup float  * buf [[threadgroup(0)]],
+        constant     float & max_bias,
        constant     float & m0,
        constant     float & m1,
        constant  uint32_t & n_head_log2,
        threadgroup  float * buf [[threadgroup(0)]],
        uint  tgpig[[threadgroup_position_in_grid]],
        uint  tpitg[[thread_position_in_threadgroup]],
        uint  sgitg[[simdgroup_index_in_threadgroup]],
@ -368,13 +373,26 @@ kernel void kernel_soft_max(
    device const float * psrc0 =         src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
    device const float * pmask = src1 != src0 ? src1                               + i01*ne00 : nullptr;
    device const float * ppos  = src2 != src0 ? src2                                          : nullptr;
    device       float * pdst  =         dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
    float slope = 0.0f;
    // ALiBi
    if (max_bias > 0.0f) {
        const int64_t h = i02;
        const float base = h < n_head_log2 ? m0 : m1;
        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
        slope = pow(base, exp);
    }
    // parallel max
    float lmax = -INFINITY;
    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f));
+        lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
    }
    // find the max value in the block
@ -399,7 +417,7 @@ kernel void kernel_soft_max(
    // parallel sum
    float lsum = 0.0f;
    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
+        const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
        lsum += exp_psrc0;
        pdst[i00] = exp_psrc0;
    }
@ -437,12 +455,17 @@ kernel void kernel_soft_max(
 kernel void kernel_soft_max_4(
        device const float * src0,
        device const float * src1,
        device const float * src2,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant     float & scale,
-        threadgroup float  * buf [[threadgroup(0)]],
+        constant     float & max_bias,
        constant     float & m0,
        constant     float & m1,
        constant  uint32_t & n_head_log2,
        threadgroup  float * buf [[threadgroup(0)]],
        uint  tgpig[[threadgroup_position_in_grid]],
        uint  tpitg[[thread_position_in_threadgroup]],
        uint  sgitg[[simdgroup_index_in_threadgroup]],
@ -454,13 +477,25 @@ kernel void kernel_soft_max_4(
    device const float4 * psrc4 =                (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
    device const float4 * pmask = src1 != src0 ? (device const float4 *)(src1 +                                      i01*ne00) : nullptr;
    device const float4 * ppos  = src2 != src0 ? (device const float4 *)(src2)                                                 : nullptr;
    device       float4 * pdst4 =                (device       float4 *)(dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
    float slope = 0.0f;
    if (max_bias > 0.0f) {
        const int64_t h = i02;
        const float base = h < n_head_log2 ? m0 : m1;
        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
        slope = pow(base, exp);
    }
    // parallel max
    float4 lmax4 = -INFINITY;
    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f));
+        lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
    }
    const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
@ -486,7 +521,7 @@ kernel void kernel_soft_max_4(
    // parallel sum
    float4 lsum4 = 0.0f;
    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
+        const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
        lsum4 += exp_psrc4;
        pdst4[i00] = exp_psrc4;
    }
@ -2490,6 +2525,13 @@ typedef struct {
 } block_iq3_xxs;
 // 98 bytes / block for QK_K = 256, so 3.0625 bpw
 typedef struct {
    half d;
    uint8_t qs[QK_K/8];
    uint8_t scales[QK_K/16];
 } block_iq1_s;
 //====================================== dot products =========================
 void kernel_mul_mv_q2_K_f32_impl(
@ -3747,6 +3789,137 @@ constexpr constant static uint32_t iq3xxs_grid[256] = {
    0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
 };
 #define NGRID_IQ1S 512
 constexpr constant static uint64_t iq1s_grid[NGRID_IQ1S] = {
    0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
    0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
    0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
    0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
    0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
    0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
    0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
    0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
    0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
    0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
    0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
    0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
    0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
    0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
    0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
    0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
    0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
    0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
    0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
    0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
    0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
    0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
    0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
    0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
    0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
    0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
    0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
    0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
    0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
    0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
    0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
    0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
    0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
    0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
    0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
    0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
    0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
    0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
    0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
    0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
    0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
    0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
    0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
    0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
    0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
    0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
    0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
    0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
    0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
    0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
    0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
    0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
    0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
    0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
    0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
    0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
    0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
    0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
    0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
    0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
    0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
    0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
    0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
    0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
    0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
    0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
    0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
    0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
    0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
    0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
    0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
    0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
    0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
    0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
    0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
    0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
    0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
    0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
    0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
    0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
    0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
    0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
    0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
    0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
    0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
    0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
    0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
    0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
    0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
    0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
    0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
    0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
    0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
    0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
    0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
    0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
    0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
    0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
    0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
    0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
    0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
    0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
    0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
    0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
    0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
    0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
    0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
    0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
    0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
    0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
    0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
    0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
    0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
    0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
    0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
    0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
    0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
    0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
    0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
    0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
    0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
    0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
    0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
    0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
    0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
    0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
    0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
    0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
 };
 constexpr constant static uint8_t ksigns_iq2xs[128] = {
      0, 129, 130,   3, 132,   5,   6, 135, 136,   9,  10, 139,  12, 141, 142,  15,
@ -3854,7 +4027,10 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
        y4 += 32 * 32;
    }
 #else
-    // TODO
+    (void) x;
    (void) y;
    (void) yl;
    (void) nb32;
 #endif
    for (int row = 0; row < N_DST; ++row) {
@ -3997,7 +4173,10 @@ void kernel_mul_mv_iq2_xs_f32_impl(
        y4 += 32 * 32;
    }
 #else
-    // TODO
+    (void) x;
    (void) y;
    (void) yl;
    (void) nb32;
 #endif
    for (int row = 0; row < N_DST; ++row) {
@ -4133,7 +4312,10 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
        y4 += 32 * 32;
    }
 #else
-    // TODO
+    (void) x;
    (void) y;
    (void) yl;
    (void) nb32;
 #endif
    for (int row = 0; row < N_DST; ++row) {
@ -4173,6 +4355,126 @@ kernel void kernel_mul_mv_iq3_xxs_f32(
    kernel_mul_mv_iq3_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
 }
 void kernel_mul_mv_iq1_s_f32_impl(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant   int64_t & ne10,
        constant   int64_t & ne12,
        constant   int64_t & ne0,
        constant   int64_t & ne1,
        constant   uint    & r2,
        constant   uint    & r3,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint  tiisg[[thread_index_in_simdgroup]],
        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
    const int nb = ne00/QK_K;
    const int r0 = tgpig.x;
    const int r1 = tgpig.y;
    const int im = tgpig.z;
    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
    const int ib_row = first_row * nb;
    const uint i12 = im%ne12;
    const uint i13 = im/ne12;
    const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
    device const block_iq1_s * x = (device const block_iq1_s *) src0 + ib_row + offset0;
    device const float       * y = (device const float       *) src1 + r1*ne10 + im*ne00*ne1;
    float yl[16];
    float sumf[N_DST]={0.f}, all_sum;
    const int nb32 = nb * (QK_K / 32);
 #if QK_K == 256
    const int ix = tiisg/2;
    const int il = tiisg%2;
    device const float * y4 = y + 32 * ix + 16 * il;
    for (int ib32 = ix; ib32 < nb32; ib32 += 16) {
        for (int i = 0; i < 16; ++i) {
            yl[i] = y4[i];
        }
        const int ibl = ib32 / (QK_K / 32);
        const int ib  = ib32 % (QK_K / 32);
        device const block_iq1_s * xr = x + ibl;
        device const uint8_t * qs = xr->qs + 4 * ib + 2 * il;
        device const uint8_t * sc = xr->scales + 2 * ib + il;
        device const half    * dh = &xr->d;
        for (int row = 0; row < N_DST; row++) {
            constant int8_t * grid1 = (constant int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5)));
            constant int8_t * grid2 = (constant int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1)));
            float2 sum = {0};
            for (int j = 0; j < 8; ++j) {
                sum[0] += yl[j+ 0] * grid1[j];
                sum[1] += yl[j+ 8] * grid2[j];
            }
            sumf[row] += (float)dh[0] * (sum[0] * (2*(sc[0] & 7) + 1) + sum[1] * (2*((sc[0] >> 4) & 7) + 1));
            dh += nb*sizeof(block_iq1_s)/2;
            qs += nb*sizeof(block_iq1_s);
            sc += nb*sizeof(block_iq1_s);
        }
        y4 += 16 * 32;
    }
 #else
    (void) x;
    (void) y;
    (void) yl;
    (void) nb32;
 #endif
    for (int row = 0; row < N_DST; ++row) {
        all_sum = simd_sum(sumf[row]);
        if (tiisg == 0) {
            dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
        }
    }
 }
 [[host_name("kernel_mul_mv_iq1_s_f32")]]
 kernel void kernel_mul_mv_iq1_s_f32(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
        constant  uint64_t & nb00,
        constant  uint64_t & nb01,
        constant  uint64_t & nb02,
        constant   int64_t & ne10,
        constant   int64_t & ne11,
        constant   int64_t & ne12,
        constant  uint64_t & nb10,
        constant  uint64_t & nb11,
        constant  uint64_t & nb12,
        constant   int64_t & ne0,
        constant   int64_t & ne1,
        constant   uint    & r2,
        constant   uint    & r3,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint  tiisg[[thread_index_in_simdgroup]],
        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
    kernel_mul_mv_iq1_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg);
 }
 //============================= templates and their specializations =============================
@ -4369,6 +4671,8 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
    const float dl = d * sc[0];
    const float ml = min * sc[1];
 #else
    (void) get_scale_min_k4_just2;
    q = q + 16 * (il&1);
    device const uint8_t * s = xb->scales;
    device const half2 * dh = (device const half2 *)xb->d;
@ -4518,6 +4822,22 @@ void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x
    }
 }
 template <typename type4x4>
 void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) {
    // il is 0...15 for QK_K = 256 => index of block of 32 is il/2
    const float d = xb->d;
    device const uint8_t * qs = xb->qs + 2*il;
    device const uint8_t * sc = xb->scales + il;
    const float dl1 = d * (2*(sc[0] & 7) + 1);
    const float dl2 = d * (2*((sc[0] >> 4) & 7) + 1);
    constant int8_t * grid1 = (constant int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5)));
    constant int8_t * grid2 = (constant int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1)));
    for (int i = 0; i < 8; ++i) {
        reg[i/4+0][i%4] = dl1 * grid1[i];
        reg[i/4+2][i%4] = dl2 * grid2[i];
    }
 }
 template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
 kernel void kernel_get_rows(
        device const  void * src0,
@ -5060,6 +5380,7 @@ template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows
 template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_rows<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
 template [[host_name("kernel_get_rows_iq2_xs")]]  kernel get_rows_t kernel_get_rows<block_iq2_xs,  QK_NL, dequantize_iq2_xs>;
 template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
 template [[host_name("kernel_get_rows_iq1_s")]]   kernel get_rows_t kernel_get_rows<block_iq1_s,   QK_NL, dequantize_iq1_s>;
 //
 // matrix-matrix multiplication
@ -5099,6 +5420,7 @@ template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm<b
 template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
 template [[host_name("kernel_mul_mm_iq2_xs_f32")]]  kernel mat_mm_t kernel_mul_mm<block_iq2_xs,  QK_NL, dequantize_iq2_xs>;
 template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
 template [[host_name("kernel_mul_mm_iq1_s_f32")]]   kernel mat_mm_t kernel_mul_mm<block_iq1_s,   QK_NL, dequantize_iq1_s>;
 //
 // indirect matrix-matrix multiplication
@ -5150,6 +5472,7 @@ template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mu
 template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
 template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs,  QK_NL, dequantize_iq2_xs>;
 template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
 template [[host_name("kernel_mul_mm_id_iq1_s_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s,   QK_NL, dequantize_iq1_s>;
 //
 // matrix-vector multiplication
@ -6117,3 +6440,66 @@ kernel void kernel_mul_mv_id_iq3_xxs_f32(
        tiisg,
        sgitg);
 }
 [[host_name("kernel_mul_mv_id_iq1_s_f32")]]
 kernel void kernel_mul_mv_id_iq1_s_f32(
        device const    char * ids,
        device const    char * src1,
        device         float * dst,
        constant    uint64_t & nbi1,
        constant     int64_t & ne00,
        constant     int64_t & ne01,
        constant     int64_t & ne02,
        constant    uint64_t & nb00,
        constant    uint64_t & nb01,
        constant    uint64_t & nb02,
        constant     int64_t & ne10,
        constant     int64_t & ne11,
        constant     int64_t & ne12,
        constant     int64_t & ne13,
        constant    uint64_t & nb10,
        constant    uint64_t & nb11,
        constant    uint64_t & nb12,
        constant     int64_t & ne0,
        constant     int64_t & ne1,
        constant    uint64_t & nb1,
        constant        uint & r2,
        constant        uint & r3,
        constant         int & idx,
        device const    char * src00,
        device const    char * src01,
        device const    char * src02,
        device const    char * src03,
        device const    char * src04,
        device const    char * src05,
        device const    char * src06,
        device const    char * src07,
        uint3                  tgpig[[threadgroup_position_in_grid]],
        uint                   tiitg[[thread_index_in_threadgroup]],
        uint                   tiisg[[thread_index_in_simdgroup]],
        uint                   sgitg[[simdgroup_index_in_threadgroup]]) {
    device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07};
    const int64_t bid = tgpig.z/(ne12*ne13);
    tgpig.z = tgpig.z%(ne12*ne13);
    const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx];
    kernel_mul_mv_iq1_s_f32_impl(
        src0[id],
        (device const float *) (src1 + bid*nb11),
        dst + bid*ne0,
        ne00,
        ne01,
        ne02,
        ne10,
        ne12,
        ne0,
        ne1,
        r2,
        r3,
        tgpig,
        tiisg,
        sgitg);
 }
--- a/ggml-quants.c
+++ b/ggml-quants.c
@ -1839,9 +1839,9 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
        float sigma2 = sumx2/QK_K;
        for (int j = 0; j < QK_K/16; ++j) {
            const float * restrict qw = quant_weights + QK_K * i + 16*j;
-            for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
+            for (int l = 0; l < QK_K/16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
-            for (int l = 0; l < 16; ++l) sw[j] += weight[l];
+            for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l];
-            scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
+            scales[j] = make_qkx3_quants(QK_K/16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
        }
        float dm  = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
@ -3482,6 +3482,139 @@ static const uint32_t iq3xxs_grid[256] = {
    0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
 };
 #define NGRID_IQ2XXS 512
 static const  uint64_t iq1s_grid[NGRID_IQ2XXS] = {
    0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000,
    0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01,
    0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100,
    0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00,
    0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101,
    0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100,
    0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00,
    0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff,
    0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000,
    0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000,
    0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001,
    0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff,
    0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01,
    0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001,
    0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00,
    0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001,
    0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100,
    0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000,
    0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000,
    0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000,
    0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff,
    0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff,
    0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01,
    0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100,
    0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff,
    0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000,
    0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101,
    0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff,
    0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff,
    0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001,
    0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01,
    0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101,
    0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100,
    0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00,
    0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001,
    0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff,
    0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000,
    0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000,
    0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100,
    0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100,
    0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01,
    0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff,
    0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101,
    0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000,
    0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff,
    0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000,
    0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff,
    0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00,
    0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101,
    0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000,
    0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000,
    0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000,
    0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100,
    0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000,
    0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
    0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff,
    0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000,
    0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000,
    0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
    0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000,
    0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff,
    0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000,
    0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001,
    0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
    0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100,
    0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000,
    0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00,
    0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100,
    0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000,
    0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001,
    0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00,
    0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff,
    0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100,
    0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff,
    0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000,
    0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff,
    0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff,
    0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00,
    0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001,
    0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001,
    0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01,
    0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000,
    0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101,
    0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00,
    0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100,
    0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101,
    0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101,
    0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000,
    0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff,
    0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff,
    0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101,
    0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff,
    0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101,
    0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001,
    0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff,
    0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff,
    0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01,
    0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff,
    0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100,
    0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001,
    0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00,
    0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff,
    0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff,
    0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000,
    0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000,
    0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101,
    0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001,
    0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000,
    0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101,
    0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000,
    0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001,
    0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000,
    0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100,
    0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000,
    0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000,
    0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100,
    0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff,
    0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff,
    0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00,
    0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101,
    0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000,
    0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00,
    0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000,
    0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff,
    0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101,
    0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff,
    0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00,
    0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff,
 };
 static const uint8_t ksigns_iq2xs[128] = {
      0, 129, 130,   3, 132,   5,   6, 135, 136,   9,  10, 139,  12, 141, 142,  15,
    144,  17,  18, 147,  20, 149, 150,  23,  24, 153, 154,  27, 156,  29,  30, 159,
@ -3580,6 +3713,49 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y
    }
 }
 // ====================== 1.5625 bpw (de)-quantization
 void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int k) {
    assert(k % QK_K == 0);
    const int nb = k / QK_K;
    float db[4];
    uint16_t idx[4];
    //const int8_t * grid[4];
    for (int i = 0; i < nb; i++) {
        const float d = GGML_FP16_TO_FP32(x[i].d);
        const uint8_t * sc = x[i].scales;
        const uint8_t * qs = x[i].qs;
        for (int i8 = 0; i8 < QK_K/8; i8 += 4) {
            idx[0] = qs[0] | ((sc[0] & 0x08) << 5);
            idx[1] = qs[1] | ((sc[0] & 0x80) << 1);
            idx[2] = qs[2] | ((sc[1] & 0x08) << 5);
            idx[3] = qs[3] | ((sc[1] & 0x80) << 1);
            //grid[0] = (const int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5)));
            //grid[1] = (const int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1)));
            //grid[2] = (const int8_t *)(iq1s_grid + (qs[2] | ((sc[1] & 0x08) << 5)));
            //grid[3] = (const int8_t *)(iq1s_grid + (qs[3] | ((sc[1] & 0x80) << 1)));
            db[0] = d * (2*(sc[0] & 7) + 1);
            db[1] = d * (2*((sc[0] >> 4) & 7) + 1);
            db[2] = d * (2*(sc[1] & 7) + 1);
            db[3] = d * (2*((sc[1] >> 4) & 7) + 1);
            for (int l = 0; l < 4; ++l) {
                const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
                for (int j = 0; j < 8; ++j) {
                    //y[j] = db[l] * grid[l][j];
                    y[j] = db[l] * grid[j];
                }
                y += 8;
            }
            qs += 4;
            sc += 2;
        }
    }
 }
 //===================================== Q8_K ==============================================
 void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
@ -3850,15 +4026,15 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
        const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs);
-        __m128i bx = _mm_and_si128(lowMask, tmp);
+        __m128i bx_0 = _mm_and_si128(lowMask, tmp);
-        __m128i by = _mm_loadu_si128((const __m128i *)y[i].qs);
+        __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs);
-        bx = _mm_sub_epi8(bx, off);
+        bx_0 = _mm_sub_epi8(bx_0, off);
-        const __m128i i32_0 = mul_sum_i8_pairs(bx, by);
+        const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
-        bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4));
+        bx_0 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4));
-        by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
+        by_0 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
-        bx = _mm_sub_epi8(bx, off);
+        bx_0 = _mm_sub_epi8(bx_0, off);
-        const __m128i i32_1 = mul_sum_i8_pairs(bx, by);
+        const __m128i i32_1 = mul_sum_i8_pairs(bx_0, by_0);
        // Convert int32_t to float
        __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1));
@ -4444,21 +4620,21 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
        /* Compute combined scale for the block */
        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        __m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
        const __m256i bxhi = bytes_from_bits_32(x[i].qh);
        __m128i bxhil = _mm256_castsi256_si128(bxhi);
        __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
        bxhil = _mm_andnot_si128(bxhil, mask);
        bxhih = _mm_andnot_si128(bxhih, mask);
-        __m128i bxl = _mm256_castsi256_si128(bx);
+        __m128i bxl = _mm256_castsi256_si128(bx_0);
-        __m128i bxh = _mm256_extractf128_si256(bx, 1);
+        __m128i bxh = _mm256_extractf128_si256(bx_0, 1);
        bxl = _mm_or_si128(bxl, bxhil);
        bxh = _mm_or_si128(bxh, bxhih);
-        bx = MM256_SET_M128I(bxh, bxl);
+        bx_0 = MM256_SET_M128I(bxh, bxl);
-        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+        const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+        const __m256 q = mul_sum_i8_pairs_float(bx_0, by_0);
        /* Multiply q with scale and accumulate */
        acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
@ -4751,22 +4927,22 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
        summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        __m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
        const __m256i bxhi = bytes_from_bits_32(x[i].qh);
        __m128i bxhil = _mm256_castsi256_si128(bxhi);
        __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
        bxhil = _mm_and_si128(bxhil, mask);
        bxhih = _mm_and_si128(bxhih, mask);
-        __m128i bxl = _mm256_castsi256_si128(bx);
+        __m128i bxl = _mm256_castsi256_si128(bx_0);
-        __m128i bxh = _mm256_extractf128_si256(bx, 1);
+        __m128i bxh = _mm256_extractf128_si256(bx_0, 1);
        bxl = _mm_or_si128(bxl, bxhil);
        bxh = _mm_or_si128(bxh, bxhih);
-        bx = MM256_SET_M128I(bxh, bxl);
+        bx_0 = MM256_SET_M128I(bxh, bxl);
        const __m256 dy = _mm256_set1_ps(y[i].d);
-        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+        const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);
-        const __m256 q = mul_sum_us8_pairs_float(bx, by);
+        const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0);
        acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
    }
@ -4995,10 +5171,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
    for (int i = 0; i < nb; i++) {
        // load elements
-        vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl);
+        vint8m1_t bx_0 = __riscv_vle8_v_i8m1(x[i].qs, vl);
-        vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl);
+        vint8m1_t by_0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
-        vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl);
+        vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx_0, by_0, vl);
        vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl);
        vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl);
@ -9109,6 +9285,178 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
 #endif
 }
 #ifdef __AVX2__
 static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
    const __m256i ax = _mm256_sign_epi8(x, x);
    const __m256i sy = _mm256_sign_epi8(y, x);
    return _mm256_maddubs_epi16(ax, sy);
 }
 #endif
 void ggml_vec_dot_iq1_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);
    const block_iq1_s * restrict x = vx;
    const block_q8_K  * restrict y = vy;
    const int nb = n / QK_K;
 #if defined __ARM_NEON
    const uint8x16_t m8 = vdupq_n_u8(0x08);
    const uint8x16_t m7 = vdupq_n_u8(0x07);
    const uint8x16_t m1 = vdupq_n_u8(0x01);
    const int32x4_t vzero = vdupq_n_s32(0);
    uint16_t gindex[8];
    uint16x8x2_t vindex;
    int8x16x4_t q1b;
    int8x16x4_t q8b;
    uint16x8x4_t scales;
    int32x4x2_t sumi;
    int32x4x2_t dotq;
    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const int8_t  * q8 = y[i].qs;
        const uint8_t * qs = x[i].qs;
        const uint8_t * sc = x[i].scales;
        sumi.val[0] = sumi.val[1] = vzero;
        for (int i128 = 0; i128 < QK_K/128; ++i128) {
            const uint8x16_t ql = vld1q_u8(qs); qs += 16;
            const uint8x8_t tm1 = vld1_u8 (sc); sc +=  8;
            const uint8x8_t tm2 = vshr_n_u8(tm1, 4);
            const uint8x16_t qh = vcombine_u8(vzip1_u8(tm1, tm2), vzip2_u8(tm1, tm2));
            const uint8x16_t hbit = vandq_u8(qh, m8);
            vindex.val[0] = vorrq_u16(vmovl_u8(vget_low_u8 (ql)), vshlq_n_u16(vmovl_u8(vget_low_u8 (hbit)), 5));
            vindex.val[1] = vorrq_u16(vmovl_u8(vget_high_u8(ql)), vshlq_n_u16(vmovl_u8(vget_high_u8(hbit)), 5));
            const uint8x16_t scales8 = vorrq_u8(vshlq_n_u8(vandq_u8(qh, m7), 1), m1);
            scales.val[0] = vmovl_u8(vget_low_u8 (scales8));
            scales.val[1] = vmovl_u8(vget_high_u8 (scales8));
            for (int l = 0; l < 2; ++l) {
                vst1q_u16(gindex+0, vindex.val[l]);
                q1b.val[0] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[0])), vld1_s8((const void *)(iq1s_grid+gindex[1])));
                q1b.val[1] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[2])), vld1_s8((const void *)(iq1s_grid+gindex[3])));
                q1b.val[2] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[4])), vld1_s8((const void *)(iq1s_grid+gindex[5])));
                q1b.val[3] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[6])), vld1_s8((const void *)(iq1s_grid+gindex[7])));
                q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
                dotq.val[0] = vpaddq_s32(ggml_vdotq_s32(vzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(vzero, q1b.val[1], q8b.val[1]));
                dotq.val[1] = vpaddq_s32(ggml_vdotq_s32(vzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(vzero, q1b.val[3], q8b.val[3]));
                sumi.val[0] = vmlaq_s32(sumi.val[0], dotq.val[0], vreinterpretq_s32_u32(vmovl_u16(vget_low_u16 (scales.val[l]))));
                sumi.val[1] = vmlaq_s32(sumi.val[1], dotq.val[1], vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales.val[l]))));
            }
        }
        sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * vaddvq_s32(vaddq_s32(sumi.val[0], sumi.val[1]));
    }
    *s = sumf;
 #elif defined __AVX2__
    const __m128i m8 = _mm_set1_epi8(0x08);
    const __m128i m7 = _mm_set1_epi8(0x07);
    const __m128i m1 = _mm_set1_epi8(0x01);
    const __m128i shuffle_h = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0);
    const __m128i shuffle_s[4] = {
        _mm_set_epi32(0x03030303, 0x02020202, 0x01010101, 0x00000000),
        _mm_set_epi32(0x07070707, 0x06060606, 0x05050505, 0x04040404),
        _mm_set_epi32(0x0b0b0b0b, 0x0a0a0a0a, 0x09090909, 0x08080808),
        _mm_set_epi32(0x0f0f0f0f, 0x0e0e0e0e, 0x0d0d0d0d, 0x0c0c0c0c)
    };
    uint64_t aux64;
    __m256i v_gindex;
    const uint16_t * gindex = (const uint16_t *)&v_gindex;
    __m256 accum = _mm256_setzero_ps();
    for (int i = 0; i < nb; ++i) {
        const int8_t  * q8 = y[i].qs;
        const uint8_t * qs = x[i].qs;
        const uint8_t * sc = x[i].scales;
        __m256i sumi = _mm256_setzero_si256();
        for (int i128 = 0; i128 < QK_K/128; ++i128) {
            const __m128i ql = _mm_loadu_si128((const __m128i*)qs); qs += 16;
            memcpy(&aux64, sc, 8); sc += 8;
            const __m128i qh = _mm_shuffle_epi8(_mm_set_epi64x(aux64 >> 4, aux64), shuffle_h);
            const __m256i hbit = _mm256_cvtepu8_epi16(_mm_and_si128(qh, m8));
            v_gindex = _mm256_or_si256(_mm256_cvtepu8_epi16(ql), _mm256_slli_epi16(hbit, 5));
            const __m128i scales = _mm_or_si128(_mm_slli_epi16(_mm_and_si128(qh, m7), 1), m1);
            for (int i32 = 0; i32 < 4; ++i32) {
                const __m256i q8b = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
                const __m256i q1b = _mm256_set_epi64x(iq1s_grid[gindex[4*i32+3]], iq1s_grid[gindex[4*i32+2]],
                                                      iq1s_grid[gindex[4*i32+1]], iq1s_grid[gindex[4*i32+0]]);
                const __m256i dot = mul_add_epi8(q1b, q8b);
                const __m256i s16 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, shuffle_s[i32]));
                const __m256i p   = _mm256_madd_epi16(s16, dot);
                sumi = _mm256_add_epi32(sumi, p);
            }
        }
        accum = _mm256_fmadd_ps(_mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)), _mm256_cvtepi32_ps(sumi), accum);
    }
    *s = hsum_float_8(accum);
 #else
    int db[4];
    uint16_t idx[4];
    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        const int8_t  * q8 = y[i].qs;
        const uint8_t * qs = x[i].qs;
        const uint8_t * sc = x[i].scales;
        int sumi = 0;
        for (int i32 = 0; i32 < QK_K/32; ++i32) {
            idx[0] = qs[0] | ((sc[0] & 0x08) << 5);
            idx[1] = qs[1] | ((sc[0] & 0x80) << 1);
            idx[2] = qs[2] | ((sc[1] & 0x08) << 5);
            idx[3] = qs[3] | ((sc[1] & 0x80) << 1);
            db[0] = (2*(sc[0] & 7) + 1);
            db[1] = (2*((sc[0] >> 4) & 7) + 1);
            db[2] = (2*(sc[1] & 7) + 1);
            db[3] = (2*((sc[1] >> 4) & 7) + 1);
            for (int l = 0; l < 4; ++l) {
                const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]);
                int suml = 0;
                for (int j = 0; j < 8; ++j) suml += q8[j] * grid[j];
                sumi += db[l] * suml;
                q8 += 8;
            }
            qs += 4;
            sc += 2;
        }
        sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * sumi;
    }
    *s = sumf;
 #endif
 }
 // ================================ IQ2 quantization =============================================
 typedef struct {
@ -9117,14 +9465,22 @@ typedef struct {
    uint16_t * neighbours;
 } iq2_entry_t;
-static iq2_entry_t iq2_data[2] = {
+static iq2_entry_t iq2_data[3] = {
    {NULL, NULL, NULL},
    {NULL, NULL, NULL},
    {NULL, NULL, NULL},
 };
-static inline int iq2_data_index(int grid_size) {
+static inline int iq2_data_index(enum ggml_type type) {
-    GGML_ASSERT(grid_size == 256 || grid_size == 512);
+    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
-    return grid_size == 256 ? 0 : 1;
+    return type == GGML_TYPE_IQ2_XXS ? 0 :
           type == GGML_TYPE_IQ2_XS  ? 1 : 2;
 }
 static inline int iq2_grid_size(enum ggml_type type) {
    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
    return type == GGML_TYPE_IQ2_XXS ? 256 :
           type == GGML_TYPE_IQ2_XS  ? 512 : 512;
 }
 static int iq2_compare_func(const void * left, const void * right) {
@ -9133,12 +9489,13 @@ static int iq2_compare_func(const void * left, const void * right) {
    return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
 }
-void iq2xs_init_impl(int grid_size) {
+void iq2xs_init_impl(enum ggml_type type) {
-    const int gindex = iq2_data_index(grid_size);
+    const int gindex = iq2_data_index(type);
    const int grid_size = iq2_grid_size(type);
    if (iq2_data[gindex].grid) {
        return;
    }
-    static const uint16_t kgrid_256[256] = {
+    static const uint16_t kgrid_2bit_256[256] = {
            0,     2,     5,     8,    10,    17,    20,    32,    34,    40,    42,    65,    68,    80,    88,    97,
          100,   128,   130,   138,   162,   257,   260,   272,   277,   320,   388,   408,   512,   514,   546,   642,
         1025,  1028,  1040,  1057,  1060,  1088,  1090,  1096,  1120,  1153,  1156,  1168,  1188,  1280,  1282,  1288,
@ -9156,7 +9513,7 @@ void iq2xs_init_impl(int grid_size) {
        33888, 34048, 34118, 34196, 34313, 34368, 34400, 34818, 35076, 35345, 36868, 36880, 36900, 36928, 37025, 37142,
        37248, 37445, 37888, 37922, 37956, 38225, 39041, 39200, 40962, 41040, 41093, 41225, 41472, 42008, 43088, 43268,
    };
-    static const uint16_t kgrid_512[512] = {
+    static const uint16_t kgrid_2bit_512[512] = {
            0,     2,     5,     8,    10,    17,    20,    22,    25,    32,    34,    37,    40,    65,    68,    70,
           73,    80,    82,    85,    88,    97,   100,   128,   130,   133,   136,   145,   148,   153,   160,   257,
          260,   262,   265,   272,   274,   277,   280,   282,   289,   292,   320,   322,   325,   328,   337,   340,
@ -9190,9 +9547,45 @@ void iq2xs_init_impl(int grid_size) {
        40962, 40968, 40970, 40992, 41002, 41120, 41297, 41305, 41382, 41472, 41474, 41480, 41514, 41600, 41632, 42048,
        42133, 42597, 42648, 43018, 43040, 43042, 43048, 43168, 43176, 43268, 43396, 43398, 43560, 43562, 43665, 43690,
    };
    static const uint16_t kgrid_1bit_512[512] = {
           10,    33,    41,    85,   132,   134,   160,   162,   277,   337,   340,   345,   357,   405,   516,   545,
          553,   598,   641,   650,   681,  1042,  1044,  1097,  1169,  1176,  1320,  1345,  1365,  1378,  1434,  1444,
         1545,  1617,  1642,  1685,  2053,  2080,  2089,  2133,  2176,  2182,  2208,  2214,  2306,  2384,  2393,  2440,
         2453,  2581,  2664,  2690,  2721,  4117,  4161,  4182,  4184,  4261,  4357,  4369,  4372,  4377,  4390,  4422,
         4432,  4437,  4449,  4457,  4485,  4497,  4505,  4629,  4677,  4696,  4774,  5205,  5217,  5225,  5386,  5397,
         5409,  5445,  5457,  5460,  5461,  5462,  5465,  5472,  5477,  5525,  5545,  5650,  5668,  5717,  5729,  5769,
         5777,  6212,  6234,  6244,  6293,  6424,  6482,  6485,  6502,  6505,  6529,  6538,  6565,  6656,  6682,  6788,
         6806,  6820,  8218,  8224,  8226,  8232,  8277,  8326,  8354,  8469,  8521,  8530,  8549,  8596,  8737,  8794,
         9221,  9253,  9348,  9369,  9380,  9474,  9557,  9633,  9732,  9753,  9793,  9830,  9862,  9880, 10240, 10272,
        10282, 10321, 10406, 10517, 10530, 10566, 10585, 10645, 10896, 16466, 16468, 16473, 16485, 16646, 16660, 16665,
        16725, 16793, 16806, 16914, 16969, 16977, 16996, 17028, 17057, 17408, 17416, 17434, 17493, 17512, 17578, 17685,
        17696, 17733, 17745, 17748, 17749, 17750, 17753, 17765, 17794, 17813, 17946, 17984, 18005, 18072, 18453, 18529,
        18569, 18722, 18756, 18762, 18773, 18794, 18833, 18853, 18945, 19026, 19033, 19077, 20489, 20497, 20500, 20517,
        20565, 20586, 20610, 20633, 20757, 20769, 20776, 20805, 20817, 20820, 20821, 20822, 20825, 20837, 20864, 20872,
        20885, 20896, 21002, 21029, 21077, 21146, 21510, 21525, 21573, 21585, 21588, 21589, 21590, 21593, 21605, 21653,
        21665, 21765, 21777, 21780, 21781, 21782, 21785, 21797, 21825, 21828, 21829, 21830, 21833, 21840, 21841, 21842,
        21844, 21846, 21848, 21849, 21850, 21857, 21860, 21861, 21862, 21865, 21893, 21905, 21908, 21909, 21910, 21913,
        21925, 22024, 22037, 22085, 22097, 22100, 22101, 22102, 22105, 22117, 22165, 22545, 22566, 22568, 22594, 22608,
        22613, 22676, 22697, 22793, 22805, 22853, 22865, 22868, 22869, 22870, 22873, 22885, 22933, 22946, 23046, 23072,
        23125, 23209, 24597, 24640, 24665, 24673, 24725, 24833, 24840, 24869, 24917, 24934, 24965, 25001, 25108, 25110,
        25152, 25184, 25192, 25234, 25616, 25618, 25625, 25685, 25704, 25738, 25744, 25770, 25877, 25897, 25925, 25937,
        25940, 25941, 25942, 25945, 25957, 25986, 26005, 26186, 26197, 26276, 26632, 26634, 26725, 26757, 26770, 26885,
        26965, 26976, 26986, 27032, 27153, 27174, 27200, 27208, 27240, 27269, 27282, 27290, 32778, 32800, 32802, 32808,
        32810, 32853, 32904, 32922, 32930, 32932, 33105, 33110, 33112, 33125, 33157, 33280, 33288, 33301, 33312, 33320,
        33424, 33797, 33829, 33858, 34068, 34133, 34146, 34176, 34217, 34306, 34342, 34441, 34454, 34468, 34832, 34918,
        34965, 34984, 35094, 35137, 35161, 35208, 35232, 35332, 35338, 35368, 35429, 36932, 36934, 36953, 37009, 37125,
        37136, 37138, 37145, 37157, 37205, 37220, 37258, 37290, 37444, 37446, 37465, 37478, 37525, 37905, 37968, 37973,
        38040, 38054, 38145, 38154, 38165, 38180, 38186, 38213, 38225, 38228, 38229, 38230, 38233, 38245, 38293, 38485,
        38504, 38530, 38938, 38985, 38993, 39012, 39040, 39173, 39192, 39253, 39265, 39301, 39316, 39322, 39442, 39497,
        39504, 39590, 40970, 40984, 40992, 41002, 41045, 41120, 41128, 41237, 41289, 41297, 41317, 41364, 41366, 41514,
        41557, 41633, 41989, 42021, 42056, 42068, 42074, 42113, 42242, 42265, 42274, 42325, 42340, 42402, 42501, 42512,
        42533, 42624, 42632, 42666, 43040, 43093, 43106, 43168, 43176, 43264, 43286, 43345, 43429, 43590, 43618, 43680,
    };
    const int kmap_size = 43692;
-    const int nwant = 2;
+    const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2;
-    const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512;
+    const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 :
                             type == GGML_TYPE_IQ2_XS  ? kgrid_2bit_512 : kgrid_1bit_512;
    uint64_t * kgrid_q2xs;
    int      * kmap_q2xs;
    uint16_t * kneighbors_q2xs;
@ -9288,9 +9681,9 @@ void iq2xs_init_impl(int grid_size) {
    free(dist2);
 }
-void iq2xs_free_impl(int grid_size) {
+void iq2xs_free_impl(enum ggml_type type) {
-    GGML_ASSERT(grid_size == 256 || grid_size == 512 || grid_size == 1024);
+    GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S);
-    const int gindex = iq2_data_index(grid_size);
+    const int gindex = iq2_data_index(type);
    if (iq2_data[gindex].grid) {
        free(iq2_data[gindex].grid);       iq2_data[gindex].grid = NULL;
        free(iq2_data[gindex].map);        iq2_data[gindex].map  = NULL;
@ -9324,7 +9717,7 @@ static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const u
 static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
-    const int gindex = iq2_data_index(256);
+    const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS);
    const uint64_t * kgrid_q2xs      = iq2_data[gindex].grid;
    const int      * kmap_q2xs       = iq2_data[gindex].map;
@ -9497,7 +9890,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
 static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
-    const int gindex = iq2_data_index(512);
+    const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS);
    const uint64_t * kgrid_q2xs      = iq2_data[gindex].grid;
    const int      * kmap_q2xs       = iq2_data[gindex].map;
@ -10134,3 +10527,207 @@ void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * re
    assert(k % QK_K == 0);
    quantize_row_iq3_xxs_impl(x, y, k, NULL);
 }
 // =================================== 1.5 bpw ===================================================
 static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
        const float * restrict xval, const float * restrict weight, float * scale, int8_t * restrict L, int ngrid) {
    int num_neighbors = neighbours[0];
    GGML_ASSERT(num_neighbors > 0);
    float best_score = 0;
    int grid_index = -1;
    for (int j = 1; j <= num_neighbors; ++j) {
        const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
        float sumqx = 0, sumq2 = 0;
        for (int i = 0; i < 8; ++i) {
            float q = (pg[i] - 3)/2;
            float w = weight[i];
            sumqx += w*q*xval[i];
            sumq2 += w*q*q;
        }
        if (sumqx > 0 && sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
            *scale = sumqx/sumq2; best_score = *scale * sumqx;
            grid_index = neighbours[j];
        }
    }
    if (grid_index < 0) {
        for (int i = 0; i < ngrid; ++i) {
            const int8_t * grid_i = (const int8_t *)(grid + i);
            float sumqx = 0, sumq2 = 0;
            for (int j = 0; j < 8; ++j) {
                float w = weight[j];
                float q = (grid_i[j] - 3)/2;
                sumqx += w*q*xval[j];
                sumq2 += w*q*q;
            }
            if (sumqx > 0 && sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
                *scale = sumqx/sumq2; best_score = *scale*sumqx;
                grid_index = i;
            }
        }
    }
    if (grid_index < 0) {
        printf("Oops, did not find grid point\n");
        printf("Have %d neighbours\n", num_neighbors);
        for (int j = 1; j <= num_neighbors; ++j) {
            const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
            float sumqx = 0, sumq2 = 0;
            for (int i = 0; i < 8; ++i) {
                float q = (pg[i] - 3)/2;
                float w = weight[i];
                sumqx += w*q*xval[i];
                sumq2 += w*q*q;
            }
            printf("    neighbour %d: sumqx = %g sumq2 = %g\n", j, (double)sumqx, (double)sumq2);
        }
    }
    GGML_ASSERT(grid_index >= 0);
    //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    *scale *= 1.05f;  // This is a fudge factor. Don't ask me why it improves the result.
    //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    const int8_t * pg = (const int8_t *)(grid + grid_index);
    for (int i = 0; i < 8; ++i) L[i] = (pg[i] - 1)/2;
    return grid_index;
 }
 static int iq1_sort_helper(const void * left, const void * right) {
    const float * l = left;
    const float * r = right;
    return *l < *r ? -1 : *l > *r ? 1 : 0;
 }
 static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
    const int gindex = iq2_data_index(GGML_TYPE_IQ1_S);
    const uint64_t * kgrid_q2xs      = iq2_data[gindex].grid;
    const int      * kmap_q2xs       = iq2_data[gindex].map;
    const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
    GGML_ASSERT(quant_weights   && "missing quantization weights");
    GGML_ASSERT(kgrid_q2xs      && "forgot to call ggml_quantize_init()?");
    GGML_ASSERT(kmap_q2xs       && "forgot to call ggml_quantize_init()?");
    GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
    GGML_ASSERT(n%QK_K == 0);
    const int nbl = n/256;
    block_iq1_s * y = vy;
    float  scales[QK_K/8];
    float  weight[8];
    int8_t L[8];
    float  sumx[9];
    float  sumw[9];
    float  pairs[16];
    int * idx = (int *)(pairs + 1);
    uint8_t hbit[QK_K/8];
    for (int ibl = 0; ibl < nbl; ++ibl) {
        y[ibl].d = GGML_FP32_TO_FP16(0.f);
        memset(y[ibl].qs, 0, QK_K/8);
        memset(y[ibl].scales, 0, QK_K/16);
        float max_scale = 0;
        const float * xbl = x + QK_K*ibl;
        float sumx2 = 0;
        for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
        float sigma2 = sumx2/QK_K;
        for (int ib = 0; ib < QK_K/8; ++ib) {
            const float * xb = xbl + 8*ib;
            const float * qw = quant_weights + QK_K*ibl + 8*ib;
            for (int i = 0; i < 8; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
            float max = fabsf(xb[0]);
            for (int i = 1; i < 8; ++i) max = MAX(max, fabsf(xb[i]));
            if (!max) {
                scales[ib] = 0;
                memset(L, 1, 8);
                continue;
            }
            // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem.
            // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two
            // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights
            // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and
            // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale
            // for each possible and score for each split.
            for (int j = 0; j < 8; ++j) {
                pairs[2*j] = xb[j];
                idx[2*j] = j;
            }
            qsort(pairs, 8, 2*sizeof(float), iq1_sort_helper);
            {
                sumx[0] = sumw[0] = 0;
                for (int j = 0; j < 8; ++j) {
                    int i = idx[2*j];
                    sumx[j+1] = sumx[j] + weight[i]*xb[i];
                    sumw[j+1] = sumw[j] + weight[i];
                }
            }
            float best_score = 0, scale = max;
            int besti1 = 0, besti2 = 0;
            for (int i1 = 0; i1 <= 8; ++i1) {
                for (int i2 = i1; i2 <= 8; ++i2) {
                    float sumqx = -(sumx[i1] - sumx[0]) + (sumx[8] - sumx[i2]);
                    float sumq2 =  (sumw[i1] - sumw[0]) + (sumw[8] - sumw[i2]);
                    if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) {
                        scale = sumqx/sumq2; best_score = scale*sumqx;
                        besti1 = i1; besti2 = i2;
                    }
                }
            }
            for (int j =      0; j < besti1; ++j) L[idx[2*j]] = 0;
            for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1;
            for (int j = besti2; j <      8; ++j) L[idx[2*j]] = 2;
            if (scale < 0) {
                for (int j = 0; j < 8; ++j) L[j] = 2 - L[j];
                scale = -scale;
            }
            // Now we check if the solution found above corresponds to a grid point and, if not, use a neighbouring
            // grid point that minimizes SSD.
            uint16_t u = 0;
            for (int j = 0; j < 8; ++j) u |= (L[j] << 2*j);
            int grid_index = kmap_q2xs[u];
            if (grid_index < 0) {
                const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
                grid_index = iq1_find_best_neighbour(neighbours, kgrid_q2xs, xb, weight, &scale, L, NGRID_IQ2XXS);
                GGML_ASSERT(grid_index >= 0);
            }
            y[ibl].qs[ib] = grid_index & 255;
            hbit[ib] = grid_index >> 8;
            GGML_ASSERT(scale >= 0);
            scales[ib] = scale;
            max_scale = MAX(max_scale, scale);
        }
        if (!max_scale) {
            memset(y[ibl].qs, 0, QK_K/8);
            continue;
        }
        float d = max_scale/15;
        y[ibl].d = GGML_FP32_TO_FP16(d*1.085f); // 1.085f is another fudge factor. Don't ask me why it is needed.
        float id = 1/d;
        for (int ib = 0; ib < QK_K/8; ++ib) {
            int l = nearest_int(0.5f*(id*scales[ib]-1));
            l = MAX(0, MIN(7, l));
            if (hbit[ib]) l |= 8;
            y[ibl].scales[ib/2] |= (l << 4*(ib%2));
        }
    }
 }
 size_t quantize_iq1_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
    (void)hist;
    GGML_ASSERT(n_per_row%QK_K == 0);
    int nblock = n_per_row/QK_K;
    char * qrow = (char *)dst;
    for (int row = 0; row < nrow; ++row) {
        quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights);
        src += n_per_row;
        qrow += nblock*sizeof(block_iq1_s);
    }
    return nrow * nblock * sizeof(block_iq1_s);
 }
--- a/ggml-quants.h
+++ b/ggml-quants.h
@ -191,6 +191,13 @@ typedef struct {
 } block_iq3_xxs;
 static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
 typedef struct {
    ggml_fp16_t d;
    uint8_t qs[QK_K/8];
    uint8_t scales[QK_K/16];
 } block_iq1_s;
 static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
 #ifdef __cplusplus
 extern "C" {
 #endif
@ -243,6 +250,7 @@ void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRI
 void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
 void dequantize_row_iq2_xs (const block_iq2_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
 void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
 void dequantize_row_iq1_s  (const block_iq1_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
 // Dot product
 void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
@ -259,6 +267,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 void ggml_vec_dot_iq1_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 //
 // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
@ -266,6 +275,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
 size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_iq1_s  (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q2_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q3_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q4_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
@ -276,8 +286,8 @@ size_t quantize_q4_1   (const float * src, void * dst, int nrows, int n_per_row,
 size_t quantize_q5_0   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q5_1   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-void iq2xs_init_impl(int grid_size);
+void iq2xs_init_impl(enum ggml_type type);
-void iq2xs_free_impl(int grid_size);
+void iq2xs_free_impl(enum ggml_type type);
 void iq3xs_init_impl(int grid_size);
 void iq3xs_free_impl(int grid_size);
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@ -9188,174 +9188,22 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
    }
 }
-static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
+template <int qk, int qi, typename block_q_t, int vdr,
-                                       float *dst, const int ncols,
+          vec_dot_q_sycl_t vec_dot_q_sycl>
-                                       const int nrows,
+static void mul_mat_vec_q_sycl_submitter(const void *vx, const void *vy,
-                                       dpct::queue_ptr stream) {
+                                         float *dst, const int ncols,
-    GGML_ASSERT(ncols % QK4_0 == 0);
+                                         const int nrows,
-    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+                                         dpct::queue_ptr stream) {
-    const sycl::range<3> block_nums(1, 1, block_num_y);
+  GGML_ASSERT(ncols % QK4_0 == 0);
-    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+  const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
-    stream->parallel_for(
+  const sycl::range<3> block_nums(1, 1, block_num_y);
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+  const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+  stream->parallel_for(
-            mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ,
+      sycl::nd_range<3>(block_nums * block_dims, block_dims), [=
-                          vec_dot_q4_0_q8_1>(vx, vy, dst, ncols, nrows,
+  ](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                                             item_ct1);
+        mul_mat_vec_q<qk, qi, block_q_t, vdr, vec_dot_q_sycl>(
-        });
+            vx, vy, dst, ncols, nrows, item_ct1);
-}
+      });
 static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
                                       float *dst, const int ncols,
                                       const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK4_1 == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    stream->parallel_for(
        sycl::nd_range<3>(block_nums * block_dims, block_dims),
        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
            mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ,
                          vec_dot_q4_1_q8_1>(vx, vy, dst, ncols, nrows,
                                             item_ct1);
        });
 }
 static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
                                       float *dst, const int ncols,
                                       const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK5_0 == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    stream->parallel_for(
        sycl::nd_range<3>(block_nums * block_dims, block_dims),
        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
            mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ,
                          vec_dot_q5_0_q8_1>(vx, vy, dst, ncols, nrows,
                                             item_ct1);
        });
 }
 static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
                                       float *dst, const int ncols,
                                       const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK5_1 == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    stream->parallel_for(
        sycl::nd_range<3>(block_nums * block_dims, block_dims),
        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
            mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ,
                          vec_dot_q5_1_q8_1>(vx, vy, dst, ncols, nrows,
                                             item_ct1);
        });
 }
 static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
                                       float *dst, const int ncols,
                                       const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK8_0 == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    stream->parallel_for(
        sycl::nd_range<3>(block_nums * block_dims, block_dims),
        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
            mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ,
                          vec_dot_q8_0_q8_1>(vx, vy, dst, ncols, nrows,
                                             item_ct1);
        });
 }
 static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
                                       float *dst, const int ncols,
                                       const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    stream->parallel_for(
        sycl::nd_range<3>(block_nums * block_dims, block_dims),
        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
            mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ,
                          vec_dot_q2_K_q8_1>(vx, vy, dst, ncols, nrows,
                                             item_ct1);
        });
 }
 static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
                                       float *dst, const int ncols,
                                       const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    stream->parallel_for(
        sycl::nd_range<3>(block_nums * block_dims, block_dims),
        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
            mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ,
                          vec_dot_q3_K_q8_1>(vx, vy, dst, ncols, nrows,
                                             item_ct1);
        });
 }
 static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
                                       float *dst, const int ncols,
                                       const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    stream->parallel_for(
        sycl::nd_range<3>(block_nums * block_dims, block_dims),
        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
            mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ,
                          vec_dot_q4_K_q8_1>(vx, vy, dst, ncols, nrows,
                                             item_ct1);
        });
 }
 static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
                                       float *dst, const int ncols,
                                       const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    stream->parallel_for(
        sycl::nd_range<3>(block_nums * block_dims, block_dims),
        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
            mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ,
                          vec_dot_q5_K_q8_1>(vx, vy, dst, ncols, nrows,
                                             item_ct1);
        });
 }
 static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
                                       float *dst, const int ncols,
                                       const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
    stream->parallel_for(
        sycl::nd_range<3>(block_nums * block_dims, block_dims),
        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
            mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ,
                          vec_dot_q6_K_q8_1>(vx, vy, dst, ncols, nrows,
                                             item_ct1);
        });
 }
 int get_device_index_by_id(int id){
@ -12095,37 +11943,63 @@ inline void ggml_sycl_op_mul_mat_vec_q(
    const int64_t ne00 = src0->ne[0];
    const int64_t row_diff = row_high - row_low;
    // TODO: support these quantization types
    GGML_ASSERT(!(src0->type == GGML_TYPE_IQ2_XXS ||
                  src0->type == GGML_TYPE_IQ2_XS ||
                  src0->type == GGML_TYPE_IQ3_XXS ||
                  src0->type == GGML_TYPE_IQ1_S));
    switch (src0->type) {
        case GGML_TYPE_Q4_0:
-            mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+          mul_mat_vec_q_sycl_submitter<QK4_0, QI4_0, block_q4_0,
-            break;
+                                       VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
              src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
          break;
        case GGML_TYPE_Q4_1:
-            mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+          mul_mat_vec_q_sycl_submitter<QK4_1, QI4_1, block_q4_1,
-            break;
+                                       VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
              src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
          break;
        case GGML_TYPE_Q5_0:
-            mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+          mul_mat_vec_q_sycl_submitter<QK5_0, QI5_0, block_q5_0,
-            break;
+                                       VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
              src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
          break;
        case GGML_TYPE_Q5_1:
-            mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+          mul_mat_vec_q_sycl_submitter<QK5_1, QI5_1, block_q5_1,
-            break;
+                                       VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
              src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
          break;
        case GGML_TYPE_Q8_0:
-            mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+          mul_mat_vec_q_sycl_submitter<QK8_0, QI8_0, block_q8_0,
-            break;
+                                       VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
              src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
          break;
        case GGML_TYPE_Q2_K:
-            mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+          mul_mat_vec_q_sycl_submitter<QK_K, QI2_K, block_q2_K,
-            break;
+                                       VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
              src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
          break;
        case GGML_TYPE_Q3_K:
-            mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+          mul_mat_vec_q_sycl_submitter<QK_K, QI3_K, block_q3_K,
-            break;
+                                       VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
              src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
          break;
        case GGML_TYPE_Q4_K:
-            mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+          mul_mat_vec_q_sycl_submitter<QK_K, QI4_K, block_q4_K,
-            break;
+                                       VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
              src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
          break;
        case GGML_TYPE_Q5_K:
-            mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+          mul_mat_vec_q_sycl_submitter<QK_K, QI5_K, block_q5_K,
-            break;
+                                       VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
              src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
          break;
        case GGML_TYPE_Q6_K:
-            mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+          mul_mat_vec_q_sycl_submitter<QK_K, QI6_K, block_q6_K,
-            break;
+                                       VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
              src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
          break;
        default:
            GGML_ASSERT(false);
            break;
@ -12145,7 +12019,7 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
    const int64_t src1_ncols, const int64_t src1_padded_row_size,
    const dpct::queue_ptr &stream) {
-    GGML_TENSOR_BINARY_OP_LOCALS
+    GGML_TENSOR_BINARY_OP_LOCALS;
    const int64_t row_diff = row_high - row_low;
@ -15093,6 +14967,12 @@ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_ten
                    return false;
                }
                if (a->type == GGML_TYPE_IQ1_S) {
                    return false;
                }
                if (a->type == GGML_TYPE_IQ3_XXS) {
                  return false;
                }
                if (a->type == GGML_TYPE_IQ2_XXS) {
                    return false;
                }
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@ -1091,7 +1091,10 @@ static void ggml_vk_print_gpu_info(size_t idx) {
    }
 }
-static void ggml_vk_instance_init() {
+static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
 static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
 void ggml_vk_instance_init() {
    if (vk_instance_initialized) {
        return;
    }
@ -1100,28 +1103,42 @@ static void ggml_vk_instance_init() {
 #endif
    vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
    const std::vector<const char*> layers = {
 #ifdef GGML_VULKAN_VALIDATE
        "VK_LAYER_KHRONOS_validation",
 #endif
    };
    const std::vector<const char*> extensions = {
 #ifdef GGML_VULKAN_VALIDATE
        "VK_EXT_validation_features",
 #endif
    };
    vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags(), &app_info, layers, extensions);
 #ifdef GGML_VULKAN_VALIDATE
    const std::vector<vk::ValidationFeatureEnableEXT> features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
    vk::ValidationFeaturesEXT validation_features = {
        features_enable,
        {},
    };
    validation_features.setPNext(nullptr);
    instance_create_info.setPNext(&validation_features);
-    std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
+    const std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();
-#endif
+    const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions);
    const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
    std::vector<const char*> layers;
    if (validation_ext) {
        layers.push_back("VK_LAYER_KHRONOS_validation");
    }
    std::vector<const char*> extensions;
    if (validation_ext) {
        extensions.push_back("VK_EXT_validation_features");
    }
    if (portability_enumeration_ext) {
        extensions.push_back("VK_KHR_portability_enumeration");
    }
    vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
    if (portability_enumeration_ext) {
        instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
    }
    std::vector<vk::ValidationFeatureEnableEXT> features_enable;
    vk::ValidationFeaturesEXT validation_features;
    if (validation_ext) {
        features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
        validation_features = {
            features_enable,
            {},
        };
        validation_features.setPNext(nullptr);
        instance_create_info.setPNext(&validation_features);
        std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
    }
    vk_instance.instance = vk::createInstance(instance_create_info);
    memset(vk_instance.initialized, 0, sizeof(bool) * GGML_VK_MAX_DEVICES);
@ -1168,12 +1185,12 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
    vk_instance.devices[idx] = std::make_shared<vk_device>();
    ctx->device = vk_instance.devices[idx];
    ctx->device.lock()->physical_device = devices[dev_num];
-    std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties();
+    const std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties();
    bool maintenance4_support = false;
    // Check if maintenance4 is supported
-    for (auto properties : ext_props) {
+    for (const auto& properties : ext_props) {
        if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
            maintenance4_support = true;
        }
@ -1204,7 +1221,7 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
    bool fp16_storage = false;
    bool fp16_compute = false;
-    for (auto properties : ext_props) {
+    for (const auto& properties : ext_props) {
        if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
            fp16_storage = true;
        } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
@ -5301,6 +5318,42 @@ GGML_CALL int ggml_backend_vk_reg_devices() {
    return vk_instance.device_indices.size();
 }
 // Extension availability
 static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
 #ifdef GGML_VULKAN_VALIDATE
    bool portability_enumeration_ext = false;
    // Check for portability enumeration extension for MoltenVK support
    for (const auto& properties : instance_extensions) {
        if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
            return true;
        }
    }
    if (!portability_enumeration_ext) {
        std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
    }
 #endif
    return false;
    UNUSED(instance_extensions);
 }
 static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
 #ifdef __APPLE__
    bool portability_enumeration_ext = false;
    // Check for portability enumeration extension for MoltenVK support
    for (const auto& properties : instance_extensions) {
        if (strcmp("VK_KHR_portability_enumeration", properties.extensionName) == 0) {
            return true;
        }
    }
    if (!portability_enumeration_ext) {
        std::cerr << "ggml_vulkan: WARNING: Instance extension VK_KHR_portability_enumeration not found." << std::endl;
    }
 #endif
    return false;
    UNUSED(instance_extensions);
 }
 // checks
 #ifdef GGML_VULKAN_CHECK_RESULTS
--- a/ggml.c
+++ b/ggml.c
@ -23,6 +23,9 @@
 #include <limits.h>
 #include <stdarg.h>
 #include <signal.h>
 #if defined(__gnu_linux__)
 #include <syscall.h>
 #endif
 #ifdef GGML_USE_METAL
 #include <unistd.h>
@ -270,6 +273,8 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #include <Accelerate/Accelerate.h>
 #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
 #include "ggml-opencl.h"
 #elif defined(GGML_USE_VULKAN)
 #include "ggml-vulkan.h"
 #endif
 #elif defined(GGML_USE_OPENBLAS)
 #if defined(GGML_BLAS_USE_MKL)
@ -673,6 +678,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
        .vec_dot_type             = GGML_TYPE_Q8_K,
        .nrows                    = 1,
    },
    [GGML_TYPE_IQ1_S] = {
        .type_name                = "iq1_s",
        .blck_size                = QK_K,
        .type_size                = sizeof(block_iq1_s),
        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_iq1_s,
        .from_float               = NULL,
        .from_float_reference     = NULL,
        .vec_dot                  = ggml_vec_dot_iq1_s_q8_K,
        .vec_dot_type             = GGML_TYPE_Q8_K,
        .nrows                    = 1,
    },
    [GGML_TYPE_Q8_K] = {
        .type_name                = "q8_K",
        .blck_size                = QK_K,
@ -868,7 +885,7 @@ do {                                                              \
    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]),    \
                                 _mm256_extractf128_ps(x[0], 1)); \
    const __m128 t1 = _mm_hadd_ps(t0, t0);                        \
-    res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1));                     \
+    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1));        \
 } while (0)
 // TODO: is this optimal ?
@ -1149,7 +1166,7 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
    }                                                             \
    const __m128 t0 = _mm_hadd_ps(x[0], x[0]);                    \
-    res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0));                     \
+    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0));        \
 }
 // TODO: is this optimal ?
@ -1959,7 +1976,7 @@ struct ggml_numa_nodes {
    uint32_t n_nodes;
    uint32_t total_cpus; // hardware threads on system
    uint32_t current_node; // node on which main process is execting
-#ifdef __linux__
+#if defined(__gnu_linux__)
    cpu_set_t cpuset; // cpuset from numactl
 #else
    uint32_t cpuset; // no NUMA support outside of Linux at this time. Use a portable datatype
@ -1997,7 +2014,7 @@ inline static void ggml_critical_section_end(void) {
    atomic_fetch_sub(&g_state_barrier, 1);
 }
-#ifdef __linux__
+#if defined(__gnu_linux__)
 static cpu_set_t ggml_get_numa_affinity(void) {
    cpu_set_t cpuset;
    pthread_t thread;
@ -2019,7 +2036,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
        return;
    }
-#ifdef __linux__
+#if defined(__gnu_linux__)
    struct stat st;
    char path[256];
    int rv;
@ -2051,7 +2068,13 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
    // figure out which node we're on
    uint current_cpu;
-    int getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
+    int getcpu_ret = 0;
 #if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28)
    getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
 #else
    // old glibc doesn't have a wrapper for this call. Fall back on direct syscall
    getcpu_ret = syscall(SYS_getcpu,&current_cpu,&g_state.numa.current_node);
 #endif
    if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
        g_state.numa.n_nodes = 0;
@ -2086,6 +2109,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
        }
    }
 #else
    GGML_UNUSED(numa_flag);
    // TODO
 #endif
 }
@ -2266,6 +2290,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
        case GGML_FTYPE_MOSTLY_IQ2_XXS:       wtype = GGML_TYPE_IQ2_XXS;  break;
        case GGML_FTYPE_MOSTLY_IQ2_XS:        wtype = GGML_TYPE_IQ2_XS;   break;
        case GGML_FTYPE_MOSTLY_IQ3_XXS:       wtype = GGML_TYPE_IQ3_XXS;  break;
        case GGML_FTYPE_MOSTLY_IQ1_S:         wtype = GGML_TYPE_IQ1_S;    break;
        case GGML_FTYPE_UNKNOWN:              wtype = GGML_TYPE_COUNT; break;
        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
    }
@ -3219,7 +3244,7 @@ const char * ggml_get_name(const struct ggml_tensor * tensor) {
 }
 struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
-    strncpy(tensor->name, name, sizeof(tensor->name));
+    strncpy(tensor->name, name, sizeof(tensor->name) - 1);
    tensor->name[sizeof(tensor->name) - 1] = '\0';
    return tensor;
 }
@ -5095,16 +5120,28 @@ static struct ggml_tensor * ggml_soft_max_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * mask,
        struct ggml_tensor  * pos,
        float                 scale,
        float                 max_bias,
        bool                  inplace) {
    GGML_ASSERT(ggml_is_contiguous(a));
    if (mask) {
        GGML_ASSERT(ggml_is_contiguous(mask));
-        GGML_ASSERT(mask->ne[2] == 1);
+        GGML_ASSERT(ggml_is_matrix(mask));
        GGML_ASSERT(mask->ne[3] == 1);
        GGML_ASSERT(ggml_can_repeat_rows(mask, a));
    }
    if (pos) {
        GGML_ASSERT(ggml_is_vector(pos));
        GGML_ASSERT(pos->type == GGML_TYPE_F32);
        GGML_ASSERT(pos->ne[0] == a->ne[0]);
    }
    if (max_bias > 0.0f) {
        GGML_ASSERT(pos);
    }
    bool is_node = false;
    if (a->grad) {
@ -5113,13 +5150,14 @@ static struct ggml_tensor * ggml_soft_max_impl(
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-    float params[] = { scale };
+    float params[] = { scale, max_bias };
    ggml_set_op_params(result, params, sizeof(params));
    result->op   = GGML_OP_SOFT_MAX;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
    result->src[0] = a;
    result->src[1] = mask;
    result->src[2] = pos;
    return result;
 }
@ -5127,21 +5165,23 @@ static struct ggml_tensor * ggml_soft_max_impl(
 struct ggml_tensor * ggml_soft_max(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
-    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, false);
+    return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, false);
 }
 struct ggml_tensor * ggml_soft_max_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
-    return ggml_soft_max_impl(ctx, a, NULL, 1.0f, true);
+    return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, true);
 }
 struct ggml_tensor * ggml_soft_max_ext(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * mask,
-        float                 scale) {
+        struct ggml_tensor  * pos,
-    return ggml_soft_max_impl(ctx, a, mask, scale, false);
+        float                 scale,
        float                 max_bias) {
    return ggml_soft_max_impl(ctx, a, mask, pos, scale, max_bias, false);
 }
 // ggml_soft_max_back
@ -7661,6 +7701,7 @@ static void ggml_compute_forward_add(
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ1_S:
            {
                ggml_compute_forward_add_q_f32(params, src0, src1, dst);
            } break;
@ -7928,6 +7969,7 @@ static void ggml_compute_forward_add1(
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ1_S:
            {
                ggml_compute_forward_add1_q_f32(params, src0, src1, dst);
            } break;
@ -8048,6 +8090,7 @@ static void ggml_compute_forward_acc(
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ1_S:
        default:
            {
                GGML_ASSERT(false);
@ -10814,6 +10857,7 @@ static void ggml_compute_forward_out_prod(
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ1_S:
            {
                ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst);
            } break;
@ -10994,6 +11038,7 @@ static void ggml_compute_forward_set(
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ1_S:
        default:
            {
                GGML_ASSERT(false);
@ -11191,6 +11236,7 @@ static void ggml_compute_forward_get_rows(
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ1_S:
            {
                ggml_compute_forward_get_rows_q(params, src0, src1, dst);
            } break;
@ -11494,6 +11540,7 @@ static void ggml_compute_forward_soft_max_f32(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
        const struct ggml_tensor * src2,
              struct ggml_tensor * dst) {
    assert(ggml_is_contiguous(dst));
    assert(ggml_are_same_shape(src0, dst));
@ -11502,16 +11549,29 @@ static void ggml_compute_forward_soft_max_f32(
        return;
    }
-    float scale = 1.0f;
+    float scale    = 1.0f;
-    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+    float max_bias = 0.0f;
    memcpy(&scale,    (float *) dst->op_params + 0, sizeof(float));
    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
    // TODO: handle transposed/permuted matrices
    const int ith = params->ith;
    const int nth = params->nth;
    GGML_TENSOR_UNARY_OP_LOCALS
    const int64_t ne11 = src1 ? src1->ne[1] : 1;
    // TODO: is this supposed to be ceil instead of floor?
    //       https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
    const uint32_t n_head_kv   = ne02;
    const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head_kv));
    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
    const int nc = src0->ne[0];
    const int nr = ggml_nrows(src0);
@ -11524,6 +11584,9 @@ static void ggml_compute_forward_soft_max_f32(
    float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
    // when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching
    float * pos = src2 ? (float *) src2->data : src0->data;
    for (int i1 = ir0; i1 < ir1; i1++) {
        float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
        float * dp = (float *)((char *)  dst->data +  i1*dst->nb[1]);
@ -11537,6 +11600,16 @@ static void ggml_compute_forward_soft_max_f32(
            ggml_vec_acc_f32(nc, wp, mp);
        }
        // ALiBi bias
        if (max_bias > 0.0f) {
            const uint32_t h  = (i1/ne01)%ne02; // head
            const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
            for (int i = 0; i < nc; i++) {
                wp[i] = wp[i] + slope*pos[i];
            }
        }
 #ifndef NDEBUG
        for (int i = 0; i < nc; ++i) {
            //printf("p[%d] = %f\n", i, p[i]);
@ -11581,11 +11654,12 @@ static void ggml_compute_forward_soft_max(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
        const struct ggml_tensor * src2,
              struct ggml_tensor * dst) {
    switch (src0->type) {
        case GGML_TYPE_F32:
            {
-                ggml_compute_forward_soft_max_f32(params, src0, src1, dst);
+                ggml_compute_forward_soft_max_f32(params, src0, src1, src2, dst);
            } break;
        default:
            {
@ -11729,22 +11803,20 @@ static void ggml_compute_forward_alibi_f32(
    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-    for (int64_t i = 0; i < ne0; i++) {
+    for (int64_t k = 0; k < ne2_ne3; k++) {
-        for (int64_t j = 0; j < ne1; j++) {
+        // TODO: k*nb2 or k*nb3
-            for (int64_t k = 0; k < ne2_ne3; k++) {
+        float m_k;
        if (k < n_heads_log2_floor) {
            m_k = powf(m0, k + 1);
        } else {
            m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
        }
        for (int64_t i = 0; i < ne0; i++) {
            for (int64_t j = 0; j < ne1; j++) {
                float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
                float *      pdst = (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
                // TODO: k*nb2 or k*nb3
                float m_k;
                if (k < n_heads_log2_floor) {
                    m_k = powf(m0, k + 1);
                } else {
                    m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
                }
                pdst[0] = i * m_k + src[0];
            }
        }
@ -11789,21 +11861,20 @@ static void ggml_compute_forward_alibi_f16(
    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-    for (int i = 0; i < ne0; i++) {
+    for (int k = 0; k < ne2_ne3; k++) {
-        for (int j = 0; j < ne1; j++) {
+        // TODO: k*nb2 or k*nb3
-            for (int k = 0; k < ne2_ne3; k++) {
+        float m_k;
        if (k < n_heads_log2_floor) {
            m_k = powf(m0, k + 1);
        } else {
            m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
        }
        for (int i = 0; i < ne0; i++) {
            for (int j = 0; j < ne1; j++) {
                ggml_fp16_t * const src  = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
-                      float *      pdst  =       (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
+                float       *      pdst  =       (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
                // TODO: k*nb2 or k*nb3
                float m_k;
                if (k < n_heads_log2_floor) {
                    m_k = powf(m0, k + 1);
                } else {
                    m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
                }
                // we return F32
                pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
@ -11839,6 +11910,7 @@ static void ggml_compute_forward_alibi(
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ1_S:
        case GGML_TYPE_Q8_K:
        case GGML_TYPE_I8:
        case GGML_TYPE_I16:
@ -11916,6 +11988,7 @@ static void ggml_compute_forward_clamp(
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ1_S:
        case GGML_TYPE_Q8_K:
        case GGML_TYPE_I8:
        case GGML_TYPE_I16:
@ -15115,7 +15188,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            } break;
        case GGML_OP_SOFT_MAX:
            {
-                ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor);
+                ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
            } break;
        case GGML_OP_SOFT_MAX_BACK:
            {
@ -16672,7 +16745,7 @@ typedef pthread_t ggml_thread_t;
 #endif
 // Android's libc implementation "bionic" does not support setting affinity
-#if defined(__linux__) && !defined(__BIONIC__)
+#if defined(__gnu_linux__)
 static void set_numa_thread_affinity(int thread_n) {
    if (!ggml_is_numa()) {
        return;
@ -17847,7 +17920,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
                ptr += ggml_nbytes(tensor);
-                fprintf(stderr, "%s: loaded leaf %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
+                fprintf(stderr, "%s: loaded leaf %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
            }
        }
@ -17950,7 +18023,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
                result->nodes[i] = tensor;
-                fprintf(stderr, "%s: loaded node %d: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
+                fprintf(stderr, "%s: loaded node %u: '%16s', %9zu bytes\n", __func__, i, tensor->name, ggml_nbytes(tensor));
            }
        }
    }
@ -18575,7 +18648,9 @@ static enum ggml_opt_result linesearch_backtracking(
        (*step) *= width;
    }
-    GGML_UNREACHABLE();
+    GGML_ASSERT(false && "line search failed");
    return GGML_LINESEARCH_FAIL;
 }
 static enum ggml_opt_result ggml_opt_lbfgs(
@ -18843,7 +18918,9 @@ static enum ggml_opt_result ggml_opt_lbfgs(
        step[0] = 1.0;
    }
-    GGML_UNREACHABLE();
+    GGML_ASSERT(false && "lbfgs failed");
    return GGML_OPT_DID_NOT_CONVERGE;
 }
 struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
@ -19091,8 +19168,9 @@ void ggml_quantize_init(enum ggml_type type) {
    ggml_critical_section_start();
    switch (type) {
-        case GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break;
+        case GGML_TYPE_IQ2_XXS:
-        case GGML_TYPE_IQ2_XS:  iq2xs_init_impl(512); break;
+        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ1_S:   iq2xs_init_impl(type); break;
        case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
        default: // nothing
            break;
@ -19104,8 +19182,10 @@ void ggml_quantize_init(enum ggml_type type) {
 void ggml_quantize_free(void) {
    ggml_critical_section_start();
-    iq2xs_free_impl(256);
+    iq2xs_free_impl(GGML_TYPE_IQ2_XXS);
-    iq2xs_free_impl(512);
+    iq2xs_free_impl(GGML_TYPE_IQ2_XS);
    iq2xs_free_impl(GGML_TYPE_IQ1_S);
    iq3xs_free_impl(256);
    ggml_critical_section_end();
 }
@ -19240,7 +19320,8 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
 bool ggml_quantize_requires_imatrix(enum ggml_type type) {
    return
        type == GGML_TYPE_IQ2_XXS ||
-        type == GGML_TYPE_IQ2_XS;
+        type == GGML_TYPE_IQ2_XS  ||
        type == GGML_TYPE_IQ1_S;
 }
 size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
@ -19365,6 +19446,15 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
                result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
                GGML_ASSERT(result == row_size * nrows);
            } break;
        case GGML_TYPE_IQ1_S:
            {
                GGML_ASSERT(start % QK_K == 0);
                GGML_ASSERT(start % n_per_row == 0);
                size_t start_row = start / n_per_row;
                size_t row_size = ggml_row_size(type, n_per_row);
                result = quantize_iq1_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
                GGML_ASSERT(result == row_size * nrows);
            } break;
        case GGML_TYPE_F16:
            {
                size_t elemsize = sizeof(ggml_fp16_t);
--- a/ggml.h
+++ b/ggml.h
@ -361,6 +361,7 @@ extern "C" {
        GGML_TYPE_IQ2_XXS = 16,
        GGML_TYPE_IQ2_XS  = 17,
        GGML_TYPE_IQ3_XXS = 18,
        GGML_TYPE_IQ1_S   = 19,
        GGML_TYPE_I8,
        GGML_TYPE_I16,
        GGML_TYPE_I32,
@ -398,6 +399,7 @@ extern "C" {
        GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
        GGML_FTYPE_MOSTLY_IQ2_XS  = 16, // except 1d tensors
        GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
        GGML_FTYPE_MOSTLY_IQ1_S   = 18, // except 1d tensors
    };
    // available tensor operations:
@ -1390,13 +1392,17 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
-    // fused soft_max(a*scale + mask)
+    // fused soft_max(a*scale + mask + pos[i]*(ALiBi slope))
    // mask is optional
    // pos is required when max_bias > 0.0f
    // max_bias = 0.0f for no ALiBi
    GGML_API struct ggml_tensor * ggml_soft_max_ext(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * mask,
-            float                 scale);
+            struct ggml_tensor  * pos,
            float                 scale,
            float                 max_bias);
    GGML_API struct ggml_tensor * ggml_soft_max_back(
            struct ggml_context * ctx,
@ -1498,12 +1504,13 @@ extern "C" {
    // alibi position embedding
    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_alibi(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   n_past,
            int                   n_head,
-            float                 bias_max);
+            float                 bias_max),
        "use ggml_soft_max_ext instead (will be removed in Mar 2024)");
    // clamp
    // in-place, returns view(a)
--- a/llama.cpp
+++ b/llama.cpp
@ -1585,12 +1585,13 @@ struct llama_hparams {
    uint32_t n_yarn_orig_ctx;
    int32_t  rope_scaling_type_train;
-    float f_clamp_kqv;
+    float f_clamp_kqv      = 0.0f;
-    float f_max_alibi_bias;
+    float f_max_alibi_bias = 0.0f;
    bool causal_attn = true;
-    uint32_t pooling_type = LLAMA_POOLING_NONE;
+    bool need_kq_pos = false;
    uint32_t pooling_type = LLAMA_POOLING_NONE;
    bool operator!=(const llama_hparams & other) const {
        if (this->vocab_only    != other.vocab_only)    return true;
@ -1955,6 +1956,7 @@ struct llama_context {
    struct ggml_tensor * inp_embd;      // F32 [n_embd, n_batch]
    struct ggml_tensor * inp_pos;       // I32 [n_batch]
    struct ggml_tensor * inp_KQ_mask;   // F32 [n_ctx, n_batch]
    struct ggml_tensor * inp_KQ_pos;    // F32 [n_ctx]
    struct ggml_tensor * inp_K_shift;   // I32 [n_ctx]
    struct ggml_tensor * inp_mean;      // F32 [n_batch, n_batch]
    struct ggml_tensor * inp_cls;       // I32 [n_batch]
@ -2557,6 +2559,7 @@ struct llama_model_loader {
                case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
                case GGML_TYPE_IQ2_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS;  break;
                case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
                case GGML_TYPE_IQ1_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ1_S;   break;
                default:
                    {
                        LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@ -2919,6 +2922,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
        case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
        case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
        case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ1_S  :return "IQ1_S - 1.5625 bpw";
        default: return "unknown, may not work";
    }
@ -3100,6 +3104,11 @@ static void llm_load_hparams(
                    case 40: model.type = e_model::MODEL_13B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
                if (model.type == e_model::MODEL_13B) {
                    // TODO: become GGUF KV parameter
                    hparams.f_max_alibi_bias = 8.0f;
                }
            } break;
        case LLM_ARCH_STARCODER:
            {
@ -3127,6 +3136,9 @@ static void llm_load_hparams(
                    case 32: model.type = e_model::MODEL_1B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
                // TODO: become GGUF KV parameter
                hparams.f_max_alibi_bias = 8.0f;
            } break;
        case LLM_ARCH_BERT:
            {
@ -3172,11 +3184,12 @@ static void llm_load_hparams(
                            case 4096: model.type = e_model::MODEL_7B; break;
                        } break;
                }
                // TODO: become GGUF KV parameter
                hparams.f_max_alibi_bias = 8.0f;
            } break;
        case LLM_ARCH_MPT:
            {
                hparams.f_clamp_kqv = 0.0f;
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,      hparams.f_clamp_kqv, false);
                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
@ -3278,6 +3291,10 @@ static void llm_load_hparams(
    }
    model.ftype = ml.ftype;
    if (hparams.f_max_alibi_bias > 0.0f) {
        hparams.need_kq_pos = true;
    }
 }
 // TODO: This should probably be in llama.h
@ -4846,10 +4863,10 @@ static struct ggml_tensor * llm_build_kqv(
         struct ggml_tensor * wo_b,
         struct ggml_tensor * q_cur,
         struct ggml_tensor * kq_mask,
         struct ggml_tensor * kq_pos,
                    int64_t   n_ctx,
                    int32_t   n_tokens,
                    int32_t   n_kv,
                    float     max_alibi_bias,
                    float     kq_scale,
         const llm_build_cb & cb,
                    int       il) {
@ -4879,26 +4896,26 @@ static struct ggml_tensor * llm_build_kqv(
        ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
    }
-    if (max_alibi_bias > 0.0f) {
+#if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_SYCL)
-        // temporary branch until we figure out how to handle ggml_alibi through ggml_add
+#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, Kompute, and SYCL")
 #pragma message("      Falling back to ggml_alibi(). Will become an error in Mar 2024")
 #pragma message("ref:  https://github.com/ggerganov/llama.cpp/pull/5488")
    if (hparams.f_max_alibi_bias > 0.0f) {
        kq = ggml_scale(ctx, kq, kq_scale);
        cb(kq, "kq_scaled", il);
-        if (max_alibi_bias > 0.0f) {
+        kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
-            // TODO: n_head or n_head_kv
+        cb(kq, "kq_scaled_alibi", il);
            // TODO: K-shift is likely not working
            // TODO: change to ggml_add
            kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
            cb(kq, "kq_scaled_alibi", il);
        }
        kq = ggml_add(ctx, kq, kq_mask);
        cb(kq, "kq_masked", il);
        kq = ggml_soft_max(ctx, kq);
        cb(kq, "kq_soft_max", il);
-    } else {
+    } else
-        kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);
+#endif
    {
        kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
        cb(kq, "kq_soft_max_ext", il);
    }
@ -4946,11 +4963,11 @@ static struct ggml_tensor * llm_build_kv(
         struct ggml_tensor * v_cur,
         struct ggml_tensor * q_cur,
         struct ggml_tensor * kq_mask,
         struct ggml_tensor * kq_pos,
                    int64_t   n_ctx,
                    int32_t   n_tokens,
                    int32_t   kv_head,
                    int32_t   n_kv,
                    float     max_alibi_bias,
                    float     kq_scale,
         const llm_build_cb & cb,
                    int       il) {
@ -4964,9 +4981,8 @@ static struct ggml_tensor * llm_build_kv(
    llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
    struct ggml_tensor * cur;
-    cur  = llm_build_kqv(ctx, model, hparams, kv, graph,
+    cur  = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
-            wo, wo_b,
+            q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
            q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il);
    cb(cur, "kqv_out", il);
    return cur;
@ -5134,7 +5150,7 @@ struct llm_build_context {
                }
                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens), inp_pos,
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
                    hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
                    ext_factor, attn_factor, beta_fast, beta_slow
                );
@ -5149,7 +5165,7 @@ struct llm_build_context {
                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -5279,6 +5295,10 @@ struct llm_build_context {
        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
        cb(KQ_mask, "KQ_mask", -1);
        // positions of the tokens in the KV cache
        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
        cb(KQ_pos, "KQ_pos", -1);
        // shift the entire K-cache if needed
        if (do_rope_shift) {
            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
@ -5327,12 +5347,9 @@ struct llm_build_context {
                cb(Kcur, "Kcur", il);
                // apply ALiBi for 13B model
                const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                        model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -5456,7 +5473,7 @@ struct llm_build_context {
                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                        model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -5555,7 +5572,7 @@ struct llm_build_context {
                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -5760,7 +5777,7 @@ struct llm_build_context {
                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -5822,6 +5839,10 @@ struct llm_build_context {
        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
        cb(KQ_mask, "KQ_mask", -1);
        // positions of the tokens in the KV cache
        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
        cb(KQ_pos, "KQ_pos", -1);
        for (int il = 0; il < n_layer; ++il) {
            struct ggml_tensor * inpSA = inpL;
@ -5849,7 +5870,7 @@ struct llm_build_context {
                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                        model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -5950,7 +5971,7 @@ struct llm_build_context {
                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            } else {
                // compute Q and K and RoPE them
@ -5981,7 +6002,7 @@ struct llm_build_context {
                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -6057,6 +6078,10 @@ struct llm_build_context {
        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
        cb(KQ_mask, "KQ_mask", -1);
        // positions of the tokens in the KV cache
        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
        cb(KQ_pos, "KQ_pos", -1);
        inpL = llm_build_norm(ctx0, inpL, hparams,
                model.tok_norm,
                model.tok_norm_b,
@ -6090,7 +6115,7 @@ struct llm_build_context {
                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -6150,6 +6175,10 @@ struct llm_build_context {
        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
        cb(KQ_mask, "KQ_mask", -1);
        // positions of the tokens in the KV cache
        struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
        cb(KQ_pos, "KQ_pos", -1);
        for (int il = 0; il < n_layer; ++il) {
            struct ggml_tensor * attn_norm;
@ -6183,7 +6212,7 @@ struct llm_build_context {
                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                        model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -6305,7 +6334,7 @@ struct llm_build_context {
                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                        model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -6420,7 +6449,7 @@ struct llm_build_context {
                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                        model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -6541,7 +6570,7 @@ struct llm_build_context {
                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -6668,7 +6697,7 @@ struct llm_build_context {
                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f, cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
                cb(cur, "kqv_out", il);
            }
@ -6771,7 +6800,7 @@ struct llm_build_context {
                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                        model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
            struct ggml_tensor * sa_out = cur;
@ -6870,7 +6899,7 @@ struct llm_build_context {
                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -6979,7 +7008,7 @@ struct llm_build_context {
                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -7097,7 +7126,7 @@ struct llm_build_context {
                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                        model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -7216,7 +7245,7 @@ struct llm_build_context {
                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -7348,7 +7377,7 @@ struct llm_build_context {
                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
                cb(cur, "kqv_out", il);
            }
@ -7579,6 +7608,18 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
        }
    }
    if (hparams.need_kq_pos) {
        const int64_t n_kv = kv_self.n;
        assert(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
        float * data = (float *) lctx.inp_KQ_pos->data;
        for (int i = 0; i < n_kv; ++i) {
            data[i] = float(lctx.kv_self.cells[i].pos);
        }
    }
    if (kv_self.has_shift) {
        const int64_t n_ctx = cparams.n_ctx;
@ -10596,20 +10637,20 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
        if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
            new_type = GGML_TYPE_Q8_0;
        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
            new_type = GGML_TYPE_Q5_K;
        }
        else if (new_type != GGML_TYPE_Q8_0) {
            new_type = GGML_TYPE_Q6_K;
        }
    } else if (name == "token_embd.weight") {
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
            new_type = GGML_TYPE_Q2_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
            new_type = GGML_TYPE_Q4_K;
        }
-    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
        if (name.find("attn_v.weight") != std::string::npos) {
            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
            else new_type = GGML_TYPE_Q2_K;
@ -10619,6 +10660,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
            if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
            ++qs.i_ffn_down;
        }
        else if (name.find("attn_output.weight") != std::string::npos) {
            if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
        }
    } else if (name.find("attn_v.weight") != std::string::npos) {
        if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
@ -10752,7 +10796,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
        new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
-        new_type == GGML_TYPE_IQ3_XXS) {
+        new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
        int nx = tensor->ne[0];
        int ny = tensor->ne[1];
        if (nx % QK_K != 0) {
@ -10767,6 +10811,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
            case GGML_TYPE_IQ2_XXS:
            case GGML_TYPE_IQ2_XS:
            case GGML_TYPE_IQ3_XXS:
            case GGML_TYPE_IQ1_S:
            case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
            case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
            case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
@ -10809,6 +10854,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XS:  quantized_type = GGML_TYPE_IQ2_XS;  break;
        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
        case LLAMA_FTYPE_MOSTLY_IQ1_S:   quantized_type = GGML_TYPE_IQ1_S  ; break;
        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
    }
@ -10982,6 +11028,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            }
            if ((new_type == GGML_TYPE_IQ2_XXS ||
                 new_type == GGML_TYPE_IQ2_XS  ||
                 new_type == GGML_TYPE_IQ1_S   ||
                (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
                LLAMA_LOG_ERROR("\n\n============================================================\n");
                LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
@ -11747,7 +11794,7 @@ struct llama_context * llama_new_context_with_model(
        // graph inputs
        {
            ggml_init_params init_params = {
-                /* .mem_size   */ ggml_tensor_overhead()*7,
+                /* .mem_size   */ ggml_tensor_overhead()*8,
                /* .mem_buffer */ nullptr,
                /* .no_alloc   */ true,
            };
@ -11757,6 +11804,7 @@ struct llama_context * llama_new_context_with_model(
            ctx->inp_embd    = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
            ctx->inp_pos     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
            ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
            ctx->inp_KQ_pos  = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
            ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
            ctx->inp_mean    = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
            ctx->inp_cls     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
@ -11765,6 +11813,7 @@ struct llama_context * llama_new_context_with_model(
            ggml_set_name(ctx->inp_embd,    "inp_embd");
            ggml_set_name(ctx->inp_pos,     "inp_pos");
            ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
            ggml_set_name(ctx->inp_KQ_pos,  "inp_KQ_pos");
            ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
            ggml_set_name(ctx->inp_mean,    "inp_mean");
            ggml_set_name(ctx->inp_cls,     "inp_cls");
@ -12787,6 +12836,123 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
    return 0;
 }
 // trim whitespace from the beginning and end of a string
 static std::string trim(const std::string & str) {
    size_t start = 0;
    size_t end = str.size();
    while (start < end && isspace(str[start])) {
        start += 1;
    }
    while (end > start && isspace(str[end - 1])) {
        end -= 1;
    }
    return str.substr(start, end - start);
 }
 // Simple version of "llama_apply_chat_template" that only works with strings
 // This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
 static int32_t llama_chat_apply_template_internal(
    const std::string & tmpl,
    const std::vector<const llama_chat_message *> & chat,
    std::string & dest, bool add_ass) {
    // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
    std::stringstream ss;
    if (tmpl.find("<|im_start|>") != std::string::npos) {
        // chatml template
        for (auto message : chat) {
            ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
        }
        if (add_ass) {
            ss << "<|im_start|>assistant\n";
        }
    } else if (tmpl.find("[INST]") != std::string::npos) {
        // llama2 template and its variants
        // [variant] support system message
        bool support_system_message = tmpl.find("<<SYS>>") != std::string::npos;
        // [variant] space before + after response
        bool space_around_response = tmpl.find("' ' + eos_token") != std::string::npos;
        // [variant] add BOS inside history
        bool add_bos_inside_history = tmpl.find("bos_token + '[INST]") != std::string::npos;
        // [variant] trim spaces from the input message
        bool strip_message = tmpl.find("content.strip()") != std::string::npos;
        // construct the prompt
        bool is_inside_turn = true; // skip BOS at the beginning
        ss << "[INST] ";
        for (auto message : chat) {
            std::string content = strip_message ? trim(message->content) : message->content;
            std::string role(message->role);
            if (!is_inside_turn) {
                is_inside_turn = true;
                ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
            }
            if (role == "system") {
                if (support_system_message) {
                    ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
                } else {
                    // if the model does not support system message, we still include it in the first message, but without <<SYS>>
                    ss << content << "\n";
                }
            } else if (role == "user") {
                ss << content << " [/INST]";
            } else {
                ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>";
                is_inside_turn = false;
            }
        }
        // llama2 templates seem to not care about "add_generation_prompt"
    } else if (tmpl.find("<|user|>") != std::string::npos) {
        // zephyr template
        for (auto message : chat) {
            ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
        }
        if (add_ass) {
            ss << "<|assistant|>\n";
        }
    } else {
        // template not supported
        return -1;
    }
    dest = ss.str();
    return dest.size();
 }
 LLAMA_API int32_t llama_chat_apply_template(
                const struct llama_model * model,
                              const char * tmpl,
         const struct llama_chat_message * chat,
                                  size_t   n_msg,
                                    bool   add_ass,
                                    char * buf,
                                 int32_t   length) {
    std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
    if (tmpl == nullptr) {
        GGML_ASSERT(model != nullptr);
        // load template from model
        std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
        std::string template_key = "tokenizer.chat_template";
        int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), curr_tmpl.size());
        if (res < 0) {
            // worst case: there is no information about template, we will use chatml by default
            curr_tmpl = "<|im_start|>"; // see llama_chat_apply_template_internal
        } else {
            curr_tmpl = std::string(model_template.data(), model_template.size());
        }
    }
    // format the chat to string
    std::vector<const llama_chat_message *> chat_vec;
    chat_vec.resize(n_msg);
    for (size_t i = 0; i < n_msg; i++) {
        chat_vec[i] = &chat[i];
    }
    std::string formatted_chat;
    int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
    if (res < 0) {
        return res;
    }
    strncpy(buf, formatted_chat.c_str(), length);
    return res;
 }
 struct llama_timings llama_get_timings(struct llama_context * ctx) {
    struct llama_timings result = {
        /*.t_start_ms  =*/ 1e-3 * ctx->t_start_us,
--- a/llama.h
+++ b/llama.h
@ -100,6 +100,7 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_Q2_K_S        = 21, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q3_K_XS       = 22, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ3_XXS       = 23, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ1_S         = 24, // except 1d tensors
        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
    };
@ -304,6 +305,12 @@ extern "C" {
        int32_t n_eval;
    };
    // used in chat template
    typedef struct llama_chat_message {
        const char * role;
        const char * content;
    } llama_chat_message;
    // Helpers for getting default parameters
    LLAMA_API struct llama_model_params llama_model_default_params(void);
    LLAMA_API struct llama_context_params llama_context_default_params(void);
@ -700,6 +707,25 @@ extern "C" {
                                  char * buf,
                               int32_t   length);
    /// Apply chat template. Inspired by hf apply_chat_template() on python.
    /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
    /// NOTE: This function only support some known jinja templates. It is not a jinja parser.
    /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
    /// @param chat Pointer to a list of multiple llama_chat_message
    /// @param n_msg Number of llama_chat_message in this chat
    /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
    /// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)
    /// @param length The size of the allocated buffer
    /// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template.
    LLAMA_API int32_t llama_chat_apply_template(
              const struct llama_model * model,
                            const char * tmpl,
       const struct llama_chat_message * chat,
                                size_t   n_msg,
                                  bool   add_ass,
                                  char * buf,
                               int32_t   length);
    //
    // Grammar
    //
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@ -0,0 +1,64 @@
 #include <iostream>
 #include <string>
 #include <vector>
 #include <sstream>
 #undef NDEBUG
 #include <cassert>
 #include "llama.h"
 int main(void) {
    llama_chat_message conversation[] = {
        {"system", "You are a helpful assistant"},
        {"user", "Hello"},
        {"assistant", "Hi there"},
        {"user", "Who are you"},
        {"assistant", "   I am an assistant   "},
        {"user", "Another question"},
    };
    size_t message_count = 6;
    std::vector<std::string> templates = {
        // teknium/OpenHermes-2.5-Mistral-7B
        "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
        // mistralai/Mistral-7B-Instruct-v0.2
        "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
        // TheBloke/FusionNet_34Bx2_MoE-AWQ
        "{%- for idx in range(0, messages|length) -%}\\n{%- if messages[idx]['role'] == 'user' -%}\\n{%- if idx > 1 -%}\\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\\n{%- else -%}\\n{{- messages[idx]['content'] + ' [/INST]' -}}\\n{%- endif -%}\\n{% elif messages[idx]['role'] == 'system' %}\\n{{- '[INST] <<SYS>>\\\\n' + messages[idx]['content'] + '\\\\n<</SYS>>\\\\n\\\\n' -}}\\n{%- elif messages[idx]['role'] == 'assistant' -%}\\n{{- ' '  + messages[idx]['content'] + ' ' + eos_token -}}\\n{% endif %}\\n{% endfor %}",
        // bofenghuang/vigogne-2-70b-chat
        "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\\\n' + system_message + '\\\\n<</SYS>>\\\\n\\\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\\\n' + content.strip() + '\\\\n<</SYS>>\\\\n\\\\n' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
    };
    std::vector<std::string> expected_substr = {
        "<|im_start|>assistant\n   I am an assistant   <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant",
        "[/INST]Hi there</s>[INST] Who are you [/INST]   I am an assistant   </s>[INST] Another question [/INST]",
        "</s><s>[INST] Who are you [/INST]    I am an assistant    </s><s>[INST] Another question [/INST]",
        "[/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
    };
    std::vector<char> formatted_chat(1024);
    int32_t res;
    // test invalid chat template
    res = llama_chat_apply_template(nullptr, "INVALID TEMPLATE", conversation, message_count, true, formatted_chat.data(), formatted_chat.size());
    assert(res < 0);
    for (size_t i = 0; i < templates.size(); i++) {
        std::string custom_template = templates[i];
        std::string substr = expected_substr[i];
        formatted_chat.resize(1024);
        res = llama_chat_apply_template(
            nullptr,
            custom_template.c_str(),
            conversation,
            message_count,
            true,
            formatted_chat.data(),
            formatted_chat.size()
        );
        formatted_chat.resize(res);
        std::string output(formatted_chat.data(), formatted_chat.size());
        std::cout << output << "\n-------------------------\n";
        // expect the "formatted_chat" to contain pre-defined strings
        assert(output.find(substr) != std::string::npos);
    }
    return 0;
 }