Mirror of https://github.com/LostRuins/koboldcpp.git

Commit ea55f69dc1: Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.dockerignore
#	.github/workflows/build.yml
#	.github/workflows/docker.yml
#	Makefile
#	README.md
#	examples/infill/infill.cpp
#	examples/perplexity/perplexity.cpp
#	examples/server/README.md
#	examples/speculative/speculative.cpp
#	flake.lock
#	ggml/src/CMakeLists.txt
#	scripts/sync-ggml.last
#	tests/test-backend-ops.cpp
#	tests/test-sampling.cpp

39 changed files with 2587 additions and 1564 deletions
@@ -692,7 +692,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params) {
             params.ctx_shift = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
     add_opt(llama_arg(
         {"--chunks"}, "N",
         format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -1103,7 +1103,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
             else { throw std::invalid_argument("invalid value"); }
         }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
     add_opt(llama_arg(
         {"--attention"}, "{causal,non-causal}",
         "attention type for embeddings, use model default if unspecified",
@@ -1122,77 +1122,77 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
             else { throw std::invalid_argument("invalid value"); }
         }
-    ));
+    ).set_env("LLAMA_ARG_ROPE_SCALING_TYPE"));
     add_opt(llama_arg(
         {"--rope-scale"}, "N",
         "RoPE context scaling factor, expands context by a factor of N",
         [](gpt_params & params, const std::string & value) {
             params.rope_freq_scale = 1.0f / std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_ROPE_SCALE"));
     add_opt(llama_arg(
         {"--rope-freq-base"}, "N",
         "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)",
         [](gpt_params & params, const std::string & value) {
             params.rope_freq_base = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_ROPE_FREQ_BASE"));
     add_opt(llama_arg(
         {"--rope-freq-scale"}, "N",
         "RoPE frequency scaling factor, expands context by a factor of 1/N",
         [](gpt_params & params, const std::string & value) {
             params.rope_freq_scale = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE"));
     add_opt(llama_arg(
         {"--yarn-orig-ctx"}, "N",
         format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
         [](gpt_params & params, int value) {
             params.yarn_orig_ctx = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_ORIG_CTX"));
     add_opt(llama_arg(
         {"--yarn-ext-factor"}, "N",
         format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
         [](gpt_params & params, const std::string & value) {
             params.yarn_ext_factor = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_EXT_FACTOR"));
     add_opt(llama_arg(
         {"--yarn-attn-factor"}, "N",
         format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
         [](gpt_params & params, const std::string & value) {
             params.yarn_attn_factor = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR"));
     add_opt(llama_arg(
         {"--yarn-beta-slow"}, "N",
         format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
         [](gpt_params & params, const std::string & value) {
             params.yarn_beta_slow = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_BETA_SLOW"));
     add_opt(llama_arg(
         {"--yarn-beta-fast"}, "N",
         format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast),
         [](gpt_params & params, const std::string & value) {
             params.yarn_beta_fast = std::stof(value);
         }
-    ));
+    ).set_env("LLAMA_ARG_YARN_BETA_FAST"));
     add_opt(llama_arg(
         {"-gan", "--grp-attn-n"}, "N",
         format("group-attention factor (default: %d)", params.grp_attn_n),
         [](gpt_params & params, int value) {
             params.grp_attn_n = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_GRP_ATTN_N"));
     add_opt(llama_arg(
         {"-gaw", "--grp-attn-w"}, "N",
         format("group-attention width (default: %.1f)", (double)params.grp_attn_w),
         [](gpt_params & params, int value) {
             params.grp_attn_w = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_GRP_ATTN_W"));
     add_opt(llama_arg(
         {"-dkvc", "--dump-kv-cache"},
         "verbose print of the KV cache",
@@ -1206,7 +1206,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params) {
             params.no_kv_offload = true;
         }
-    ));
+    ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
     add_opt(llama_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
         format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
@@ -1214,7 +1214,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             // TODO: get the type right here
             params.cache_type_k = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
     add_opt(llama_arg(
         {"-ctv", "--cache-type-v"}, "TYPE",
         format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
@@ -1222,7 +1222,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             // TODO: get the type right here
             params.cache_type_v = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
     add_opt(llama_arg(
         {"--perplexity", "--all-logits"},
         format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"),
@@ -1356,7 +1356,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params, const std::string & value) {
             params.rpc_servers = value;
         }
-    ));
+    ).set_env("LLAMA_ARG_RPC"));
 #endif
     add_opt(llama_arg(
         {"--mlock"},
@@ -1364,14 +1364,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params) {
             params.use_mlock = true;
         }
-    ));
+    ).set_env("LLAMA_ARG_MLOCK"));
     add_opt(llama_arg(
         {"--no-mmap"},
         "do not memory-map model (slower load but may reduce pageouts if not using mlock)",
         [](gpt_params & params) {
             params.use_mmap = false;
         }
-    ));
+    ).set_env("LLAMA_ARG_NO_MMAP"));
     add_opt(llama_arg(
         {"--numa"}, "TYPE",
         "attempt optimizations that help on some NUMA systems\n"
@@ -1386,7 +1386,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
             else { throw std::invalid_argument("invalid value"); }
         }
-    ));
+    ).set_env("LLAMA_ARG_NUMA"));
     add_opt(llama_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
@@ -1434,7 +1434,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
                 fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n");
             }
         }
-    ));
+    ).set_env("LLAMA_ARG_SPLIT_MODE"));
     add_opt(llama_arg(
         {"-ts", "--tensor-split"}, "N0,N1,N2,...",
         "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1",
@@ -1461,7 +1461,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
                 fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n");
             }
         }
-    ));
+    ).set_env("LLAMA_ARG_TENSOR_SPLIT"));
     add_opt(llama_arg(
         {"-mg", "--main-gpu"}, "INDEX",
         format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
@@ -1471,7 +1471,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
                 fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n");
             }
         }
-    ));
+    ).set_env("LLAMA_ARG_MAIN_GPU"));
     add_opt(llama_arg(
         {"--check-tensors"},
         format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
@@ -1534,7 +1534,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params, const std::string & value) {
             params.model_alias = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
     add_opt(llama_arg(
         {"-m", "--model"}, "FNAME",
         ex == LLAMA_EXAMPLE_EXPORT_LORA
@@ -1742,7 +1742,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params, const std::string & value) {
             params.public_path = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
     add_opt(llama_arg(
         {"--embedding", "--embeddings"},
         format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
@@ -1780,14 +1780,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params, const std::string & value) {
             params.ssl_file_key = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE"));
     add_opt(llama_arg(
         {"--ssl-cert-file"}, "FNAME",
         "path to file a PEM-encoded SSL certificate",
         [](gpt_params & params, const std::string & value) {
             params.ssl_file_cert = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
     add_opt(llama_arg(
         {"-to", "--timeout"}, "N",
         format("server read/write timeout in seconds (default: %d)", params.timeout_read),
@@ -1795,7 +1795,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.timeout_read  = value;
             params.timeout_write = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT"));
     add_opt(llama_arg(
         {"--threads-http"}, "N",
         format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
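Note: the hunks above register environment-variable aliases (via .set_env(...)) for existing CLI flags, so options like LLAMA_ARG_NO_CONTEXT_SHIFT or LLAMA_ARG_ALIAS can be supplied through the environment instead of the command line. As a rough illustration of the flag-or-env fallback idea only — not the actual llama_arg implementation, and with a hypothetical helper name — a minimal self-contained C++ sketch:

#include <cstdlib>
#include <string>

// hypothetical helper: prefer an explicit CLI value, otherwise fall back to the
// named environment variable, otherwise report "not set" so the default is kept
static bool value_from_cli_or_env(const std::string & cli_value, const char * env_name, std::string & out) {
    if (!cli_value.empty()) {
        out = cli_value;
        return true;
    }
    if (const char * env = std::getenv(env_name)) {
        out = env;
        return true;
    }
    return false;
}

The exact lookup semantics live inside the llama_arg class in this commit; the sketch only shows the precedence (flag over environment over default) that the registrations rely on.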
@@ -82,7 +82,7 @@ struct gpt_log_entry {
             }
         }

-        if (level != GGML_LOG_LEVEL_NONE && prefix) {
+        if (level != GGML_LOG_LEVEL_NONE && level != GGML_LOG_LEVEL_CONT && prefix) {
             if (timestamp) {
                 // [M.s.ms.us]
                 fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
@@ -83,8 +83,10 @@ void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // w
 #define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, 0, __VA_ARGS__)
 #define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0, __VA_ARGS__)
 #define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__)
+#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, 0, __VA_ARGS__)

 #define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO, verbosity, __VA_ARGS__)
 #define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN, verbosity, __VA_ARGS__)
 #define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__)
 #define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
+#define LOG_CNTV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_CONT, verbosity, __VA_ARGS__)
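Note: LOG_CNT maps to the new GGML_LOG_LEVEL_CONT level, which the log writer (previous hunk) treats as a continuation, so the text is appended without a fresh timestamp/level prefix. An illustrative fragment only — n_keep and pieces are hypothetical placeholders; the real usage appears in the main.cpp hunks further down:

LOG_INF("%s: static prompt based on n_keep: '", __func__); // starts the line with the usual prefix
for (int i = 0; i < n_keep; i++) {
    LOG_CNT("%s", pieces[i].c_str());                       // appended to the same logical line
}
LOG_CNT("'\n");                                             // closes that line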
@@ -209,7 +209,15 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
                 GGML_ASSERT(false && "unknown mirostat version");
         }
     } else {
-        llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
+        if (params.n_probs > 0) {
+            // some use cases require to sample greedily, but still obtain the probabilities of the top tokens
+            // ref: https://github.com/ggerganov/llama.cpp/pull/9605
+            //
+            // the following will not produce exactly the same probs as applyging softmax to the full vocabulary, but
+            // it is much faster, since we avoid sorting all tokens and should give a good approximation
+            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k(params.n_probs));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_softmax());
+        }
         llama_sampler_chain_add(result->chain, llama_sampler_init_greedy());
     }

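Note: the comment in the hunk above says the top-k + softmax pair only approximates a softmax over the full vocabulary. The self-contained sketch below (not llama.cpp code) shows why: renormalizing over only the k largest logits uses a smaller denominator, so the reported probabilities come out slightly larger, while the ranking of the surviving tokens is unchanged.

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    // toy "vocabulary" of logits, already sorted in descending order
    std::vector<float> logits = {5.0f, 4.0f, 3.0f, 0.5f, 0.1f};
    const size_t k = 2;

    // sum of exp(logit) over the first n entries
    auto exp_sum = [&](size_t n) {
        double s = 0.0;
        for (size_t i = 0; i < n; i++) s += std::exp(logits[i]);
        return s;
    };

    printf("p(best) over the full vocabulary: %.4f\n", std::exp(logits[0]) / exp_sum(logits.size()));
    printf("p(best) over the top-%zu only   : %.4f\n", k, std::exp(logits[0]) / exp_sum(k));
    return 0;
}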
@@ -4102,16 +4102,45 @@ class GraniteModel(LlamaModel):
         # consistency
         if attention_scale := self.hparams.get("attention_multiplier"):
             self.gguf_writer.add_attention_scale(attention_scale)
+            logger.info("gguf: (granite) attention_scale = %s", attention_scale)
         if embedding_scale := self.hparams.get("embedding_multiplier"):
             self.gguf_writer.add_embedding_scale(embedding_scale)
+            logger.info("gguf: (granite) embedding_scale = %s", embedding_scale)
         if residual_scale := self.hparams.get("residual_multiplier"):
             self.gguf_writer.add_residual_scale(residual_scale)
-        if logits_scaling := self.hparams.get("logits_scaling"):
-            self.gguf_writer.add_logit_scale(logits_scaling)
+            logger.info("gguf: (granite) residual_scale = %s", residual_scale)
+        if logits_scale := self.hparams.get("logits_scaling"):
+            self.gguf_writer.add_logit_scale(logits_scale)
+            logger.info("gguf: (granite) logits_scale = %s", logits_scale)
+
+
+@Model.register("GraniteMoeForCausalLM")
+class GraniteMoeModel(GraniteModel):
+    """Conversion for IBM's GraniteMoeForCausalLM"""
+    model_arch = gguf.MODEL_ARCH.GRANITE_MOE
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        """In modeling_granitemoe, the JetMoe implementation of parallel experts
+        is used. This essentially merges w1 and w3 into a single tensor with 2x
+        the hidden size that is then split during forward. To keep compatibility
+        with existing mixtral support, we pull them apart here.
+        """
+
+        if name.endswith("block_sparse_moe.input_linear.weight"):
+            ffn_dim = self.hparams["intermediate_size"]
+            assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
+            gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :]
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
+            ]
+
+        return super().modify_tensors(data_torch, name, bid)
+

 ###### CONVERSION LOGIC ######

 # tree of lazy tensors
 class LazyTorchTensor(gguf.LazyBase):
     _tensor_type = torch.Tensor
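Note: the GraniteMoeModel docstring above describes pulling a merged gate/up (w1/w3) projection apart along the row dimension. A small C++ sketch of that split only, using a hypothetical row-major matrix type rather than the converter's tensors:

#include <cstddef>
#include <utility>
#include <vector>

struct Matrix {
    size_t rows, cols;
    std::vector<float> data; // row-major storage
};

// rows [0, ffn_dim) hold the gate (w1) projection,
// rows [ffn_dim, 2*ffn_dim) hold the up (w3) projection
static std::pair<Matrix, Matrix> split_gate_up(const Matrix & merged, size_t ffn_dim) {
    Matrix gate{ffn_dim, merged.cols, {}};
    Matrix up  {ffn_dim, merged.cols, {}};
    gate.data.assign(merged.data.begin(), merged.data.begin() + ffn_dim * merged.cols);
    up.data.assign(merged.data.begin() + ffn_dim * merged.cols,
                   merged.data.begin() + 2 * ffn_dim * merged.cols);
    return {gate, up};
}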
@@ -6,42 +6,73 @@

 // Export usage message (-h) to markdown format

+static void write_table_header(std::ofstream & file) {
+    file << "| Argument | Explanation |\n";
+    file << "| -------- | ----------- |\n";
+}
+
+static void write_table_entry(std::ofstream & file, const llama_arg & opt) {
+    file << "| `";
+    // args
+    for (const auto & arg : opt.args) {
+        if (arg == opt.args.front()) {
+            file << arg;
+            if (opt.args.size() > 1) file << ", ";
+        } else {
+            file << arg << (arg != opt.args.back() ? ", " : "");
+        }
+    }
+    // value hint
+    if (opt.value_hint) {
+        std::string md_value_hint(opt.value_hint);
+        string_replace_all(md_value_hint, "|", "\\|");
+        file << " " << md_value_hint;
+    }
+    if (opt.value_hint_2) {
+        std::string md_value_hint_2(opt.value_hint_2);
+        string_replace_all(md_value_hint_2, "|", "\\|");
+        file << " " << md_value_hint_2;
+    }
+    // help text
+    std::string md_help(opt.help);
+    string_replace_all(md_help, "\n", "<br/>");
+    string_replace_all(md_help, "|", "\\|");
+    file << "` | " << md_help << " |\n";
+}
+
+static void write_table(std::ofstream & file, std::vector<llama_arg *> & opts) {
+    write_table_header(file);
+    for (const auto & opt : opts) {
+        write_table_entry(file, *opt);
+    }
+}
+
 static void export_md(std::string fname, llama_example ex) {
     std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);

     gpt_params params;
     auto ctx_arg = gpt_params_parser_init(params, ex);

-    file << "| Argument | Explanation |\n";
-    file << "| -------- | ----------- |\n";
+    std::vector<llama_arg *> common_options;
+    std::vector<llama_arg *> sparam_options;
+    std::vector<llama_arg *> specific_options;
     for (auto & opt : ctx_arg.options) {
-        file << "| `";
-        // args
-        for (const auto & arg : opt.args) {
-            if (arg == opt.args.front()) {
-                file << arg;
-                if (opt.args.size() > 1) file << ", ";
-            } else {
-                file << arg << (arg != opt.args.back() ? ", " : "");
-            }
-        }
-        // value hint
-        if (opt.value_hint) {
-            std::string md_value_hint(opt.value_hint);
-            string_replace_all(md_value_hint, "|", "\\|");
-            file << " " << md_value_hint;
-        }
-        if (opt.value_hint_2) {
-            std::string md_value_hint_2(opt.value_hint_2);
-            string_replace_all(md_value_hint_2, "|", "\\|");
-            file << " " << md_value_hint_2;
-        }
-        // help text
-        std::string md_help(opt.help);
-        string_replace_all(md_help, "\n", "<br/>");
-        string_replace_all(md_help, "|", "\\|");
-        file << "` | " << md_help << " |\n";
+        // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
+        if (opt.is_sparam) {
+            sparam_options.push_back(&opt);
+        } else if (opt.in_example(ctx_arg.ex)) {
+            specific_options.push_back(&opt);
+        } else {
+            common_options.push_back(&opt);
+        }
     }
+
+    file << "**Common params**\n\n";
+    write_table(file, common_options);
+    file << "\n\n**Sampling params**\n\n";
+    write_table(file, sparam_options);
+    file << "\n\n**Example-specific params**\n\n";
+    write_table(file, specific_options);
 }

 int main(int, char **) {
@@ -386,9 +386,9 @@ int main(int argc, char ** argv) {
         if (params.n_keep > add_bos) {
             LOG_INF("%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
-                LOG("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+                LOG_CNT("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
             }
-            LOG("'\n");
+            LOG_CNT("'\n");
         }
         LOG_INF("\n");
     }
@@ -410,40 +410,40 @@ int main(int argc, char ** argv) {
     }

     if (params.interactive) {
-        LOG("%s: interactive mode on.\n", __func__);
+        LOG_INF("%s: interactive mode on.\n", __func__);

         if (!params.antiprompt.empty()) {
             for (const auto & antiprompt : params.antiprompt) {
-                LOG("Reverse prompt: '%s'\n", antiprompt.c_str());
+                LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str());
                 if (params.verbose_prompt) {
                     auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
                     for (int i = 0; i < (int) tmp.size(); i++) {
-                        LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                        LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                     }
                 }
             }
         }

         if (params.input_prefix_bos) {
-            LOG("Input prefix with BOS\n");
+            LOG_INF("Input prefix with BOS\n");
         }

         if (!params.input_prefix.empty()) {
-            LOG("Input prefix: '%s'\n", params.input_prefix.c_str());
+            LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
             if (params.verbose_prompt) {
                 auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
                 for (int i = 0; i < (int) tmp.size(); i++) {
-                    LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                    LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                 }
             }
         }

         if (!params.input_suffix.empty()) {
-            LOG("Input suffix: '%s'\n", params.input_suffix.c_str());
+            LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
             if (params.verbose_prompt) {
                 auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
                 for (int i = 0; i < (int) tmp.size(); i++) {
-                    LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                    LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                 }
             }
         }
@@ -475,7 +475,7 @@ int main(int argc, char ** argv) {
         //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
         LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
     }
-    LOG("\n");
+    LOG_INF("\n");

     if (params.interactive) {
         const char * control_message;
@@ -487,11 +487,11 @@ int main(int argc, char ** argv) {
                 " - To return control without starting a new line, end your input with '/'.\n"
                 " - If you want to submit another line, end your input with '\\'.\n";
         }
-        LOG("== Running in interactive mode. ==\n");
+        LOG_INF("== Running in interactive mode. ==\n");
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-        LOG( " - Press Ctrl+C to interject at any time.\n");
+        LOG_INF( " - Press Ctrl+C to interject at any time.\n");
 #endif
-        LOG( "%s\n", control_message);
+        LOG_INF( "%s\n", control_message);

         is_interacting = params.interactive_first;
     }
@@ -1181,6 +1181,15 @@ struct server_context {
                 SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict);
             }

+            // if context shift is disabled, we stop when it reaches the context limit
+            if (slot.n_decoded >= slot.n_ctx) {
+                slot.truncated = true;
+                slot.stopped_limit = true;
+                slot.has_next_token = false;
+
+                SLT_DBG(slot, "stopped due to running out of context capacity, n_decoded = %d, n_ctx = %d\n", slot.n_decoded, slot.n_ctx);
+            }
+
             if (llama_token_is_eog(model, result.tok)) {
                 slot.stopped_eos = true;
                 slot.has_next_token = false;
@@ -1481,7 +1490,7 @@ struct server_context {
             if (result.error) {
                 error_handler(result.data);
                 cancel_tasks(id_tasks);
-                break;
+                return;
             }

             size_t idx = result.data["index"];
@@ -1828,6 +1837,14 @@ struct server_context {
         for (server_slot & slot : slots) {
             if (slot.ga_n == 1) {
                 if (slot.is_processing() && (int) system_tokens.size() + slot.n_past >= slot.n_ctx - 1) {
+                    if (!params.ctx_shift) {
+                        // this check is redundant (for good)
+                        // we should never get here, because generation should already stopped in process_token()
+                        slot.release();
+                        send_error(slot, "context shift is disabled", ERROR_TYPE_SERVER);
+                        continue;
+                    }
+
                     // Shift context
                     const int n_keep = slot.params.n_keep + add_bos_token;
                     const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
@@ -1962,6 +1979,14 @@ struct server_context {
                         continue;
                     }
                 } else {
+                    if (!params.ctx_shift) {
+                        // if context shift is disabled, we make sure prompt size is smaller than KV size
+                        if ((int) system_tokens.size() + slot.n_prompt_tokens >= slot.n_ctx) {
+                            slot.release();
+                            send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST);
+                            continue;
+                        }
+                    }
                     if (slot.params.n_keep < 0) {
                         slot.params.n_keep = slot.n_prompt_tokens;
                     }
@@ -2332,6 +2357,10 @@ int main(int argc, char ** argv) {
         svr.reset(new httplib::Server());
     }
 #else
+    if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
+        LOG_ERR("Server is built without SSL support\n");
+        return 1;
+    }
     svr.reset(new httplib::Server());
 #endif

@@ -3155,7 +3184,7 @@ int main(int argc, char ** argv) {
     }

     // print sample chat example to make it clear which template is used
-    LOG_INF("%s: chat template, built_in: %d, chat_example: '%s\n'", __func__, params.chat_template.empty(), llama_chat_format_example(ctx_server.model, params.chat_template).c_str());
+    LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), llama_chat_format_example(ctx_server.model, params.chat_template).c_str());

     ctx_server.queue_tasks.on_new_task(std::bind(
         &server_context::process_single_task, &ctx_server, std::placeholders::_1));
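Note: the "Shift context" branch above keeps the first n_keep tokens and discards a chunk of what follows so generation can continue when the slot runs out of room. A self-contained sketch of the index arithmetic on a plain token vector (illustrative only; the real code operates on the KV cache and, by default, discards half of the shiftable region):

#include <cstdio>
#include <vector>

int main() {
    std::vector<int> cache(32);                       // stand-in for a full context window
    for (int i = 0; i < (int) cache.size(); i++) cache[i] = i;

    const int n_keep    = 4;                          // tokens that must survive (prompt prefix + BOS)
    const int n_left    = (int) cache.size() - n_keep;
    const int n_discard = n_left / 2;                 // default: drop half of the shiftable part

    // remove [n_keep, n_keep + n_discard); everything after slides down, freeing room for new tokens
    cache.erase(cache.begin() + n_keep, cache.begin() + n_keep + n_discard);

    printf("kept %zu of 32 cells; the first token after the gap is %d\n", cache.size(), cache[n_keep]);
    return 0;
}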
examples/server/tests/features/ctx_shift.feature (new file, 62 lines)
@@ -0,0 +1,62 @@
+@llama.cpp
+@ctx_shift
+Feature: llama.cpp server
+
+  Background: Server startup
+    Given a server listening on localhost:8080
+    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And   a model file test-model.gguf
+    And   a model alias tinyllama-2
+    And   BOS token is 1
+    And   42 as server seed
+    And   256 KV cache size
+    And   32 as batch size
+    And   2 slots
+
+  Scenario: Inference with context shift
+    And   64 server max tokens to predict
+    Then  the server is starting
+    Then  the server is healthy
+    Given a prompt:
+    """
+    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
+    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+    """
+    And   a completion request with no api error
+    Then  64 tokens are predicted matching fun|Annaks|popcorns|pictry|bowl
+    And   the completion is truncated
+    And   109 prompt tokens are processed
+
+  Scenario Outline: Inference without context shift
+    And   <n_predict> server max tokens to predict
+    And   disable context shifting
+    Then  the server is starting
+    Then  the server is healthy
+    Given a prompt:
+    """
+    Hi how are you
+    """
+    And   a completion request with no api error
+    Then  <n_token_output> tokens are predicted matching twind|Anna
+    And   the completion is <truncated> truncated
+    And   8 prompt tokens are processed
+    Examples:
+      | n_predict | n_token_output | truncated |
+      | 64        | 64             | not       |
+      | -1        | 120            |           |
+
+  Scenario: Inference without context shift (expected error: prompt too long)
+    And   disable context shifting
+    Then  the server is starting
+    Then  the server is healthy
+    Given a prompt:
+    """
+    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
+    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+    """
+    And   a completion request with 400 api error
@@ -10,11 +10,11 @@ Feature: llama.cpp server
     And   42 as server seed
     And   2 slots
     # the bert-bge-small model has context size of 512
-    # since the generated prompts are as big as the batch size, we need to set the batch size to 512
+    # since the generated prompts are as big as the batch size, we need to set the batch size to <= 512
     # ref: https://huggingface.co/BAAI/bge-small-en-v1.5/blob/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/config.json#L20
-    And   512 as batch size
-    And   512 as ubatch size
-    And   2048 KV cache size
+    And   128 as batch size
+    And   128 as ubatch size
+    And   512 KV cache size
     And   embeddings extraction
     Then  the server is starting
     Then  the server is healthy
@@ -26,6 +26,20 @@ Feature: llama.cpp server
     """
     Then embeddings are generated

+  Scenario: Embedding (error: prompt too long)
+    When embeddings are computed for:
+    """
+    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
+    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
+    Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+    Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
+    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+    """
+    And embeddings request with 500 api error
+
   Scenario: OAI Embeddings compatibility
     Given a model bert-bge-small
     When an OAI compatible embeddings computation request for:
@@ -77,6 +77,7 @@ def step_server_config(context, server_fqdn: str, server_port: str):
     context.response_format = None
     context.temperature = None
     context.lora_file = None
+    context.disable_ctx_shift = False

     context.tasks_result = []
     context.concurrent_tasks = []
@@ -148,7 +149,7 @@ def step_n_slots(context, n_slots: int):

 @step('{n_predict:d} server max tokens to predict')
 def step_server_n_predict(context, n_predict: int):
-    context.n_server_predict = n_predict
+    context.n_server_predict = n_predict if n_predict > 0 else None


 @step('{slot_save_path} as slot save path')
@@ -180,6 +181,9 @@ def step_server_embeddings(context):
 def step_server_metrics(context):
     context.server_metrics = True

+@step('disable context shifting')
+def step_server_disable_ctx_shift(context):
+    context.disable_ctx_shift = True

 @step("the server is starting")
 def step_start_server(context):
@@ -257,7 +261,7 @@ async def step_all_slots_status(context, expected_slot_status_string: Literal['i
 @step('a completion request with {api_error} api error')
 @async_run_until_complete
 async def step_request_completion(context, api_error: Literal['raised'] | str):
-    expect_api_error = api_error == 'raised'
+    expect_api_error = api_error == 'raised' or api_error != 'no'
     seeds = await completions_seed(context, num_seeds=1)
     completion = await request_completion(context.prompts.pop(),
                                           seeds[0] if seeds is not None else seeds,
@@ -272,8 +276,11 @@ async def step_request_completion(context, api_error: Literal['raised'] | str):
     context.tasks_result.append(completion)
     if context.debug:
         print(f"Completion response: {completion}")
-    if expect_api_error:
+    if api_error == 'raised':
         assert completion == 401, f"completion must be an 401 status code: {completion}"
+    elif api_error.isdigit():
+        api_error_code = int(api_error)
+        assert completion == api_error_code, f"completion must be an {api_error_code} status code: {completion}"


 @step('{predicted_n:d} tokens are predicted matching {re_content}')
@@ -645,6 +652,9 @@ def step_assert_embeddings(context):
     for embedding in context.embeddings:
         assert_embeddings(embedding)

+@step('embeddings request with {api_error_code:d} api error')
+def step_assert_embeddings(context, api_error_code: int):
+    assert context.embeddings == api_error_code, f"embeddings request must return code {api_error_code}, but got {context.embeddings}"

 @step('an OAI compatible embeddings computation request for')
 @async_run_until_complete
@@ -1089,15 +1099,17 @@ async def oai_chat_completions(user_prompt,
     return completion_response


-async def request_embedding(content, seed, base_url=None) -> list[list[float]]:
+async def request_embedding(content, seed, base_url=None) -> list[list[float]] | int:
     async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
         async with session.post(f'{base_url}/embedding',
                                 json={
                                     "content": content,
                                 }) as response:
-            assert response.status == 200
-            response_json = await response.json()
-            return [response_json['embedding']]
+            if response.status == 200:
+                response_json = await response.json()
+                return [response_json['embedding']]
+            else:
+                return response.status


 async def request_oai_embeddings(input, seed,
@@ -1372,6 +1384,8 @@ def start_server_background(context):
         server_args.append('--verbose')
     if context.lora_file:
         server_args.extend(['--lora', context.lora_file])
+    if context.disable_ctx_shift:
+        server_args.extend(['--no-context-shift'])

     args = [str(arg) for arg in [context.server_path, *server_args]]
     print(f"bench: starting server with: {' '.join(args)}")
@@ -576,6 +576,7 @@ extern "C" {
         GGML_LOG_LEVEL_WARN = 2,
         GGML_LOG_LEVEL_ERROR = 3,
         GGML_LOG_LEVEL_DEBUG = 4,
+        GGML_LOG_LEVEL_CONT = 5, // continue previous log
     };

     // this tensor...
@@ -1985,6 +1986,9 @@ extern "C" {
     typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
     typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);

+    #define GGML_N_TASKS_MAX (-1)
+    // n_tasks == GGML_N_TASKS_MAX means to use max number of tasks
+
     GGML_API struct ggml_tensor * ggml_map_custom1(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
(one file diff suppressed because it is too large)
@@ -294,6 +294,12 @@ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
     alloc->free_blocks[0].offset = 0;
     alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
     alloc->max_size = 0;
+
+#ifdef GGML_ALLOCATOR_DEBUG
+    for (int i = 0; i < 1024; i++) {
+        alloc->allocated_tensors[i].tensor = NULL;
+    }
+#endif
 }

 static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
@@ -227,6 +227,7 @@ struct ggml_backend_cann_context {
      * @brief Destructor for cleaning up resources.
      */
     ~ggml_backend_cann_context() {
+        ggml_cann_set_device(device);
         if (copy_event != nullptr) {
             ACL_CHECK(aclrtDestroyEvent(copy_event));
         }
@@ -36,6 +36,7 @@ bool g_mul_mat_q = false;
 #include "ggml-cuda/tsembd.cuh"
 #include "ggml-cuda/unary.cuh"
 #include "ggml-cuda/upscale.cuh"
+#include "ggml-cuda/rwkv-wkv.cuh"

 #include <algorithm>
 #include <array>
@@ -137,7 +138,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
     return res;
 #else

-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+#if !defined(GGML_USE_HIPBLAS)
     cudaError_t err;
     if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
     {
@@ -150,7 +151,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
     return err;
 #else
     return cudaMalloc(ptr, size);
-#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+#endif // !defined(GGML_USE_HIPBLAS)

 #endif
 }
@@ -188,7 +189,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
     for (int id = 0; id < info.device_count; ++id) {
         int device_vmm = 0;

-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
         CUdevice device;
         CU_CHECK(cuDeviceGet(&device, id));
         CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@@ -200,7 +201,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
         alloc_prop.location.id = id;
         CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
         }
-#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
         info.devices[id].vmm = !!device_vmm;

         cudaDeviceProp prop;
@@ -334,7 +335,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
 };

 // pool with virtual memory
-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
 struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
     static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB

@@ -428,14 +429,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
         GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
     }
 };
-#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)

 std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
-#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
     if (ggml_cuda_info().devices[device].vmm) {
         return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
     }
-#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
     return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
 }

@@ -2247,6 +2248,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_UNARY_OP_HARDSWISH:
             ggml_cuda_op_hardswish(ctx, dst);
             break;
+        case GGML_UNARY_OP_EXP:
+            ggml_cuda_op_exp(ctx, dst);
+            break;
         default:
             return false;
     }
@@ -2349,6 +2353,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_CROSS_ENTROPY_LOSS:
             ggml_cuda_cross_entropy_loss(ctx, dst);
             break;
+        case GGML_OP_RWKV_WKV:
+            ggml_cuda_op_rwkv_wkv(ctx, dst);
+            break;
         case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
             ggml_cuda_cross_entropy_loss_back(ctx, dst);
             break;
@@ -2810,6 +2817,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_UNARY_OP_HARDSWISH:
         case GGML_UNARY_OP_GELU_QUICK:
         case GGML_UNARY_OP_TANH:
+        case GGML_UNARY_OP_EXP:
             return ggml_is_contiguous(op->src[0]);
         default:
             return false;
@@ -2826,6 +2834,12 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         if (op->op == GGML_OP_MUL_MAT && a->ne[3] != b->ne[3]) {
             return false;
         }
+#ifdef GGML_USE_MUSA
+        if (b->type == GGML_TYPE_F16 && b->ne[2]*b->ne[3] > 1 &&
+            !ggml_is_transposed(a) && !ggml_is_transposed(b)) {
+            return false;
+        }
+#endif // GGML_USE_MUSA
         switch (a->type) {
             case GGML_TYPE_F32:
             case GGML_TYPE_F16:
@@ -2849,6 +2863,11 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
             case GGML_TYPE_IQ3_XXS:
             case GGML_TYPE_IQ4_NL:
|
||||||
case GGML_TYPE_IQ4_XS:
|
case GGML_TYPE_IQ4_XS:
|
||||||
|
#ifdef GGML_USE_MUSA
|
||||||
|
if (a->type == GGML_TYPE_Q3_K) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
#endif // GGML_USE_MUSA
|
||||||
return true;
|
return true;
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
|
@ -2884,6 +2903,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
if (src0_type == GGML_TYPE_Q8_0 && src1_type == GGML_TYPE_F32) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -2971,20 +2993,24 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||||
case GGML_OP_ARANGE:
|
case GGML_OP_ARANGE:
|
||||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||||
case GGML_OP_LEAKY_RELU:
|
case GGML_OP_LEAKY_RELU:
|
||||||
|
case GGML_OP_RWKV_WKV:
|
||||||
return true;
|
return true;
|
||||||
case GGML_OP_FLASH_ATTN_EXT:
|
case GGML_OP_FLASH_ATTN_EXT: {
|
||||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
#ifndef FLASH_ATTN_AVAILABLE
|
||||||
return (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) || op->src[0]->ne[0] == 128;
|
return false;
|
||||||
#else
|
#endif
|
||||||
if (op->src[0]->ne[0] == 128) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
if (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) {
|
if (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA &&
|
if (op->src[0]->ne[0] == 128) {
|
||||||
op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
|
return true;
|
||||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
}
|
||||||
|
if (op->src[0]->ne[0] == 256 && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
const int cc = ggml_cuda_info().devices[cuda_ctx->device].cc;
|
||||||
|
return cc >= CC_VOLTA && cc < CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
|
||||||
|
}
|
||||||
case GGML_OP_CROSS_ENTROPY_LOSS:
|
case GGML_OP_CROSS_ENTROPY_LOSS:
|
||||||
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
|
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
|
||||||
case GGML_OP_OPT_STEP_ADAMW:
|
case GGML_OP_OPT_STEP_ADAMW:
|
||||||
|
|
|
@@ -50,6 +50,8 @@
 #define CC_RDNA1    (CC_OFFSET_AMD + 1010)
 #define CC_RDNA2    (CC_OFFSET_AMD + 1030)
 #define CC_RDNA3    (CC_OFFSET_AMD + 1100)
+#define CC_QY1      210
+#define CC_QY2      220
 
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 

@@ -134,6 +136,10 @@ typedef float2 dfloat2;
 #define INT8_MMA_AVAILABLE
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
 
+#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
+#define FLASH_ATTN_AVAILABLE
+#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
+
 static constexpr bool fast_fp16_available(const int cc) {
     return cc >= CC_PASCAL && cc != 610;
 }
@@ -81,6 +81,17 @@ static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
     }
 }
 
+static __device__ void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
+    const block_q8_0 * xi = (const block_q8_0 *) cxi;
+    float * dsti = (float *) cdsti;
+
+    const float d = (float)xi->d;
+
+    for (int j = 0; j < QK8_0; j++) {
+        dsti[j] = xi->qs[j] * d;
+    }
+}
+
 static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
     const float * xi = (const float *) cxi;
     block_q4_0 * dsti = (block_q4_0 *) cdsti;

@@ -288,6 +299,32 @@ static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
     cpy_blck(cx + x_offset, cdst + dst_offset);
 }
 
+template <cpy_kernel_t cpy_blck, int qk>
+static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne,
+                                 const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+                                 const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                 const int nb12, const int nb13) {
+    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
+
+    if (i >= ne) {
+        return;
+    }
+
+    const int i03 = i/(ne00 * ne01 * ne02);
+    const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
+    const int i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
+    const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
+    const int x_offset = (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
+
+    const int i13 = i/(ne10 * ne11 * ne12);
+    const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
+    const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
+    const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
+    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
+
+    cpy_blck(cx + x_offset, cdst + dst_offset);
+}
+
 static void ggml_cpy_f16_f32_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,

@@ -329,6 +366,16 @@ static void ggml_cpy_f32_q8_0_cuda(
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 
+static void ggml_cpy_q8_0_f32_cuda(
+    const char * cx, char * cdst, const int ne,
+    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+
+    const int num_blocks = ne;
+    cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1, 0, stream>>>
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+}
+
 static void ggml_cpy_f32_q4_0_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,

@@ -437,6 +484,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
         ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
         ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
+        ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
         ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {

@@ -471,6 +520,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
         return (void*) cpy_f32_f16<cpy_1_f32_f16>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
         return (void*) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>;
+    } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
+        return (void*) cpy_q_f32<cpy_blck_q8_0_f32, QK8_0>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
         return (void*) cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
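For orientation, a standalone sketch of the arithmetic the new Q8_0 to F32 copy path performs per block. The names below are illustrative, not part of the patch, and assume ggml's Q8_0 layout of one scale followed by 32 signed 8-bit quants.

// Illustrative C++ reference for what cpy_blck_q8_0_f32 computes per block:
// dequantization is simply dst[j] = d * qs[j] for each of the 32 values.
#include <cstdint>
#include <vector>

constexpr int QK8_0_REF = 32;             // block size, matches ggml's QK8_0

struct block_q8_0_ref {                   // simplified mirror of ggml's block_q8_0
    float  d;                             // scale (stored as fp16 in ggml)
    int8_t qs[QK8_0_REF];                 // quantized values
};

std::vector<float> dequantize_q8_0_block(const block_q8_0_ref & b) {
    std::vector<float> out(QK8_0_REF);
    for (int j = 0; j < QK8_0_REF; ++j) {
        out[j] = b.d * b.qs[j];           // same arithmetic as the CUDA device function
    }
    return out;
}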
@@ -44,13 +44,17 @@ static __global__ void flash_attn_tile_ext_f32(
         const int ne1,
         const int ne2,
         const int ne3) {
+#ifndef FLASH_ATTN_AVAILABLE
+    NO_DEVICE_CODE;
+    return;
+#endif // FLASH_ATTN_AVAILABLE
     // Skip unused kernel variants for faster compilation:
     if (use_logit_softcap && !(D == 128 || D == 256)) {
         NO_DEVICE_CODE;
         return;
     }
 
-    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
+    // In this kernel Q, K, V are matrices while i, j, k are matrix indices.
 
     const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
     const int ip  =  blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
@@ -314,7 +314,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     }
 
     if (!fast_fp16_available(cc)) {
-        if (Q->ne[1] <= 8) {
+        if (Q->ne[1] <= 8 || Q->ne[0] == 256) {
             ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
         } else {
             ggml_cuda_flash_attn_ext_tile_f32(ctx, dst);
ggml/src/ggml-cuda/rwkv-wkv.cu
Normal file
89
ggml/src/ggml-cuda/rwkv-wkv.cu
Normal file
|
@ -0,0 +1,89 @@
|
||||||
|
#include "common.cuh"
|
||||||
|
#include "rwkv-wkv.cuh"
|
||||||
|
|
||||||
|
static __global__ void rwkv_wkv_f32(const int B, const int T, const int C, const int H, const float * k, const float * v, const float * r, const float * tf, const float * td, const float * s, float * dst) {
|
||||||
|
const int tid = threadIdx.x;
|
||||||
|
const int bid = blockIdx.x;
|
||||||
|
|
||||||
|
const int head_size = CUDA_WKV_BLOCK_SIZE;
|
||||||
|
const int batch_i = bid / H;
|
||||||
|
const int head_i = bid % H;
|
||||||
|
const int state_size = C * head_size;
|
||||||
|
const int n_seq_tokens = T / B;
|
||||||
|
|
||||||
|
float state[head_size];
|
||||||
|
__shared__ float _k[head_size], _r[head_size], _tf[head_size], _td[head_size];
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < head_size; i++) {
|
||||||
|
state[i] = s[batch_i * state_size + head_i * head_size * head_size + i * head_size + tid];
|
||||||
|
}
|
||||||
|
|
||||||
|
__syncthreads();
|
||||||
|
_tf[tid] = tf[head_i * head_size + tid];
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
for (int t = batch_i * n_seq_tokens * C + head_i * head_size + tid; t < (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid; t += C) {
|
||||||
|
__syncthreads();
|
||||||
|
_k[tid] = k[t];
|
||||||
|
_r[tid] = r[t];
|
||||||
|
_td[tid] = td[t];
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
const float _v = v[t];
|
||||||
|
float y = 0;
|
||||||
|
for (int j = 0; j < head_size; j += 4) {
|
||||||
|
const float4& k = (float4&)(_k[j]);
|
||||||
|
const float4& r = (float4&)(_r[j]);
|
||||||
|
const float4& tf = (float4&)(_tf[j]);
|
||||||
|
const float4& td = (float4&)(_td[j]);
|
||||||
|
float4& s = (float4&)(state[j]);
|
||||||
|
float4 kv;
|
||||||
|
|
||||||
|
kv.x = k.x * _v;
|
||||||
|
kv.y = k.y * _v;
|
||||||
|
kv.z = k.z * _v;
|
||||||
|
kv.w = k.w * _v;
|
||||||
|
|
||||||
|
y += r.x * (tf.x * kv.x + s.x);
|
||||||
|
y += r.y * (tf.y * kv.y + s.y);
|
||||||
|
y += r.z * (tf.z * kv.z + s.z);
|
||||||
|
y += r.w * (tf.w * kv.w + s.w);
|
||||||
|
|
||||||
|
s.x = s.x * td.x + kv.x;
|
||||||
|
s.y = s.y * td.y + kv.y;
|
||||||
|
s.z = s.z * td.z + kv.z;
|
||||||
|
s.w = s.w * td.w + kv.w;
|
||||||
|
}
|
||||||
|
dst[t] = y;
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < head_size; i++) {
|
||||||
|
dst[T * C + batch_i * state_size + head_i * head_size * head_size + i * head_size + tid] = state[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_cuda_op_rwkv_wkv(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
|
const float * k_d = (const float *)dst->src[0]->data;
|
||||||
|
const float * v_d = (const float *)dst->src[1]->data;
|
||||||
|
const float * r_d = (const float *)dst->src[2]->data;
|
||||||
|
const float * tf_d = (const float *)dst->src[3]->data;
|
||||||
|
const float * td_d = (const float *)dst->src[4]->data;
|
||||||
|
const float * s_d = (const float *)dst->src[5]->data;
|
||||||
|
|
||||||
|
const int64_t B = dst->src[5]->ne[1];
|
||||||
|
const int64_t T = dst->src[0]->ne[3];
|
||||||
|
const int64_t C = dst->ne[0];
|
||||||
|
const int64_t H = dst->src[0]->ne[2];
|
||||||
|
|
||||||
|
float * dst_d = (float *)dst->data;
|
||||||
|
|
||||||
|
cudaStream_t stream = ctx.stream();
|
||||||
|
|
||||||
|
GGML_ASSERT(dst->src[5]->type == GGML_TYPE_F32);
|
||||||
|
GGML_ASSERT(C % H == 0);
|
||||||
|
GGML_ASSERT(C / H == CUDA_WKV_BLOCK_SIZE);
|
||||||
|
|
||||||
|
rwkv_wkv_f32<<<B * H, C / H, 0, stream>>>(B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d);
|
||||||
|
}
|
ggml/src/ggml-cuda/rwkv-wkv.cuh (new file, 5 lines added)
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_WKV_BLOCK_SIZE 64
+
+void ggml_cuda_op_rwkv_wkv(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
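As a companion to the new files above, a small host-side sketch of the launch geometry the WKV operator assumes: one CUDA block per (batch, head) pair and one thread per channel within a head, with the head size fixed at CUDA_WKV_BLOCK_SIZE (64). The helper below is illustrative only and simply mirrors the assertions and the <<<B * H, C / H>>> launch in ggml_cuda_op_rwkv_wkv.

#include <cassert>
#include <cstdint>

struct wkv_launch_shape {
    int64_t grid_blocks;    // one block per (batch, head) pair
    int64_t block_threads;  // one thread per channel inside a head
};

// B = sequences, C = channels, H = heads; C / H must equal the 64-wide head size.
inline wkv_launch_shape wkv_launch(int64_t B, int64_t C, int64_t H) {
    assert(C % H == 0);
    assert(C / H == 64);    // CUDA_WKV_BLOCK_SIZE in the new header
    return { B * H, C / H };
}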
@@ -95,6 +95,15 @@ static __global__ void hardswish_f32(const float * x, float * dst, const int k)
     dst[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
 }
 
+static __global__ void exp_f32(const float * x, float * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = expf(x[i]);
+}
+
 static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
     if (i >= k) {

@@ -189,6 +198,11 @@ static void hardswish_f32_cuda(const float * x, float * dst, const int k, cudaSt
     hardswish_f32<<<num_blocks, CUDA_HARDSWISH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
+static void exp_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_EXP_BLOCK_SIZE - 1) / CUDA_EXP_BLOCK_SIZE;
+    exp_f32<<<num_blocks, CUDA_EXP_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
 static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
     leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);

@@ -354,6 +368,20 @@ void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     hardswish_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
 }
 
+void ggml_cuda_op_exp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    exp_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+}
+
 void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const float * src0_d = (const float *)src0->data;
@@ -8,6 +8,7 @@
 #define CUDA_RELU_BLOCK_SIZE 256
 #define CUDA_SIGMOID_BLOCK_SIZE 256
 #define CUDA_HARDSIGMOID_BLOCK_SIZE 256
+#define CUDA_EXP_BLOCK_SIZE 256
 #define CUDA_HARDSWISH_BLOCK_SIZE 256
 #define CUDA_SQR_BLOCK_SIZE 256
 #define CUDA_SQRT_BLOCK_SIZE 256

@@ -32,6 +33,8 @@ void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
 void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
+void ggml_cuda_op_exp(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
 void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
 void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
ggml/src/ggml-cuda/vendors/musa.h (vendored, 2 lines added)
@@ -26,6 +26,7 @@
 #define cublasSetStream mublasSetStream
 #define cublasSgemm mublasSgemm
 #define cublasStatus_t mublasStatus_t
+#define cublasOperation_t mublasOperation_t
 #define cublasGetStatusString mublasStatus_to_string
 #define cudaDataType_t musaDataType_t
 #define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer

@@ -56,6 +57,7 @@
 #define cudaLaunchHostFunc musaLaunchHostFunc
 #define cudaMalloc musaMalloc
 #define cudaMallocHost musaMallocHost
+#define cudaMallocManaged musaMallocManaged
 #define cudaMemcpy musaMemcpy
 #define cudaMemcpyAsync musaMemcpyAsync
 #define cudaMemcpyPeerAsync musaMemcpyPeerAsync
@@ -2631,11 +2631,11 @@ kernel void kernel_flash_attn_ext_vec_f16(
     const short iv3 = iq3 / rv3;
 
     // load the queries from shared memory into local memory
-    half4 mq[D4];
+    float4 mq[D4];
 
     for (short ii = 0; ii < D4; ii += NW) {
         short i = ii + tiisg;
-        mq[i] = sq4[i];
+        mq[i] = (float4) sq4[i];
     }
 
     // pointer to the mask

@@ -2661,11 +2661,11 @@ kernel void kernel_flash_attn_ext_vec_f16(
             for (short ii = 0; ii < D4; ii += NW) {
                 const short i = ii + tiisg;
 
-                half4x4 mk;
-                mk[0] = pk4[i + 0*(nb11/8)];
-                mk[1] = pk4[i + 1*(nb11/8)];
-                mk[2] = pk4[i + 2*(nb11/8)];
-                mk[3] = pk4[i + 3*(nb11/8)];
+                float4x4 mk;
+                mk[0] = (float4) pk4[i + 0*(nb11/8)];
+                mk[1] = (float4) pk4[i + 1*(nb11/8)];
+                mk[2] = (float4) pk4[i + 2*(nb11/8)];
+                mk[3] = (float4) pk4[i + 3*(nb11/8)];
 
                 mqk += (float4) (mq[i] * mk);
             }
@@ -3496,8 +3496,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
 
     bool use_mul_mat_vec_q =  ggml_is_quantized(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE
-        && (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda || src1->ne[1] > MMVQ_MIN_BATCH_SIZE);
+        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
 
     bool use_mul_mat_q =  ggml_sycl_supports_mmq(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
@@ -134,7 +134,6 @@ typedef sycl::float2 dfloat2;
 #endif // GGML_SYCL_F16
 
 #define MMVQ_MAX_BATCH_SIZE  8
-#define MMVQ_MIN_BATCH_SIZE  4
 
 static const int8_t kvalues_iq4nl[16]={-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
 
@@ -63,6 +63,25 @@ int ggml_sve_cnt_b = 0;
 #pragma warning(disable: 4702)
 #endif
 
+// Note: once we move threading into a separate C++ file
+// will use std::hardware_destructive_interference_size instead of hardcoding it here
+// and we'll use C++ attribute syntax.
+#define GGML_CACHE_LINE  64
+
+#if defined(__clang__) || defined(__GNUC__)
+#define GGML_CACHE_ALIGN __attribute__((aligned(GGML_CACHE_LINE)))
+#endif
+
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#define GGML_TSAN_ENABLED 1
+#endif
+#else // __has_feature
+#if defined(__SANITIZE_THREAD__)
+#define GGML_TSAN_ENABLED 1
+#endif
+#endif // __has_feature
+
 #if defined(_WIN32)
 
 #define WIN32_LEAN_AND_MEAN

@@ -72,6 +91,8 @@ int ggml_sve_cnt_b = 0;
 #include <windows.h>
 
 #if !defined(__clang__)
+#define GGML_CACHE_ALIGN __declspec(align(GGML_CACHE_LINE))
+
 typedef volatile LONG atomic_int;
 typedef atomic_int atomic_bool;
 typedef atomic_int atomic_flag;

@@ -114,6 +135,9 @@ static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) {
 static void atomic_flag_clear(atomic_flag * ptr) {
     InterlockedExchange(ptr, 0);
 }
+static void atomic_thread_fence(memory_order mo) {
+    MemoryBarrier();
+}
 #else // clang
 #include <stdatomic.h>
 #endif

@@ -289,7 +313,6 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
 #define GGML_GELU_QUICK_FP16
-#define GGML_N_TASKS_MAX (-1)
 
 #define GGML_SOFT_MAX_UNROLL 4
 #define GGML_VEC_DOT_UNROLL  2

@@ -2015,8 +2038,8 @@ struct ggml_threadpool {
 
     // synchronization primitives
     atomic_int n_graph;       // incremented when there is work to be done (i.e each graph)
-    atomic_int n_barrier;
-    atomic_int n_barrier_passed;
+    atomic_int GGML_CACHE_ALIGN n_barrier;
+    atomic_int GGML_CACHE_ALIGN n_barrier_passed;
     atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
 
     // these are atomic as an annotation for thread-sanitizer

@@ -3213,20 +3236,27 @@ static void ggml_barrier(struct ggml_threadpool * tp) {
     // enter barrier (full seq-cst fence)
     int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst);
 
-    int last = 0;
     if (n_barrier == (n_threads - 1)) {
         // last thread
         atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
-        last = 1;
-    } else {
-        // wait for other threads
-        while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
-            ggml_thread_cpu_relax();
-        }
+
+        // exit barrier (fill seq-cst fence)
+        atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
+        return;
+    }
+
+    // wait for other threads
+    while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
+        ggml_thread_cpu_relax();
     }
 
     // exit barrier (full seq-cst fence)
-    atomic_fetch_add_explicit(&tp->n_barrier_passed, last, memory_order_seq_cst);
+    // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
+#ifdef GGML_TSAN_ENABLED
+    atomic_fetch_add_explicit(&tp->n_barrier_passed, 0, memory_order_seq_cst);
+#else
+    atomic_thread_fence(memory_order_seq_cst);
+#endif
 #endif
 }
 

@@ -20299,10 +20329,13 @@ static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * s
 
 // sync thread state after polling
 static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) {
-    struct ggml_threadpool * threadpool = state->threadpool;
-    // this should just be atomic_thread_fence(seq_cst) but it confuses thread-sanitizer
-    // so instead we just use a dummy read-modify-write
-    atomic_fetch_add_explicit(&threadpool->n_graph, 0, memory_order_seq_cst);
+    // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead
+#ifdef GGML_TSAN_ENABLED
+    atomic_fetch_add_explicit(&state->threadpool->n_graph, 0, memory_order_seq_cst);
+#else
+    atomic_thread_fence(memory_order_seq_cst);
+#endif
+    UNUSED(state);
 }
 
 static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
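The barrier rework above leans on one idiom: ThreadSanitizer does not yet model a standalone seq-cst fence, so under TSAN a dummy read-modify-write on an atomic stands in for it. A minimal C++ sketch of the same idea, not the ggml code itself (ggml detects TSAN via __has_feature(thread_sanitizer) on clang and __SANITIZE_THREAD__ on GCC):

#include <atomic>

static std::atomic<int> fence_word{0};

inline void full_fence_tsan_friendly() {
#if defined(__SANITIZE_THREAD__)
    // a no-op fetch_add is an atomic RMW, which TSAN tracks and orders
    fence_word.fetch_add(0, std::memory_order_seq_cst);
#else
    // outside TSAN, an actual fence serves the same ordering purpose here
    std::atomic_thread_fence(std::memory_order_seq_cst);
#endif
}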
@@ -235,6 +235,7 @@ class MODEL_ARCH(IntEnum):
     NEMOTRON     = auto()
     EXAONE       = auto()
     GRANITE      = auto()
+    GRANITE_MOE  = auto()
 
 
 class MODEL_TENSOR(IntEnum):

@@ -392,6 +393,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.NEMOTRON:       "nemotron",
     MODEL_ARCH.EXAONE:         "exaone",
     MODEL_ARCH.GRANITE:        "granite",
+    MODEL_ARCH.GRANITE_MOE:    "granitemoe",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {

@@ -1232,6 +1234,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
     MODEL_ARCH.GRANITE: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
         MODEL_TENSOR.ATTN_NORM,
         MODEL_TENSOR.ATTN_Q,
         MODEL_TENSOR.ATTN_K,

@@ -1242,6 +1245,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.GRANITE_MOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
     # TODO
 }
 
@@ -251,11 +251,12 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.FFN_GATE_INP: (
            "layers.{bid}.feed_forward.gate",             # mixtral
            "model.layers.{bid}.block_sparse_moe.gate",   # mixtral
            "model.layers.{bid}.mlp.gate",                # qwen2moe olmoe
            "transformer.decoder_layer.{bid}.router",     # Grok
            "transformer.blocks.{bid}.ffn.router.layer",  # dbrx
+           "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
         ),
 
         MODEL_TENSOR.FFN_GATE_INP_SHEXP: (

@@ -364,10 +365,11 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.FFN_DOWN_EXP: (
            "layers.{bid}.feed_forward.experts.w2",          # mixtral (merged)
            "transformer.decoder_layer.{bid}.moe.linear_1",  # Grok (merged)
            "transformer.blocks.{bid}.ffn.experts.mlp.w2",   # dbrx
            "model.layers.{bid}.mlp.experts.down_proj",      # qwen2moe olmoe (merged)
+           "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
         ),
 
         MODEL_TENSOR.FFN_DOWN_SHEXP: (
@@ -1068,6 +1068,7 @@ extern "C" {
     LLAMA_API struct llama_sampler * llama_sampler_init_dist     (uint32_t seed);
 
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
+    /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
     LLAMA_API struct llama_sampler * llama_sampler_init_softmax  (void);
 
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
klite.embd
69
klite.embd
|
@ -4185,6 +4185,7 @@ Current version indicated by LITEVER below.
|
||||||
|
|
||||||
const default_oai_image_endpoint = "/images/generations";
|
const default_oai_image_endpoint = "/images/generations";
|
||||||
const default_oai_tts_endpoint = "/audio/speech";
|
const default_oai_tts_endpoint = "/audio/speech";
|
||||||
|
const default_dalle_model_name = "dall-e-3";
|
||||||
|
|
||||||
const claude_submit_endpoint = "/complete";
|
const claude_submit_endpoint = "/complete";
|
||||||
const claude_submit_endpoint_v3 = "/messages";
|
const claude_submit_endpoint_v3 = "/messages";
|
||||||
|
@ -4325,6 +4326,7 @@ Current version indicated by LITEVER below.
|
||||||
saved_oai_addr: default_oai_base, //do not ever share this in save files!
|
saved_oai_addr: default_oai_base, //do not ever share this in save files!
|
||||||
saved_dalle_key: "",
|
saved_dalle_key: "",
|
||||||
saved_dalle_url: (default_oai_base + "/v1" + default_oai_image_endpoint),
|
saved_dalle_url: (default_oai_base + "/v1" + default_oai_image_endpoint),
|
||||||
|
saved_dalle_model: default_dalle_model_name,
|
||||||
saved_oai_tts_key: "",
|
saved_oai_tts_key: "",
|
||||||
saved_oai_tts_url: (default_oai_base + "/v1" + default_oai_tts_endpoint),
|
saved_oai_tts_url: (default_oai_base + "/v1" + default_oai_tts_endpoint),
|
||||||
saved_openrouter_key: "",
|
saved_openrouter_key: "",
|
||||||
|
@ -4557,16 +4559,23 @@ Current version indicated by LITEVER below.
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id":12,
|
"id":12,
|
||||||
"name":"Mistral Gen 1",
|
"name":"Mistral V1",
|
||||||
"user":"\\n[INST] ",
|
"user":"</s> [INST] ",
|
||||||
"assistant":" [/INST]\\n",
|
"assistant":" [/INST]",
|
||||||
"system":"",
|
"system":"",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id":13,
|
"id":13,
|
||||||
"name":"Mistral Gen 2",
|
"name":"Mistral V2 & V3",
|
||||||
"user":"</s>\\n[INST]",
|
"user":"</s>[INST] ",
|
||||||
"assistant":"[/INST]\\n",
|
"assistant":"[/INST]",
|
||||||
|
"system":"",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id":14,
|
||||||
|
"name":"Mistral V3-Tekken",
|
||||||
|
"user":"</s>[INST]",
|
||||||
|
"assistant":"[/INST]",
|
||||||
"system":"",
|
"system":"",
|
||||||
}
|
}
|
||||||
];
|
];
|
||||||
|
@ -5114,6 +5123,7 @@ Current version indicated by LITEVER below.
|
||||||
const foundChub = urlParams.get('chub');
|
const foundChub = urlParams.get('chub');
|
||||||
const foundPyg = urlParams.get('pyg');
|
const foundPyg = urlParams.get('pyg');
|
||||||
const foundAicc = urlParams.get('aicc');
|
const foundAicc = urlParams.get('aicc');
|
||||||
|
const foundQuery = urlParams.get('query');
|
||||||
|
|
||||||
if (foundStory && foundStory != "") {
|
if (foundStory && foundStory != "") {
|
||||||
if (localsettings.persist_session && !safe_to_overwrite()) {
|
if (localsettings.persist_session && !safe_to_overwrite()) {
|
||||||
|
@ -5150,6 +5160,25 @@ Current version indicated by LITEVER below.
|
||||||
//purge url params
|
//purge url params
|
||||||
window.history.replaceState(null, null, window.location.pathname);
|
window.history.replaceState(null, null, window.location.pathname);
|
||||||
}
|
}
|
||||||
|
else if (foundQuery && foundQuery != "")
|
||||||
|
{
|
||||||
|
window.history.replaceState(null, null, window.location.pathname);
|
||||||
|
if (localsettings.persist_session && !safe_to_overwrite()) {
|
||||||
|
msgboxYesNo("You already have an existing persistent story. Do you want to overwrite it?","Overwrite Story Warning",()=>{
|
||||||
|
localsettings.opmode = 4;
|
||||||
|
restart_new_game(false);
|
||||||
|
document.getElementById("input_text").value = foundQuery;
|
||||||
|
submit_generation();
|
||||||
|
},null,false);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
localsettings.opmode = 4;
|
||||||
|
restart_new_game(false);
|
||||||
|
document.getElementById("input_text").value = foundQuery;
|
||||||
|
submit_generation();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var image_models_fetched = false;
|
var image_models_fetched = false;
|
||||||
|
@ -5363,6 +5392,18 @@ Current version indicated by LITEVER below.
|
||||||
}
|
}
|
||||||
},false);
|
},false);
|
||||||
}
|
}
|
||||||
|
function set_dalle_model()
|
||||||
|
{
|
||||||
|
inputBox("Enter DALL-E API Model Identifier.","DALL-E API Model Identifier",localsettings.saved_dalle_model,"Input DALL-E Model Identifier", ()=>{
|
||||||
|
let userinput = getInputBoxValue();
|
||||||
|
userinput = userinput.trim();
|
||||||
|
if (userinput != null && userinput!="") {
|
||||||
|
localsettings.saved_dalle_model = userinput.trim();
|
||||||
|
}else{
|
||||||
|
localsettings.saved_dalle_model = default_dalle_model_name;
|
||||||
|
}
|
||||||
|
},false);
|
||||||
|
}
|
||||||
|
|
||||||
function set_oai_tts_key()
|
function set_oai_tts_key()
|
||||||
{
|
{
|
||||||
|
@ -5394,7 +5435,7 @@ Current version indicated by LITEVER below.
|
||||||
let prompt = splits[0].trim();
|
let prompt = splits[0].trim();
|
||||||
|
|
||||||
let dalle_payload = {
|
let dalle_payload = {
|
||||||
"model": "dall-e-3",
|
"model": localsettings.saved_dalle_model,
|
||||||
"prompt": prompt,
|
"prompt": prompt,
|
||||||
"n": 1,
|
"n": 1,
|
||||||
"size": "1024x1024",
|
"size": "1024x1024",
|
||||||
|
@ -12596,7 +12637,7 @@ Current version indicated by LITEVER below.
|
||||||
//mistral api does not support presence pen
|
//mistral api does not support presence pen
|
||||||
oai_payload.presence_penalty = scaled_rep_pen;
|
oai_payload.presence_penalty = scaled_rep_pen;
|
||||||
}
|
}
|
||||||
if(targetep.toLowerCase().includes("featherless.ai"))
|
if(document.getElementById("useoainonstandard").checked || targetep.toLowerCase().includes("featherless.ai"))
|
||||||
{
|
{
|
||||||
//featherless api supports additional fields, include them
|
//featherless api supports additional fields, include them
|
||||||
oai_payload.top_k = (submit_payload.params.top_k<1?300:submit_payload.params.top_k);
|
oai_payload.top_k = (submit_payload.params.top_k<1?300:submit_payload.params.top_k);
|
||||||
|
@ -12605,6 +12646,7 @@ Current version indicated by LITEVER below.
|
||||||
{
|
{
|
||||||
oai_payload.seed = submit_payload.params.sampler_seed;
|
oai_payload.seed = submit_payload.params.sampler_seed;
|
||||||
}
|
}
|
||||||
|
oai_payload.top_a = localsettings.top_a;
|
||||||
}
|
}
|
||||||
if(submit_payload.params.logit_bias && JSON.stringify(submit_payload.params.logit_bias) != '{}')
|
if(submit_payload.params.logit_bias && JSON.stringify(submit_payload.params.logit_bias) != '{}')
|
||||||
{
|
{
|
||||||
|
@ -17982,11 +18024,13 @@ Current version indicated by LITEVER below.
|
||||||
<button type="button" class="btn btn-primary" style="display:inline;width:105px;" id="oaiusecustom" onclick="select_custom_oai_model()">Use Custom</button>
|
<button type="button" class="btn btn-primary" style="display:inline;width:105px;" id="oaiusecustom" onclick="select_custom_oai_model()">Use Custom</button>
|
||||||
<div style="display:inline-flex">
|
<div style="display:inline-flex">
|
||||||
<div><input type="checkbox" id="oaiaddversion" title="Add Endpoint Version Number" onchange="" checked>
|
<div><input type="checkbox" id="oaiaddversion" title="Add Endpoint Version Number" onchange="" checked>
|
||||||
<div class="box-label">Add Version Num</div></div>
|
<div class="box-label">Add Ver. Num</div></div>
|
||||||
<div><input type="checkbox" id="oaistreaming" title="Enable SSE Streaming" onchange="">
|
<div><input type="checkbox" id="oaistreaming" title="Enable SSE Streaming" onchange="">
|
||||||
<div class="box-label">Streaming</div></div>
|
<div class="box-label">Streaming</div></div>
|
||||||
<div><input type="checkbox" id="useoaichatcompl" title="Use ChatCompletions API" onchange="toggleoaichatcompl()">
|
<div><input type="checkbox" id="useoaichatcompl" title="Use ChatCompletions API" onchange="toggleoaichatcompl()">
|
||||||
<div class="box-label" id="useoaichatcompllabel">ChatCompletions API</div></div>
|
<div class="box-label">Chat-Completions API</div></div>
|
||||||
|
<div><input type="checkbox" id="useoainonstandard" title="Send Non-Standard Fields">
|
||||||
|
<div class="box-label">Non-Standard Fields</div></div>
|
||||||
</div>
|
</div>
|
||||||
<span id="useoaichatcomplbox" class="hidden" onload="toggleoaichatcompl();">
|
<span id="useoaichatcomplbox" class="hidden" onload="toggleoaichatcompl();">
|
||||||
<br>
|
<br>
|
||||||
|
@ -18694,8 +18738,9 @@ Current version indicated by LITEVER below.
|
||||||
</div>
|
</div>
|
||||||
<div id="generate_images_dalle_container" class="settinglabel hidden">
|
<div id="generate_images_dalle_container" class="settinglabel hidden">
|
||||||
<table width="100%"><tr>
|
<table width="100%"><tr>
|
||||||
<td><button id="generate_images_dalle_setkey" type="button" class="btn btn-primary" style="width:100%; padding:2px 3px;margin-top:2px;font-size:11px;" onclick="set_dalle_url()">Set URL</button></td>
|
<td><button type="button" class="btn btn-primary" style="width:100%; padding:2px 3px;margin-top:2px;font-size:11px;" onclick="set_dalle_url()">Set URL</button></td>
|
||||||
<td><button id="generate_images_dalle_seturl" type="button" class="btn btn-primary" style="width:100%; padding:2px 3px;margin-top:2px;font-size:11px;" onclick="set_dalle_key()">Set Key</button></td>
|
<td><button type="button" class="btn btn-primary" style="width:100%; padding:2px 3px;margin-top:2px;font-size:11px;" onclick="set_dalle_key()">Set Key</button></td>
|
||||||
|
<td><button type="button" class="btn btn-primary" style="width:100%; padding:2px 3px;margin-top:2px;font-size:11px;" onclick="set_dalle_model()">Model</button></td>
|
||||||
</tr></table>
|
</tr></table>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|
|
@@ -41,7 +41,7 @@ maxhordelen = 400
 modelbusy = threading.Lock()
 requestsinqueue = 0
 defaultport = 5001
-KcppVersion = "1.75.2"
+KcppVersion = "1.76"
 showdebug = True
 guimode = False
 showsamplerwarning = True
@@ -28,6 +28,8 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
 #define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
 #define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
 #define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+#define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
+#define LLAMA_LOG_CONT(...)  llama_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
 
 //
 // helpers
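The two new levels slot into the existing macro family; hypothetical call sites could look like the following (the format strings are examples, not taken from the source):

LLAMA_LOG_DEBUG("%s: loading tensor block %d\n", __func__, 0);
LLAMA_LOG_CONT(".");  // continues the previous log line without a new level prefix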
@@ -3,13 +3,14 @@
 #include "llama-vocab.h"
 #include "llama-grammar.h"
 
-#include <cassert>
 #include <algorithm>
-#include <cstring>
-#include <ctime>
+#include <cassert>
 #include <cfloat>
 #include <chrono>
 #include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
 #include <numeric>
 #include <random>
 #include <unordered_map>
@@ -1826,11 +1826,7 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla
 }
 
 bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
-    return token != -1 && (
-        token == llama_token_eos_impl(vocab) ||
-        token == llama_token_eot_impl(vocab) ||
-        token == llama_token_eom_impl(vocab)
-    );
+    return token != -1 && vocab.special_eog_ids.count(token) > 0;
 }
 
 bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token) {
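The end-of-generation check above now consults a set of token ids collected at vocab load time rather than comparing against three hard-coded specials. A simplified sketch of the equivalent lookup, with names reduced for illustration:

#include <cstdint>
#include <set>

using token_id = int32_t;

bool token_is_eog(const std::set<token_id> & special_eog_ids, token_id token) {
    return token != -1 && special_eog_ids.count(token) > 0;
}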
@@ -6,6 +6,7 @@
 #include <vector>
 #include <unordered_map>
 #include <map>
+#include <set>
 
 struct llama_vocab {
     using id = llama_token;

@@ -49,12 +50,15 @@ struct llama_vocab {
     id special_eot_id  = -1; // TODO: move above after "eos_id", and here add "file separator" token
     id special_eom_id  = -1;
 
+    // set of all tokens that cause "end of generation"
+    std::set<id> special_eog_ids;
+
     // tokenizer flags
     bool tokenizer_add_space_prefix           = false;
     bool tokenizer_add_bos                    = false;
     bool tokenizer_add_eos                    = false;
     bool tokenizer_ignore_merges              = false;
     bool tokenizer_clean_spaces               = false;  // clean_up_tokenization_spaces
     bool tokenizer_remove_extra_whitespaces   = false;
     bool tokenizer_escape_whitespaces         = true;
     bool tokenizer_treat_whitespace_as_suffix = false;
138
src/llama.cpp
138
src/llama.cpp
|
@ -225,6 +225,7 @@ enum llm_arch {
|
||||||
LLM_ARCH_EXAONE,
|
LLM_ARCH_EXAONE,
|
||||||
LLM_ARCH_RWKV6,
|
LLM_ARCH_RWKV6,
|
||||||
LLM_ARCH_GRANITE,
|
LLM_ARCH_GRANITE,
|
||||||
|
LLM_ARCH_GRANITE_MOE,
|
||||||
LLM_ARCH_UNKNOWN,
|
LLM_ARCH_UNKNOWN,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -276,6 +277,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||||
{ LLM_ARCH_EXAONE, "exaone" },
|
{ LLM_ARCH_EXAONE, "exaone" },
|
||||||
{ LLM_ARCH_RWKV6, "rwkv6" },
|
{ LLM_ARCH_RWKV6, "rwkv6" },
|
||||||
{ LLM_ARCH_GRANITE, "granite" },
|
{ LLM_ARCH_GRANITE, "granite" },
|
||||||
|
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
|
||||||
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -1477,6 +1479,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
||||||
{
|
{
|
||||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||||
|
{ LLM_TENSOR_OUTPUT, "output" },
|
||||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||||
|
@@ -1488,6 +1491,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GRANITE_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
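Each entry added for LLM_ARCH_GRANITE_MOE pairs a tensor enum with a printf-style name pattern; per-layer names are produced by substituting the layer index for "%d" before the tensor is looked up in the GGUF file. A minimal standalone sketch of that formatting step (illustrative only, not the loader's actual helper):

#include <cstdio>

int main() {
    char name[64];
    // expand a per-layer pattern such as "blk.%d.ffn_gate_exps" for layers 0 and 1
    for (int il = 0; il < 2; ++il) {
        snprintf(name, sizeof(name), "blk.%d.ffn_gate_exps", il);
        printf("%s\n", name); // -> blk.0.ffn_gate_exps, blk.1.ffn_gate_exps
    }
    return 0;
}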
@@ -2410,7 +2431,7 @@ struct llama_hparams {
     float f_max_alibi_bias = 0.0f;
     float f_logit_scale = 0.0f;
 
-    // Additional scale factors (Granite)
+    // Additional scale factors (Granite/Granite MoE)
     float f_residual_scale = 0.0f;
     float f_embedding_scale = 0.0f;
     float f_attention_scale = 0.0f;
@@ -3070,18 +3091,14 @@ struct llama_sbatch {
         } else {
             // simple split
             if (batch->n_seq_id) {
-                for (size_t i = 0; i < length; ++i) {
-                    ubatch.n_seq_id = batch->n_seq_id + seq.offset;
-                }
+                ubatch.n_seq_id = batch->n_seq_id + seq.offset;
             } else {
                 for (size_t i = 0; i < length; ++i) {
                     ubatch.n_seq_id[ubatch.n_seqs + i] = 1;
                 }
             }
             if (batch->seq_id) {
-                for (size_t i = 0; i < length; ++i) {
-                    ubatch.seq_id = batch->seq_id + seq.offset;
-                }
+                ubatch.seq_id = batch->seq_id + seq.offset;
             } else {
                 for (size_t i = 0; i < length; ++i) {
                     ubatch.seq_id[ubatch.n_seqs + i] = &seq.all_seq_id;
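The hunk above drops a copy loop in favor of a single pointer assignment: for a simple split, the micro-batch can alias the caller's per-token arrays at the right offset instead of duplicating them. A self-contained sketch of the idea (toy buffers, not the real llama_sbatch types):

#include <cstdint>
#include <cstdio>

int main() {
    int32_t n_seq_id[6] = {1, 1, 2, 2, 1, 1}; // per-token data owned by the caller
    size_t offset = 2, length = 3;

    // old approach: copy each element into a separate buffer
    int32_t copied[3];
    for (size_t i = 0; i < length; ++i) {
        copied[i] = n_seq_id[offset + i];
    }

    // new approach: point into the caller's storage, no copy
    const int32_t * view = n_seq_id + offset;

    for (size_t i = 0; i < length; ++i) {
        printf("%d %d\n", copied[i], view[i]); // both print the same values
    }
    return 0;
}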
@@ -6084,6 +6101,7 @@ static void llm_load_hparams(
             }
         } break;
         case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
@@ -6092,6 +6110,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
 
                 switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_3B; break;
                     case 40: model.type = e_model::MODEL_3B; break;
                     // Add additional layer/vocab/etc checks here for other model sizes
                     default: model.type = e_model::MODEL_UNKNOWN;
@@ -6563,21 +6582,21 @@ static void llm_load_vocab(
         // for now, we apply this workaround to find the EOT token based on its text
         if (vocab.special_eot_id == -1) {
             for (const auto & t : vocab.token_to_id) {
-                if (
+                if (false
                     // TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
                     // need to fix convert script
                     //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
-                    (t.first == "<|eot_id|>" ||
-                     t.first == "<|im_end|>" ||
-                     t.first == "<|end|>" ||
-                     t.first == "<end_of_turn>" ||
-                     t.first == "<|endoftext|>"
-                    )
+                        || t.first == "<|eot_id|>"
+                        || t.first == "<|im_end|>"
+                        || t.first == "<|end|>"
+                        || t.first == "<end_of_turn>"
+                        || t.first == "<|endoftext|>"
+                        || t.first == "<EOT>"
                    ) {
                     vocab.special_eot_id = t.second;
                     if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                         LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                             __func__, t.first.c_str());
                         vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
                     break;
@@ -6600,6 +6619,44 @@ static void llm_load_vocab(
                 }
             }
         }
+
+        // maintain a list of tokens that cause end-of-generation
+        // this is currently determined based on the token text, which is obviously not ideal
+        // ref: https://github.com/ggerganov/llama.cpp/issues/9606
+        vocab.special_eog_ids.clear();
+        for (const auto & t : vocab.token_to_id) {
+            if (false
+                    || t.first == "<|eot_id|>"
+                    || t.first == "<|im_end|>"
+                    || t.first == "<|end|>"
+                    || t.first == "<end_of_turn>"
+                    || t.first == "<|endoftext|>"
+                    || t.first == "<|eom_id|>"
+                    || t.first == "<EOT>"
+               ) {
+                vocab.special_eog_ids.insert(t.second);
+                if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                        __func__, t.first.c_str());
+                    vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                }
+            }
+        }
+
+        if (vocab.special_eos_id != -1 && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
+            vocab.special_eog_ids.insert(vocab.special_eos_id);
+            LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+        }
+
+        if (vocab.special_eot_id != -1 && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
+            vocab.special_eog_ids.insert(vocab.special_eot_id);
+            LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+        }
+
+        if (vocab.special_eom_id != -1 && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
+            vocab.special_eog_ids.insert(vocab.special_eom_id);
+            LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+        }
     }
 
     // build special tokens cache
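Callers do not need to consult special_eog_ids directly; the public llama_token_is_eog() helper in llama.h answers the per-token question. A minimal caller-side sketch (placeholder names, assuming a loaded model and an already-sampled token):

#include "llama.h"

// Returns true when `tok` should terminate generation for `model`:
// EOS, EOT, EOM and the chat-style terminators collected above all qualify.
static bool should_stop(const struct llama_model * model, llama_token tok) {
    return llama_token_is_eog(model, tok);
}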
@@ -6803,6 +6860,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
     if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
     if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
+    if (vocab.special_eom_id != -1) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); }
+
+    for (const auto & id : vocab.special_eog_ids) {
+        LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() );
+    }
 
     LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
@@ -6821,7 +6883,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
 
-    if (model.arch == LLM_ARCH_GRANITE) {
+    if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
         LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
         LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
         LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@@ -7004,6 +7066,7 @@ static bool llm_load_tensors(
         case LLM_ARCH_REFACT:
         case LLM_ARCH_MINICPM:
         case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -9993,17 +10056,36 @@ struct llm_build_context {
         const int64_t n_head_kv = hparams.n_head_kv(il);
         const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
         struct ggml_tensor * rope_factors = build_rope_factors(il);
-        struct ggml_tensor * tmp =
-            // we rotate only the first n_rot dimensions
-            ggml_rope_ext_inplace(ctx0,
-                    ggml_view_3d(ctx0, kv_self.k_l[il],
-                        n_embd_head_k, n_head_kv, n_ctx,
-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                        0),
+        struct ggml_tensor * k =
+            ggml_view_3d(ctx0, kv_self.k_l[il],
+                n_embd_head_k, n_head_kv, n_ctx,
+                ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
+                ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+                0);
+
+        struct ggml_tensor * tmp;
+        if (ggml_is_quantized(k->type)) {
+            // dequantize to f32 -> RoPE -> quantize back
+            tmp = ggml_cast(ctx0, k, GGML_TYPE_F32);
+            cb(tmp, "K_f32", il);
+            for (auto * backend : lctx.backends) {
+                // Figure out which backend KV cache belongs to
+                if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft)) {
+                    ggml_backend_sched_set_tensor_backend(lctx.sched, tmp, backend);
+                    break;
+                }
+            }
+            tmp = ggml_rope_ext_inplace(ctx0, tmp,
                     lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
+            cb(tmp, "K_shifted_f32", il);
+            tmp = ggml_cpy(ctx0, tmp, k);
+        } else {
+            // we rotate only the first n_rot dimensions
+            tmp = ggml_rope_ext_inplace(ctx0, k,
+                    lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+        }
         cb(tmp, "K_shifted", il);
         ggml_build_forward_expand(gf, tmp);
     }
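The quantized branch above follows a general round-trip pattern: data stored in a quantized format cannot be rotated in place, so it is promoted to f32, transformed, and copied back. A toy standalone illustration of that pattern using a made-up 8-bit scale quantizer (not one of ggml's real formats):

#include <cmath>
#include <cstdint>
#include <cstdio>

// hypothetical 8-bit quantization: store value/scale as int8
static int8_t quantize(float x, float scale)    { return (int8_t) std::lround(x / scale); }
static float  dequantize(int8_t q, float scale) { return q * scale; }

int main() {
    const float scale = 0.05f;
    int8_t cache[4] = { quantize(0.10f, scale), quantize(-0.25f, scale),
                        quantize(0.40f, scale), quantize(0.55f, scale) };

    // round trip: dequantize -> apply the transform in float -> quantize back
    for (int i = 0; i < 4; ++i) {
        float f = dequantize(cache[i], scale); // promote to f32
        f = -f;                                // stand-in for the real rotation
        cache[i] = quantize(f, scale);         // write back in quantized form
    }

    for (int i = 0; i < 4; ++i) {
        printf("%f\n", dequantize(cache[i], scale));
    }
    return 0;
}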
@@ -15949,6 +16031,7 @@ static struct ggml_cgraph * llama_build_graph(
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
         case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             {
                 result = llm.build_llama();
             } break;
@@ -18719,9 +18802,9 @@ struct llama_model * llama_load_model_from_file(
             unsigned percentage = (unsigned) (100 * progress);
             while (percentage > *cur_percentage_p) {
                 *cur_percentage_p = percentage;
-                LLAMA_LOG(".");
+                LLAMA_LOG_CONT(".");
                 if (percentage >= 100) {
-                    LLAMA_LOG("\n");
+                    LLAMA_LOG_CONT("\n");
                 }
             }
             return true;
@@ -19236,6 +19319,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_CHATGLM:
         case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2