survived the storm, again

Concedo 2025-01-16 22:25:18 +08:00
commit 11cd7c7bb0
38 changed files with 31785 additions and 26630 deletions


@ -131,17 +131,26 @@ std::string common_arg::to_string() {
static void common_params_handle_model_default(
std::string & model,
std::string & model_url,
const std::string & model_url,
std::string & hf_repo,
std::string & hf_file) {
std::string & hf_file,
const std::string & hf_token) {
if (!hf_repo.empty()) {
// short-hand to avoid specifying --hf-file -> default it to --model
if (hf_file.empty()) {
if (model.empty()) {
throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
auto auto_detected = common_get_hf_file(hf_repo, hf_token);
if (auto_detected.first.empty() || auto_detected.second.empty()) {
exit(1); // built without CURL, error message already printed
}
hf_repo = auto_detected.first;
hf_file = auto_detected.second;
} else {
hf_file = model;
} else if (model.empty()) {
}
}
// make sure model path is present (for caching purposes)
if (model.empty()) {
// this is to avoid different repo having same file name, or same file name in different subdirs
std::string filename = hf_repo + "_" + hf_file;
// to make sure we don't have any slashes in the filename
@ -291,8 +300,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
}
// TODO: refactor model params in a common struct
common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file);
common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file);
common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file, params.hf_token);
common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file, params.hf_token);
if (params.escape) {
string_process_escapes(params.prompt);
@ -769,15 +778,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-cnv", "--conversation"},
string_format(
"run in conversation mode:\n"
"- does not print special tokens and suffix/prefix\n"
"- interactive mode is also enabled\n"
"(default: %s)",
params.conversation ? "true" : "false"
),
"(default: auto enabled if chat template is available)",
[](common_params & params) {
params.conversation = true;
params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg(
{"-no-cnv", "--no-conversation"},
"force disable conversation mode (default: false)",
[](common_params & params) {
params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
add_opt(common_arg(
@ -1584,21 +1597,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_env("LLAMA_ARG_MODEL_URL"));
add_opt(common_arg(
{"-hfr", "--hf-repo"}, "REPO",
"Hugging Face model repository (default: unused)",
{"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
"Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
"example: unsloth/phi-4-GGUF:q4_k_m\n"
"(default: unused)",
[](common_params & params, const std::string & value) {
params.hf_repo = value;
}
).set_env("LLAMA_ARG_HF_REPO"));
add_opt(common_arg(
{"-hff", "--hf-file"}, "FILE",
"Hugging Face model file (default: unused)",
"Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
[](common_params & params, const std::string & value) {
params.hf_file = value;
}
).set_env("LLAMA_ARG_HF_FILE"));
add_opt(common_arg(
{"-hfrv", "--hf-repo-v"}, "REPO",
{"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
"Hugging Face model repository for the vocoder model (default: unused)",
[](common_params & params, const std::string & value) {
params.vocoder.hf_repo = value;


@ -75,6 +75,22 @@
#include <sys/syslimits.h>
#endif
#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
//
// CURL utils
//
using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
struct curl_slist_ptr {
struct curl_slist * ptr = nullptr;
~curl_slist_ptr() {
if (ptr) {
curl_slist_free_all(ptr);
}
}
};
#endif // LLAMA_USE_CURL
using json = nlohmann::ordered_json;
@ -1132,7 +1148,8 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
// Initialize libcurl
std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
curl_slist_ptr http_headers;
if (!curl) {
LOG_ERR("%s: error initializing libcurl\n", __func__);
return false;
@ -1146,11 +1163,9 @@ static bool common_download_file(const std::string & url, const std::string & pa
// Check if hf-token or bearer-token was specified
if (!hf_token.empty()) {
std::string auth_header = "Authorization: Bearer ";
auth_header += hf_token.c_str();
struct curl_slist *http_headers = NULL;
http_headers = curl_slist_append(http_headers, auth_header.c_str());
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
std::string auth_header = "Authorization: Bearer " + hf_token;
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
}
#if defined(_WIN32)
@ -1446,6 +1461,80 @@ struct llama_model * common_load_model_from_hf(
return common_load_model_from_url(model_url, local_path, hf_token, params);
}
/**
* Allow getting the HF file from the HF repo with tag (like ollama), for example:
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
* - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
* Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
*
* Return pair of <repo, file> (with "repo" already having tag removed)
*
* Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
*/
std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
std::string tag = parts.size() > 1 ? parts.back() : "latest";
std::string hf_repo = parts[0];
if (string_split<std::string>(hf_repo, '/').size() != 2) {
throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
}
// fetch model info from Hugging Face Hub API
json model_info;
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
curl_slist_ptr http_headers;
std::string res_str;
std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
return size * nmemb;
};
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
#if defined(_WIN32)
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
#endif
if (!hf_token.empty()) {
std::string auth_header = "Authorization: Bearer " + hf_token;
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
}
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
CURLcode res = curl_easy_perform(curl.get());
if (res != CURLE_OK) {
throw std::runtime_error("error: cannot make GET request to HF API");
}
long res_code;
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
if (res_code == 200) {
model_info = json::parse(res_str);
} else if (res_code == 401) {
throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
} else {
throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
}
// check response
if (!model_info.contains("ggufFile")) {
throw std::runtime_error("error: model does not have ggufFile");
}
json & gguf_file = model_info.at("ggufFile");
if (!gguf_file.contains("rfilename")) {
throw std::runtime_error("error: ggufFile does not have rfilename");
}
return std::make_pair(hf_repo, gguf_file.at("rfilename"));
}
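For reference, a minimal caller sketch of the tag-resolution helper above; the resolved filename shown in the trailing comment is an illustrative assumption, not a value guaranteed by the HF API:

#include "common.h"  // declares common_get_hf_file(), as added in this diff

static void resolve_hf_shorthand_sketch() {
    // split "<user>/<model>[:quant]" and ask the HF manifest API for the matching GGUF file
    const std::string repo_with_tag = "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M";
    const std::string hf_token;  // only needed for gated or private repositories
    std::pair<std::string, std::string> repo_file = common_get_hf_file(repo_with_tag, hf_token);
    if (repo_file.first.empty() || repo_file.second.empty()) {
        return;  // built without libcurl; a warning has already been printed
    }
    // repo_file.first  -> "bartowski/Llama-3.2-3B-Instruct-GGUF" (tag stripped)
    // repo_file.second -> e.g. "Llama-3.2-3B-Instruct-Q4_K_M.gguf" (taken from ggufFile.rfilename)
}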
#else
struct llama_model * common_load_model_from_url(
@ -1467,6 +1556,11 @@ struct llama_model * common_load_model_from_hf(
return nullptr;
}
std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
return std::make_pair("", "");
}
#endif // LLAMA_USE_CURL
//
@ -1638,15 +1732,8 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
//
std::string common_get_builtin_chat_template(const struct llama_model * model) {
static const char * template_key = "tokenizer.chat_template";
// call with NULL buffer to get the total size of the string
int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
if (res > 0) {
std::vector<char> model_template(res + 1, 0);
llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
return std::string(model_template.data(), model_template.size() - 1);
}
return "";
const char * ptr_tmpl = llama_model_chat_template(model);
return ptr_tmpl == nullptr ? "" : ptr_tmpl;
}
bool common_chat_verify_template(const std::string & tmpl) {


@ -99,6 +99,12 @@ enum dimre_method {
DIMRE_METHOD_MEAN,
};
enum common_conversation_mode {
COMMON_CONVERSATION_MODE_DISABLED = 0,
COMMON_CONVERSATION_MODE_ENABLED = 1,
COMMON_CONVERSATION_MODE_AUTO = 2,
};
// sampling parameters
struct common_params_sampling {
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@ -273,7 +279,6 @@ struct common_params {
bool special = false; // enable special token output
bool interactive = false; // interactive mode
bool interactive_first = false; // wait for user input immediately
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
@ -299,6 +304,8 @@ struct common_params {
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector // NOLINT
std::vector<std::string> image; // path to image file(s)
@ -452,6 +459,11 @@ static bool string_starts_with(const std::string & str,
return str.rfind(prefix, 0) == 0;
}
static bool string_ends_with(const std::string & str,
const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
}
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);
@ -499,6 +511,9 @@ struct llama_model * common_load_model_from_hf(
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params);
std::pair<std::string, std::string> common_get_hf_file(
const std::string & hf_repo_with_tag,
const std::string & hf_token);
// clear LoRA adapters from context, then apply new list of adapters
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);


@ -41,7 +41,7 @@ echo PASS
echo
# 2b. Test the sharded model is loading properly
$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32
$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32
echo PASS
echo
@ -51,7 +51,7 @@ echo PASS
echo
# 3b. Test the merged model is loading properly
$MAIN --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32
$MAIN -no-cnv --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32
echo PASS
echo
@ -61,7 +61,7 @@ echo PASS
echo
# 4b. Test the sharded model is loading properly
$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32
$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32
echo PASS
echo
@ -71,7 +71,7 @@ echo
#echo
# 5b. Test the merged model is loading properly
#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32
#$MAIN -no-cnv --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32
#echo PASS
#echo
@ -81,7 +81,7 @@ echo PASS
echo
# 6b. Test the sharded model is loading properly
$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32
$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32
echo PASS
echo


@ -31,6 +31,8 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
static const char * DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant";
static llama_context ** g_ctx;
static llama_model ** g_model;
static common_sampler ** g_smpl;
@ -205,8 +207,24 @@ int main(int argc, char ** argv) {
LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
}
// auto enable conversation mode if chat template is available
const bool has_chat_template = !common_get_builtin_chat_template(model).empty() || !params.chat_template.empty();
if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) {
if (has_chat_template) {
LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__);
params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
} else {
params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
}
}
// in case user force-activate conversation mode (via -cnv) without proper chat template, we show a warning
if (params.conversation_mode && !has_chat_template) {
LOG_WRN("%s: chat template is not available or is not supported. This may cause the model to output suboptimal responses\n", __func__);
}
// print chat template example in conversation mode
if (params.conversation) {
if (params.conversation_mode) {
if (params.enable_chat_template) {
LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template).c_str());
} else {
@ -253,8 +271,10 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd_inp;
{
auto prompt = (params.conversation && params.enable_chat_template && !params.prompt.empty())
? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
auto prompt = (params.conversation_mode && params.enable_chat_template)
// format the system prompt in conversation mode (fallback to default if empty)
? chat_add_and_format(model, chat_msgs, "system", params.prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.prompt)
// otherwise use the prompt as is
: params.prompt;
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
LOG_DBG("tokenize the prompt\n");
@ -328,7 +348,7 @@ int main(int argc, char ** argv) {
params.n_keep += add_bos; // always keep the BOS token
}
if (params.conversation) {
if (params.conversation_mode) {
params.interactive_first = true;
}
@ -452,7 +472,11 @@ int main(int argc, char ** argv) {
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
LOG_INF( " - Press Ctrl+C to interject at any time.\n");
#endif
LOG_INF( "%s\n", control_message);
LOG_INF( "%s", control_message);
if (params.conversation_mode && params.enable_chat_template && params.prompt.empty()) {
LOG_INF( " - Using default system message. To change it, set a different value via -p PROMPT or -f FILE argument.\n");
}
LOG_INF("\n");
is_interacting = params.interactive_first;
}
@ -764,7 +788,7 @@ int main(int argc, char ** argv) {
}
// if current token is not EOG, we add it to current assistant message
if (params.conversation) {
if (params.conversation_mode) {
const auto id = common_sampler_last(smpl);
assistant_ss << common_token_to_piece(ctx, id, false);
}
@ -772,7 +796,7 @@ int main(int argc, char ** argv) {
if (n_past > 0 && is_interacting) {
LOG_DBG("waiting for user input\n");
if (params.conversation) {
if (params.conversation_mode) {
LOG("\n> ");
}
@ -782,7 +806,7 @@ int main(int argc, char ** argv) {
}
std::string buffer;
if (!params.input_prefix.empty() && !params.conversation) {
if (!params.input_prefix.empty() && !params.conversation_mode) {
LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
LOG("%s", params.input_prefix.c_str());
}
@ -806,7 +830,7 @@ int main(int argc, char ** argv) {
// Entering a empty line lets the user pass control back
if (buffer.length() > 1) {
// append input suffix if any
if (!params.input_suffix.empty() && !params.conversation) {
if (!params.input_suffix.empty() && !params.conversation_mode) {
LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
LOG("%s", params.input_suffix.c_str());
}
@ -819,7 +843,7 @@ int main(int argc, char ** argv) {
string_process_escapes(buffer);
}
bool format_chat = params.conversation && params.enable_chat_template;
bool format_chat = params.conversation_mode && params.enable_chat_template;
std::string user_inp = format_chat
? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
: std::move(buffer);


@ -47,7 +47,7 @@ echo PASS
echo
# 3a. Test the requanted model is loading properly
$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --n-predict 32
$MAIN -no-cnv --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --n-predict 32
echo PASS
echo
@ -57,7 +57,7 @@ echo PASS
echo
# 4b. Test the requanted model is loading properly
$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --n-predict 32
$MAIN -no-cnv --model $WORK_PATH/ggml-model-requant-merge.gguf --n-predict 32
echo PASS
echo

Binary file not shown.


@ -37,7 +37,7 @@
<div v-for="conv in conversations" :class="{
'btn btn-ghost justify-start font-normal': true,
'btn-active': conv.id === viewingConvId,
}" @click="setViewingConv(conv.id)">
}" @click="setViewingConv(conv.id)" dir="auto">
<span class="truncate">{{ conv.messages[0].content }}</span>
</div>
<div class="text-center text-xs opacity-40 mt-auto mx-4">
@ -156,6 +156,7 @@
@keydown.enter.shift.exact.prevent="inputMsg += '\n'"
:disabled="isGenerating"
id="msg-input"
dir="auto"
></textarea>
<button v-if="!isGenerating" class="btn btn-primary ml-2" @click="sendMessage" :disabled="inputMsg.length === 0">Send</button>
<button v-else class="btn btn-neutral ml-2" @click="stopGeneration">Stop</button>
@ -248,6 +249,7 @@
<!-- textarea for editing message -->
<template v-if="editingContent !== null">
<textarea
dir="auto"
class="textarea textarea-bordered bg-base-100 text-base-content w-[calc(90vw-8em)] lg:w-96"
v-model="editingContent"></textarea>
<br/>
@ -258,7 +260,9 @@
<!-- show loading dots for pending message -->
<span v-if="msg.content === null" class="loading loading-dots loading-md"></span>
<!-- render message as markdown -->
<vue-markdown v-else :source="msg.content"></vue-markdown>
<div v-else dir="auto">
<vue-markdown :source="msg.content"></vue-markdown>
</div>
<!-- render timings if enabled -->
<div class="dropdown dropdown-hover dropdown-top mt-2" v-if="timings && config.showTokensPerSecond">
<div tabindex="0" role="button" class="cursor-pointer font-semibold text-sm opacity-60">Speed: {{ timings.predicted_per_second.toFixed(1) }} t/s</div>


@ -111,12 +111,12 @@ const VueMarkdown = defineComponent(
highlight: function (str, lang) { // Add highlight.js
if (lang && hljs.getLanguage(lang)) {
try {
return '<pre><code class="hljs">' +
return '<pre dir="auto"><code class="hljs">' +
hljs.highlight(str, { language: lang, ignoreIllegals: true }).value +
'</code></pre>';
} catch (__) {}
}
return '<pre><code class="hljs">' + md.value.utils.escapeHtml(str) + '</code></pre>';
return '<pre dir="auto"><code class="hljs">' + md.value.utils.escapeHtml(str) + '</code></pre>';
}
}));
// support latex with double dollar sign and square brackets


@ -78,3 +78,40 @@ play the audio:
$ aplay output.wav
```
### Running the example with llama-server
Running this example with `llama-server` is also possible and requires two
server instances to be started. One will serve the LLM model and the other
will serve the voice decoder model.
The LLM model server can be started with the following command:
```console
$ ./build/bin/llama-server -m ./models/outetts-0.2-0.5B-q8_0.gguf --port 8020
```
And the voice decoder model server can be started using:
```console
./build/bin/llama-server -m ./models/wavtokenizer-large-75-f16.gguf --port 8021 --embeddings --pooling none
```
Then we can run [tts-outetts.py](tts-outetts.py) to generate the audio.
First create a virtual environment for Python and install the required
dependencies (this is only required to be done once):
```console
$ python3 -m venv venv
$ source venv/bin/activate
(venv) pip install requests numpy
```
And then run the python script using:
```console
(venv) python ./examples/tts/tts-outetts.py http://localhost:8020 http://localhost:8021 "Hello world"
spectrogram generated: n_codes: 90, n_embd: 1282
converting to audio ...
audio generated: 28800 samples
audio written to file "output.wav"
```
And to play the audio we can again use aplay or any other media player:
```console
$ aplay output.wav
```


@ -3,6 +3,121 @@ import sys
#import struct
import requests
import re
import struct
import numpy as np
from concurrent.futures import ThreadPoolExecutor
def fill_hann_window(size, periodic=True):
if periodic:
return np.hanning(size + 1)[:-1]
return np.hanning(size)
def irfft(n_fft, complex_input):
return np.fft.irfft(complex_input, n=n_fft)
def fold(buffer, n_out, n_win, n_hop, n_pad):
result = np.zeros(n_out)
n_frames = len(buffer) // n_win
for i in range(n_frames):
start = i * n_hop
end = start + n_win
result[start:end] += buffer[i * n_win:(i + 1) * n_win]
return result[n_pad:-n_pad] if n_pad > 0 else result
def process_frame(args):
l, n_fft, ST, hann = args
frame = irfft(n_fft, ST[l])
frame = frame * hann
hann2 = hann * hann
return frame, hann2
def embd_to_audio(embd, n_codes, n_embd, n_thread=4):
embd = np.asarray(embd, dtype=np.float32).reshape(n_codes, n_embd)
n_fft = 1280
n_hop = 320
n_win = 1280
n_pad = (n_win - n_hop) // 2
n_out = (n_codes - 1) * n_hop + n_win
hann = fill_hann_window(n_fft, True)
E = np.zeros((n_embd, n_codes), dtype=np.float32)
for l in range(n_codes):
for k in range(n_embd):
E[k, l] = embd[l, k]
half_embd = n_embd // 2
S = np.zeros((n_codes, half_embd + 1), dtype=np.complex64)
for k in range(half_embd):
for l in range(n_codes):
mag = E[k, l]
phi = E[k + half_embd, l]
mag = np.clip(np.exp(mag), 0, 1e2)
S[l, k] = mag * np.exp(1j * phi)
res = np.zeros(n_codes * n_fft)
hann2_buffer = np.zeros(n_codes * n_fft)
with ThreadPoolExecutor(max_workers=n_thread) as executor:
args = [(l, n_fft, S, hann) for l in range(n_codes)]
results = list(executor.map(process_frame, args))
for l, (frame, hann2) in enumerate(results):
res[l*n_fft:(l+1)*n_fft] = frame
hann2_buffer[l*n_fft:(l+1)*n_fft] = hann2
audio = fold(res, n_out, n_win, n_hop, n_pad)
env = fold(hann2_buffer, n_out, n_win, n_hop, n_pad)
mask = env > 1e-10
audio[mask] /= env[mask]
return audio
def save_wav(filename, audio_data, sample_rate):
num_channels = 1
bits_per_sample = 16
bytes_per_sample = bits_per_sample // 8
data_size = len(audio_data) * bytes_per_sample
byte_rate = sample_rate * num_channels * bytes_per_sample
block_align = num_channels * bytes_per_sample
chunk_size = 36 + data_size # 36 = size of header minus first 8 bytes
header = struct.pack(
'<4sI4s4sIHHIIHH4sI',
b'RIFF',
chunk_size,
b'WAVE',
b'fmt ',
16, # fmt chunk size
1, # audio format (PCM)
num_channels,
sample_rate,
byte_rate,
block_align,
bits_per_sample,
b'data',
data_size
)
audio_data = np.clip(audio_data * 32767, -32768, 32767)
pcm_data = audio_data.astype(np.int16)
with open(filename, 'wb') as f:
f.write(header)
f.write(pcm_data.tobytes())
def process_text(text: str):
text = re.sub(r'\d+(\.\d+)?', lambda x: x.group(), text.lower()) # TODO this needs to be fixed
@ -170,6 +285,15 @@ n_embd = len(embd[0])
print('spectrogram generated: n_codes: %d, n_embd: %d' % (n_codes, n_embd))
# post-process the spectrogram to convert to audio
# TODO: see the tts.cpp:embd_to_audio() and implement it in Python
print('converting to audio ...')
print('TODO: see the tts.cpp:embd_to_audio() and implement it in Python')
audio = embd_to_audio(embd, n_codes, n_embd)
print('audio generated: %d samples' % len(audio))
filename = "output.wav"
sample_rate = 24000 # sampling rate
# zero out first 0.25 seconds
audio[:24000 // 4] = 0.0
save_wav(filename, audio, sample_rate)
print('audio written to file "%s"' % filename)


@ -1513,7 +1513,7 @@ extern "C" {
// rotary position embedding backward, i.e compute dx from dy
// a - dy
GGML_API struct ggml_tensor * ggml_rope_back(
GGML_API struct ggml_tensor * ggml_rope_ext_back(
struct ggml_context * ctx,
struct ggml_tensor * a, // gradients of ggml_rope result
struct ggml_tensor * b, // positions
@ -1528,6 +1528,23 @@ extern "C" {
float beta_fast,
float beta_slow);
GGML_API struct ggml_tensor * ggml_rope_multi_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int sections[4],
int mode,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow);
// clamp
// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_clamp(


@ -13712,6 +13712,7 @@ struct ggml_cplan ggml_graph_plan(
} break;
case GGML_OP_SOFT_MAX:
case GGML_OP_ROPE:
case GGML_OP_ROPE_BACK:
{
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
} break;


@ -403,8 +403,6 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
case GGML_OP_MUL_MAT:
return src1->type == GGML_TYPE_F32 || src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
case GGML_OP_ROPE_BACK:
return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
case GGML_OP_IM2COL_BACK:
return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
case GGML_OP_OUT_PROD:


@ -2146,6 +2146,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_ROPE:
ggml_cuda_op_rope(ctx, dst);
break;
case GGML_OP_ROPE_BACK:
ggml_cuda_op_rope_back(ctx, dst);
break;
case GGML_OP_IM2COL:
ggml_cuda_op_im2col(ctx, dst);
break;
@ -2294,119 +2297,8 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
}
#ifdef USE_CUDA_GRAPH
static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
graph_node_properties->node_address = node->data;
graph_node_properties->node_op = node->op;
for (int i = 0; i < GGML_MAX_DIMS; i++) {
graph_node_properties->ne[i] = node->ne[i];
graph_node_properties->nb[i] = node->nb[i];
}
for (int i = 0; i < GGML_MAX_SRC; i++) {
graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
}
memcpy(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS);
}
static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
if (node->data != graph_node_properties->node_address &&
node->op != GGML_OP_CPY &&
node->op != GGML_OP_VIEW) {
return false;
}
if (node->op != graph_node_properties->node_op) {
return false;
}
for (int i = 0; i < GGML_MAX_DIMS; i++) {
if (node->ne[i] != graph_node_properties->ne[i]) {
return false;
}
if (node->nb[i] != graph_node_properties->nb[i]) {
return false;
}
}
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (node->src[i] &&
node->src[i]->data != graph_node_properties->src_address[i] &&
node->op != GGML_OP_CPY &&
node->op != GGML_OP_VIEW
) {
return false;
}
}
if (node->op == GGML_OP_SCALE &&
memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
return false;
}
return true;
}
#endif
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_cuda_set_device(cuda_ctx->device);
#ifdef USE_CUDA_GRAPH
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
// Objects required for CUDA Graph
if (cuda_ctx->cuda_graph == nullptr) {
cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
}
bool use_cuda_graph = true;
bool cuda_graph_update_required = false;
// vector of pointers to CUDA cpy kernels, which are required to identify
// kernel parameters which need updated in the graph for each token
std::vector<void *> ggml_cuda_cpy_fn_ptrs;
if (cuda_ctx->cuda_graph->graph == nullptr) {
if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
#endif
}
}
// Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
// or previous graph capture failure.
// Also disable for multi-gpu for now. TO DO investigate
if (disable_cuda_graphs_due_to_env
|| cuda_ctx->cuda_graph->disable_due_to_gpu_arch
|| cuda_ctx->cuda_graph->disable_due_to_too_many_updates
|| cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
use_cuda_graph = false;
}
if (use_cuda_graph) {
if (cuda_ctx->cuda_graph->instance == nullptr) {
cuda_graph_update_required = true;
}
// Check if the graph size has changed
if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
cuda_graph_update_required = true;
cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
}
// Loop over nodes in GGML graph to determine if CUDA graph update is required
// and store properties to allow this comparison for the next token
for (int i = 0; i < cgraph->n_nodes; i++) {
bool has_matching_properties = true;
if (!cuda_graph_update_required) {
has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
}
if (!has_matching_properties) {
cuda_graph_update_required = true;
}
set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
}
static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool use_cuda_graph) {
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
cuda_ctx->cuda_graph->updated_kernel_arg.clear();
@ -2462,31 +2354,158 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
}
}
// Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
if (use_cuda_graph && cuda_graph_update_required) {
cuda_ctx->cuda_graph->number_consecutive_updates++;
return use_cuda_graph;
}
static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
graph_node_properties->node_address = node->data;
graph_node_properties->node_op = node->op;
for (int i = 0; i < GGML_MAX_DIMS; i++) {
graph_node_properties->ne[i] = node->ne[i];
graph_node_properties->nb[i] = node->nb[i];
}
for (int i = 0; i < GGML_MAX_SRC; i++) {
graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
}
memcpy(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS);
}
static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
if (node->data != graph_node_properties->node_address &&
node->op != GGML_OP_CPY &&
node->op != GGML_OP_VIEW) {
return false;
}
if (node->op != graph_node_properties->node_op) {
return false;
}
for (int i = 0; i < GGML_MAX_DIMS; i++) {
if (node->ne[i] != graph_node_properties->ne[i]) {
return false;
}
if (node->nb[i] != graph_node_properties->nb[i]) {
return false;
}
}
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (node->src[i] &&
node->src[i]->data != graph_node_properties->src_address[i] &&
node->op != GGML_OP_CPY &&
node->op != GGML_OP_VIEW
) {
return false;
}
}
if (node->op == GGML_OP_SCALE &&
memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
return false;
}
return true;
}
static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool cuda_graph_update_required) {
if (cuda_graph_update_required) {
// Extract nodes from graph
// First call with null argument gets number of nodes in graph
CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
// Subsequent call with non-null argument gets nodes
cuda_ctx->cuda_graph->nodes.clear();
cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
cuda_ctx->cuda_graph->params.clear();
cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
if (cuda_ctx->cuda_graph->num_nodes > 0) {
CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
// Loop over nodes, and extract kernel parameters from each node
for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
cudaGraphNodeType node_type;
CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
if (node_type == cudaGraphNodeTypeKernel) {
cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
if (stat == cudaErrorInvalidDeviceFunction) {
// Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
// We don't need to update blas nodes, so clear error and move on.
cudaGetLastError();
} else {
cuda_ctx->cuda_graph->number_consecutive_updates = 0;
GGML_ASSERT(stat == cudaSuccess);
}
}
}
}
} else {
// One of the arguments to the copy kernel is updated for each token, hence we need to
// replace that argument with the updated value in the CUDA graph
// on update steps, the live parameters will already be captured
int k = 0;
for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
}
}
if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
#endif
}
}
if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
}
static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {
#else
bool use_cuda_graph = false;
bool cuda_graph_update_required = false;
#endif // USE_CUDA_GRAPH
bool graph_evaluated_or_captured = false;
if (cuda_ctx->cuda_graph->instance == nullptr) {
cuda_graph_update_required = true;
}
// Check if the graph size has changed
if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
cuda_graph_update_required = true;
cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
}
// Loop over nodes in GGML graph to determine if CUDA graph update is required
// and store properties to allow this comparison for the next token
for (int i = 0; i < cgraph->n_nodes; i++) {
bool has_matching_properties = true;
if (!cuda_graph_update_required) {
has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
}
if (!has_matching_properties) {
cuda_graph_update_required = true;
}
set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
}
return cuda_graph_update_required;
}
static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
cudaGraphExecUpdateResultInfo result_info;
cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
if (stat == cudaErrorGraphExecUpdateFailure) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
#endif
// The pre-existing graph exec cannot be updated due to violated constraints
// so instead clear error and re-instantiate
cudaGetLastError();
CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
cuda_ctx->cuda_graph->instance = nullptr;
CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
} else {
GGML_ASSERT(stat == cudaSuccess);
}
}
#endif
static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
[[maybe_unused]] std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool & graph_evaluated_or_captured, bool & use_cuda_graph,
bool & cuda_graph_update_required) {
while (!graph_evaluated_or_captured) {
// Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
@ -2524,19 +2543,8 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph));
cuda_ctx->cuda_graph->graph = nullptr;
}
CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
#if 0
if (disable_cuda_graphs_due_to_failed_capture) {
use_cuda_graph = false;
cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
#endif
} else {
graph_evaluated_or_captured = true; // CUDA graph has been captured
}
#endif
CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
graph_evaluated_or_captured = true; // CUDA graph has been captured
} else {
graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
@ -2549,72 +2557,91 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
}
// Perform update to graph (if required for this token), and change copy parameter (required for every token)
if (cuda_graph_update_required) {
// Extract nodes from graph
// First call with null argument gets number of nodes in graph
CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
// Subsequent call with non-null argument gets nodes
cuda_ctx->cuda_graph->nodes.clear();
cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
cuda_ctx->cuda_graph->params.clear();
cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
if (cuda_ctx->cuda_graph->num_nodes > 0) {
CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
// Loop over nodes, and extract kernel parameters from each node
for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
cudaGraphNodeType node_type;
CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
if (node_type == cudaGraphNodeTypeKernel) {
cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
if (stat == cudaErrorInvalidDeviceFunction) {
// Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
// We don't need to update blas nodes, so clear error and move on.
cudaGetLastError();
} else {
GGML_ASSERT(stat == cudaSuccess);
}
}
}
}
}
// One of the arguments to the copy kernel is updated for each token, hence we need to
// replace that argument with the updated value in the CUDA graph
if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
int k = 0;
for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
}
}
}
maintain_cuda_graph(cuda_ctx, ggml_cuda_cpy_fn_ptrs, cuda_graph_update_required);
// Update graph executable
cudaGraphExecUpdateResultInfo result_info;
cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
if (stat == cudaErrorGraphExecUpdateFailure) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
#endif
// The pre-existing graph exec cannot be updated due to violated constraints
// so instead clear error and re-instantiate
cudaGetLastError();
CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
cuda_ctx->cuda_graph->instance = nullptr;
CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
} else {
GGML_ASSERT(stat == cudaSuccess);
}
update_cuda_graph_executable(cuda_ctx);
// Launch graph
CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
#else
graph_evaluated_or_captured = true;
#endif // USE_CUDA_GRAPH
}
}
static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_cuda_set_device(cuda_ctx->device);
// vector of pointers to CUDA cpy kernels, which are required to identify
// kernel parameters which need updated in the graph for each token
std::vector<void *> ggml_cuda_cpy_fn_ptrs;
#ifdef USE_CUDA_GRAPH
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
// Objects required for CUDA Graph
if (cuda_ctx->cuda_graph == nullptr) {
cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
}
bool use_cuda_graph = true;
bool cuda_graph_update_required = false;
if (cuda_ctx->cuda_graph->graph == nullptr) {
if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
#endif
}
}
// Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
// or previous graph capture failure.
// Also disable for multi-gpu for now. TO DO investigate
if (disable_cuda_graphs_due_to_env
|| cuda_ctx->cuda_graph->disable_due_to_gpu_arch
|| cuda_ctx->cuda_graph->disable_due_to_too_many_updates
|| cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
use_cuda_graph = false;
}
if (use_cuda_graph) {
cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph,
ggml_cuda_cpy_fn_ptrs, use_cuda_graph);
// Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
if (use_cuda_graph && cuda_graph_update_required) {
cuda_ctx->cuda_graph->number_consecutive_updates++;
} else {
cuda_ctx->cuda_graph->number_consecutive_updates = 0;
}
if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
#endif
}
}
if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
}
#else
bool use_cuda_graph = false;
bool cuda_graph_update_required = false;
#endif // USE_CUDA_GRAPH
bool graph_evaluated_or_captured = false;
evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, ggml_cuda_cpy_fn_ptrs, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
return GGML_STATUS_SUCCESS;
}
@ -3006,7 +3033,11 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_SOFT_MAX:
return true;
case GGML_OP_ROPE:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_ROPE_BACK: {
const size_t ts = ggml_type_size(op->src[0]->type);
const int64_t ne0_012 = op->src[0]->ne[0] * op->src[0]->ne[1] * op->src[0]->ne[2];
return op->src[0]->nb[0] == ts && op->src[0]->nb[3] == ne0_012*ts;
}
case GGML_OP_IM2COL:
case GGML_OP_POOL_2D:
case GGML_OP_SUM:
@ -3062,6 +3093,7 @@ static int64_t get_op_batch_size(const ggml_tensor * op) {
return op->ne[1];
case GGML_OP_MUL_MAT_ID:
case GGML_OP_ROPE:
case GGML_OP_ROPE_BACK:
return op->ne[2];
default:
return ggml_nrows(op);


@ -16,9 +16,10 @@ static __device__ float rope_yarn_ramp(const float low, const float high, const
// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
template<bool forward>
static __device__ void rope_yarn(
float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
float * cos_theta, float * sin_theta) {
const float theta_extrap, const float freq_scale, const rope_corr_dims corr_dims, const int64_t i0, const float ext_factor,
float mscale, float & cos_theta, float & sin_theta) {
// Get n-d rotational scaling corrected for extrapolation
float theta_interp = freq_scale * theta_extrap;
float theta = theta_interp;
@ -29,24 +30,28 @@ static __device__ void rope_yarn(
// Get n-d magnitude scaling corrected for interpolation
mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
}
*cos_theta = cosf(theta) * mscale;
*sin_theta = sinf(theta) * mscale;
cos_theta = cosf(theta) * mscale;
sin_theta = sinf(theta) * mscale;
if (!forward) {
sin_theta *= -1.0f;
}
}
template<typename T, bool has_ff>
template<bool forward, bool has_ff, typename T>
static __global__ void rope_norm(
const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors) {
const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims,
const int32_t * __restrict__ pos, const float freq_scale, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float theta_scale, const float * __restrict__ freq_factors) {
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (i0 >= ne0) {
return;
}
const int row = blockDim.x*blockIdx.x + threadIdx.x;
const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
if (i0 >= n_dims) {
const int i = row*ne0 + i0;
const int i = row_dst*ne0 + i0;
dst[i + 0] = x[i + 0];
dst[i + 1] = x[i + 1];
@ -54,39 +59,43 @@ static __global__ void rope_norm(
return;
}
const int i = row*ne0 + i0;
const int i2 = row/p_delta_rows;
const int row_x = row_dst % ne1;
const int channel_x = row_dst / ne1;
const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
const int idst = row_dst*ne0 + i0;
const int ix = channel_x*s2 + row_x*s1 + i0;
const float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
float cos_theta;
float sin_theta;
rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
rope_yarn<forward>(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, cos_theta, sin_theta);
const float x0 = x[i + 0];
const float x1 = x[i + 1];
const float x0 = x[ix + 0];
const float x1 = x[ix + 1];
dst[i + 0] = x0*cos_theta - x1*sin_theta;
dst[i + 1] = x0*sin_theta + x1*cos_theta;
dst[idst + 0] = x0*cos_theta - x1*sin_theta;
dst[idst + 1] = x0*sin_theta + x1*cos_theta;
}
template<typename T, bool has_ff>
template<bool forward, bool has_ff, typename T>
static __global__ void rope_neox(
const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors) {
const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims,
const int32_t * __restrict__ pos, const float freq_scale, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float theta_scale, const float * __restrict__ freq_factors) {
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (i0 >= ne0) {
return;
}
const int row = blockDim.x*blockIdx.x + threadIdx.x;
const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
if (i0 >= n_dims) {
const int i = row*ne0 + i0;
const int i = row_dst*ne0 + i0;
dst[i + 0] = x[i + 0];
dst[i + 1] = x[i + 1];
@ -94,39 +103,43 @@ static __global__ void rope_neox(
return;
}
const int i = row*ne0 + i0/2;
const int i2 = row/p_delta_rows;
const int row_x = row_dst % ne1;
const int channel_x = row_dst / ne1;
const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
const int idst = row_dst*ne0 + i0/2;
const int ix = channel_x*s2 + row_x*s1 + i0/2;
const float theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
float cos_theta;
float sin_theta;
rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
rope_yarn<forward>(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, cos_theta, sin_theta);
const float x0 = x[i + 0];
const float x1 = x[i + n_dims/2];
const float x0 = x[ix + 0];
const float x1 = x[ix + n_dims/2];
dst[i + 0] = x0*cos_theta - x1*sin_theta;
dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
dst[idst + 0] = x0*cos_theta - x1*sin_theta;
dst[idst + n_dims/2] = x0*sin_theta + x1*cos_theta;
}
template<typename T, bool has_ff>
template<bool forward, bool has_ff, typename T>
static __global__ void rope_multi(
const T * x, T * dst, int ne0, int ne2, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, mrope_sections sections) {
const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2,
const int n_dims, const int32_t * __restrict__ pos, const float freq_scale, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float theta_scale, const float * __restrict__ freq_factors, const mrope_sections sections) {
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (i0 >= ne0) {
return;
}
const int row = blockDim.x*blockIdx.x + threadIdx.x;
const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
if (i0 >= n_dims) {
const int i = row*ne0 + i0;
const int i = row_dst*ne0 + i0;
dst[i + 0] = x[i + 0];
dst[i + 1] = x[i + 1];
@ -134,25 +147,28 @@ static __global__ void rope_multi(
return;
}
const int i = row*ne0 + i0/2;
const int i2 = row/p_delta_rows;
const int row_x = row_dst % ne1;
const int channel_x = row_dst / ne1;
int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3];
int sec_w = sections.v[1] + sections.v[0];
int sector = (i0 / 2) % sect_dims;
const int idst = row_dst*ne0 + i0/2;
const int ix = channel_x*s2 + row_x*s1 + i0/2;
const int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3];
const int sec_w = sections.v[1] + sections.v[0];
const int sector = (i0 / 2) % sect_dims;
float theta_base = 0.0;
if (sector < sections.v[0]) {
theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
}
else if (sector >= sections.v[0] && sector < sec_w) {
theta_base = pos[i2 + ne2 * 1]*powf(theta_scale, i0/2.0f);
theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f);
}
else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
theta_base = pos[i2 + ne2 * 2]*powf(theta_scale, i0/2.0f);
theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f);
}
else if (sector >= sec_w + sections.v[2]) {
theta_base = pos[i2 + ne2 * 3]*powf(theta_scale, i0/2.0f);
theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f);
}
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
@ -160,42 +176,46 @@ static __global__ void rope_multi(
float cos_theta;
float sin_theta;
rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
rope_yarn<forward>(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, cos_theta, sin_theta);
const float x0 = x[i + 0];
const float x1 = x[i + n_dims/2];
const float x0 = x[ix + 0];
const float x1 = x[ix + n_dims/2];
dst[i + 0] = x0*cos_theta - x1*sin_theta;
dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
dst[idst + 0] = x0*cos_theta - x1*sin_theta;
dst[idst + n_dims/2] = x0*sin_theta + x1*cos_theta;
}
template<typename T, bool has_ff>
template<bool forward, bool has_ff, typename T>
static __global__ void rope_vision(
const T * x, T * dst, int ne0, int ne2, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, mrope_sections sections) {
const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims,
const int32_t * __restrict__ pos, const float freq_scale, const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims,
const float theta_scale, const float * __restrict__ freq_factors, const mrope_sections sections) {
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (i0 >= ne0) {
return;
}
const int row = blockDim.x*blockIdx.x + threadIdx.x;
const int row_dst = blockDim.x*blockIdx.x + threadIdx.x;
const int i = row*ne0 + i0/2;
const int i2 = row/p_delta_rows; // i2-th tokens
const int row_x = row_dst % ne1;
const int channel_x = row_dst / ne1;
int sect_dims = sections.v[0] + sections.v[1];
int sec_w = sections.v[1] + sections.v[0];
int sector = (i0 / 2) % sect_dims;
const int idst = row_dst*ne0 + i0/2;
const int ix = channel_x*s2 + row_x*s1 + i0/2;
const int sect_dims = sections.v[0] + sections.v[1];
const int sec_w = sections.v[1] + sections.v[0];
const int sector = (i0 / 2) % sect_dims;
float theta_base = 0.0;
if (sector < sections.v[0]) {
const int p = sector;
theta_base = pos[i2]*powf(theta_scale, p);
theta_base = pos[channel_x]*powf(theta_scale, p);
}
else if (sector >= sections.v[0] && sector < sec_w) {
const int p = sector - sections.v[0];
theta_base = pos[i2 + ne2]*powf(theta_scale, p);
theta_base = pos[channel_x + ne2]*powf(theta_scale, p);
}
const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
@ -203,19 +223,20 @@ static __global__ void rope_vision(
float cos_theta;
float sin_theta;
rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
rope_yarn<forward>(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, cos_theta, sin_theta);
const float x0 = x[i + 0];
const float x1 = x[i + n_dims];
const float x0 = x[ix + 0];
const float x1 = x[ix + n_dims];
dst[i + 0] = x0*cos_theta - x1*sin_theta;
dst[i + n_dims] = x0*sin_theta + x1*cos_theta;
dst[idst + 0] = x0*cos_theta - x1*sin_theta;
dst[idst + n_dims] = x0*sin_theta + x1*cos_theta;
}
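Note: the rewritten kernels derive (row_x, channel_x) from the flattened destination row and read the source through explicit element strides s1/s2, so a non-contiguous src0 no longer needs to be made contiguous first. A minimal plain-C++ sketch of that index arithmetic (the strides and sizes below are made-up example values; the variable names mirror the rope_multi/rope_vision kernels above):

#include <cstdio>

int main() {
    const int ne0 = 4, ne1 = 3;        // columns, rows per channel
    const int s1 = 6, s2 = 20;         // source element strides (padded layout)

    const int row_dst = 7;             // flattened (channel, row) index
    const int i0      = 2;             // column index (even, as in the kernels)

    const int row_x     = row_dst % ne1;
    const int channel_x = row_dst / ne1;

    const int ix   = channel_x*s2 + row_x*s1 + i0/2;   // strided source element
    const int idst = row_dst*ne0 + i0/2;               // contiguous destination

    std::printf("row_x=%d channel_x=%d ix=%d idst=%d\n", row_x, channel_x, ix, idst);
    return 0;
}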
template<typename T>
template<bool forward, typename T>
static void rope_norm_cuda(
const T * x, T * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, const int nr,
const int32_t * __restrict__ pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float * __restrict__ freq_factors, cudaStream_t stream) {
GGML_ASSERT(ne0 % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
@ -224,22 +245,21 @@ static void rope_norm_cuda(
const float theta_scale = powf(freq_base, -2.0f/n_dims);
if (freq_factors == nullptr) {
rope_norm<T, false><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, freq_factors
);
rope_norm<forward, false><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
attn_factor, corr_dims, theta_scale, freq_factors);
} else {
rope_norm<T, true><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, freq_factors
);
rope_norm<forward, true><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
attn_factor, corr_dims, theta_scale, freq_factors);
}
}
template<typename T>
template<bool forward, typename T>
static void rope_neox_cuda(
const T * x, T * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, const int nr,
const int32_t * __restrict__ pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float * __restrict__ freq_factors, cudaStream_t stream) {
GGML_ASSERT(ne0 % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
@ -248,22 +268,21 @@ static void rope_neox_cuda(
const float theta_scale = powf(freq_base, -2.0f/n_dims);
if (freq_factors == nullptr) {
rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, freq_factors
);
rope_neox<forward, false, T><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
attn_factor, corr_dims, theta_scale, freq_factors);
} else {
rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, freq_factors
);
rope_neox<forward, true, T><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
attn_factor, corr_dims, theta_scale, freq_factors);
}
}
template<typename T>
template<bool forward, typename T>
static void rope_multi_cuda(
const T * x, T * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) {
const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr,
const int32_t * __restrict__ pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float * __restrict__ freq_factors, const mrope_sections sections, cudaStream_t stream) {
GGML_ASSERT(ne0 % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
@ -272,22 +291,21 @@ static void rope_multi_cuda(
const float theta_scale = powf(freq_base, -2.0f/n_dims);
if (freq_factors == nullptr) {
rope_multi<T, false><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, freq_factors, sections
);
rope_multi<forward, false, T><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
attn_factor, corr_dims, theta_scale, freq_factors, sections);
} else {
rope_multi<T, true><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, freq_factors, sections
);
rope_multi<forward, true, T><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
attn_factor, corr_dims, theta_scale, freq_factors, sections);
}
}
template<typename T>
template<bool forward, typename T>
static void rope_vision_cuda(
const T * x, T * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) {
const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr,
const int32_t * __restrict__ pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float * __restrict__ freq_factors, const mrope_sections sections, cudaStream_t stream) {
GGML_ASSERT(ne0 % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
@ -298,80 +316,18 @@ static void rope_vision_cuda(
const float theta_scale = powf(freq_base, -2.0f/n_dims);
if (freq_factors == nullptr) {
rope_vision<T, false><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, freq_factors, sections
);
rope_vision<forward, false, T><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
attn_factor, corr_dims, theta_scale, freq_factors, sections);
} else {
rope_vision<T, true><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
theta_scale, freq_factors, sections
);
rope_vision<forward, true, T><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
attn_factor, corr_dims, theta_scale, freq_factors, sections);
}
}
static void rope_norm_cuda_f16(
const half * x, half * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
rope_norm_cuda<half>(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
}
static void rope_norm_cuda_f32(
const float * x, float * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
rope_norm_cuda<float>(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
}
static void rope_neox_cuda_f16(
const half * x, half * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
rope_neox_cuda<half>(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
}
static void rope_neox_cuda_f32(
const float * x, float * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream
) {
rope_neox_cuda<float>(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
}
static void rope_multi_cuda_f16(
const half * x, half * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
) {
rope_multi_cuda<half>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
}
static void rope_multi_cuda_f32(
const float * x, float * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
) {
rope_multi_cuda<float>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
}
static void rope_vision_cuda_f16(
const half * x, half * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
) {
rope_vision_cuda<half>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
}
static void rope_vision_cuda_f32(
const float * x, float * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream
) {
rope_vision_cuda<float>(x, dst, ne0, ne2, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
}
void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
template <bool forward>
void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
const ggml_tensor * src2 = dst->src[2];
@ -382,7 +338,6 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
GGML_ASSERT(src0->type == dst->type);
@ -392,6 +347,9 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int64_t ne02 = src0->ne[2]; // num heads
const int64_t nr = ggml_nrows(src0);
const size_t s01 = src0->nb[1] / ggml_type_size(src0->type);
const size_t s02 = src0->nb[2] / ggml_type_size(src0->type);
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2];
@ -440,59 +398,59 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
// compute
if (is_neox) {
if (src0->type == GGML_TYPE_F32) {
rope_neox_cuda_f32(
(const float *)src0_d, (float *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, freq_factors, stream
);
rope_neox_cuda<forward>(
(const float *) src0_d, (float *) dst_d, ne00, ne01, s01, s02, n_dims, nr, pos, freq_scale,
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
} else if (src0->type == GGML_TYPE_F16) {
rope_neox_cuda_f16(
(const half *)src0_d, (half *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, freq_factors, stream
);
rope_neox_cuda<forward>(
(const half *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims, nr, pos, freq_scale,
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
} else {
GGML_ABORT("fatal error");
}
} else if (is_mrope && !is_vision) {
if (src0->type == GGML_TYPE_F32) {
rope_multi_cuda_f32(
(const float *)src0_d, (float *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, freq_factors, sections, stream
);
rope_multi_cuda<forward>(
(const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
} else if (src0->type == GGML_TYPE_F16) {
rope_multi_cuda_f16(
(const half *)src0_d, (half *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, freq_factors, sections, stream
);
rope_multi_cuda<forward>(
(const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
} else {
GGML_ABORT("fatal error");
}
} else if (is_vision) {
if (src0->type == GGML_TYPE_F32) {
rope_vision_cuda_f32(
(const float *)src0_d, (float *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, freq_factors, sections, stream
);
rope_vision_cuda<forward>(
(const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
} else if (src0->type == GGML_TYPE_F16) {
rope_vision_cuda_f16(
(const half *)src0_d, (half *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, freq_factors, sections, stream
);
rope_vision_cuda<forward>(
(const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
} else {
GGML_ABORT("fatal error");
}
} else {
if (src0->type == GGML_TYPE_F32) {
rope_norm_cuda_f32(
(const float *)src0_d, (float *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, freq_factors, stream
);
rope_norm_cuda<forward>(
(const float *) src0_d, (float *) dst_d, ne00, ne01, s01, s02, n_dims, nr, pos, freq_scale,
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
} else if (src0->type == GGML_TYPE_F16) {
rope_norm_cuda_f16(
(const half *)src0_d, (half *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
attn_factor, corr_dims, freq_factors, stream
);
rope_norm_cuda<forward>(
(const half *) src0_d, (half *) dst_d, ne00, ne01, s01, s02, n_dims, nr, pos, freq_scale,
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream);
} else {
GGML_ABORT("fatal error");
}
}
}
void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_op_rope_impl<true>(ctx, dst);
}
void ggml_cuda_op_rope_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_op_rope_impl<false>(ctx, dst);
}
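Note: the backward op now reuses the same templated path, with the forward flag forwarded down to rope_yarn<forward>. The rope_yarn body is not shown in this diff; assuming the backward pass simply applies the inverse rotation (same angle, sine negated), the round trip behaves like this standalone C++ sketch (an illustration, not the actual kernel code):

#include <cmath>
#include <cstdio>

template <bool forward>
static void rope_pair(float x0, float x1, float theta, float & y0, float & y1) {
    const float cos_t = std::cos(theta);
    // assumption: backward = rotate by -theta, so only the sine changes sign
    const float sin_t = forward ? std::sin(theta) : -std::sin(theta);
    y0 = x0*cos_t - x1*sin_t;
    y1 = x0*sin_t + x1*cos_t;
}

int main() {
    const float x0 = 0.3f, x1 = -1.2f, theta = 0.7f;
    float r0, r1, b0, b1;
    rope_pair<true >(x0, x1, theta, r0, r1);   // forward rotation
    rope_pair<false>(r0, r1, theta, b0, b1);   // backward recovers the input
    std::printf("%f %f\n", b0, b1);            // ~0.300000 -1.200000
    return 0;
}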

View file

@ -3,3 +3,5 @@
#define CUDA_ROPE_BLOCK_SIZE 256
void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_rope_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View file

@ -29,5 +29,6 @@
#include "wkv6.hpp"
#include "outprod.hpp"
#include "element_wise.hpp"
#include "gla.hpp"
#endif // GGML_SYCL_BACKEND_HPP

View file

@ -4040,6 +4040,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
case GGML_OP_RWKV_WKV6:
ggml_sycl_op_rwkv_wkv6(ctx, dst);
break;
case GGML_OP_GATED_LINEAR_ATTN:
ggml_sycl_op_gated_linear_attn(ctx, dst);
break;
default:
return false;
}
@ -4507,6 +4510,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_LEAKY_RELU:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_RWKV_WKV6:
case GGML_OP_GATED_LINEAR_ATTN:
return true;
default:
return false;

ggml/src/ggml-sycl/gla.cpp Normal file
View file

@ -0,0 +1,105 @@
#include <sycl/sycl.hpp>
#include "common.hpp"
template <u_int HEAD_SIZE>
static void gated_linear_attn_f32_kernel(const dpct::queue_ptr stream, u_int B, u_int T, u_int C, u_int H, float scale,
const float * k, const float * v, const float * r, const float * td,
const float * s, float * dst) {
const u_int head_size = HEAD_SIZE;
const u_int state_size = C * head_size;
const u_int n_seq_tokens = T / B;
sycl::range<1> block_dims((C / H));
sycl::range<1> grid_dims((B * H));
stream->submit([&](sycl::handler & cgh) {
/* local memory accessors*/
auto _k = sycl::local_accessor<float, 1>(sycl::range<1>(head_size), cgh);
auto _r = sycl::local_accessor<float, 1>(sycl::range<1>(head_size), cgh);
auto _td = sycl::local_accessor<float, 1>(sycl::range<1>(head_size), cgh);
cgh.parallel_for(sycl::nd_range<1>(grid_dims * block_dims, block_dims), [=](sycl::nd_item<1> item) {
u_int tid = item.get_local_id(0);
u_int bid = item.get_group(0);
u_int batch_i = bid / H;
u_int head_i = bid % H;
float state[head_size];
#pragma unroll
for (u_int i = 0; i < head_size; i++) {
state[i] = s[batch_i * state_size + head_i * head_size * head_size + i * head_size + tid];
}
for (u_int t = batch_i * n_seq_tokens * C + head_i * head_size + tid;
t < (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid; t += C) {
item.barrier(sycl::access::fence_space::local_space); //sync threads
_k[tid] = k[t];
_r[tid] = r[t];
_td[tid] = td[t];
item.barrier(sycl::access::fence_space::local_space); //sync threads
const float _v = v[t];
float y = 0;
for (u_int j = 0; j < head_size; j += 4) {
const sycl::float4 & k = (sycl::float4 &) (_k[j]);
const sycl::float4 & r = (sycl::float4 &) (_r[j]);
const sycl::float4 & td = (sycl::float4 &) (_td[j]);
sycl::float4 & s = (sycl::float4 &) (state[j]);
sycl::float4 kv;
kv.x() = k.x() * _v;
kv.y() = k.y() * _v;
kv.z() = k.z() * _v;
kv.w() = k.w() * _v;
s.x() = s.x() * td.x() + kv.x();
s.y() = s.y() * td.y() + kv.y();
s.z() = s.z() * td.z() + kv.z();
s.w() = s.w() * td.w() + kv.w();
y += r.x() * s.x();
y += r.y() * s.y();
y += r.z() * s.z();
y += r.w() * s.w();
}
dst[t] = y * scale;
}
#pragma unroll
for (u_int i = 0; i < head_size; i++) {
dst[T * C + batch_i * state_size + head_i * head_size * head_size + i * head_size + tid] = state[i];
}
});
});
}
void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
const float * k_d = static_cast<const float *>(dst->src[0]->data);
const float * v_d = static_cast<const float *>(dst->src[1]->data);
const float * r_d = static_cast<const float *>(dst->src[2]->data);
const float * td_d = static_cast<const float *>(dst->src[3]->data);
const float * s_d = static_cast<const float *>(dst->src[4]->data);
const int64_t B = dst->src[4]->ne[1];
const int64_t T = dst->src[0]->ne[2];
const int64_t C = dst->ne[0];
const int64_t H = dst->src[0]->ne[1];
dpct::queue_ptr stream = ctx.stream();
GGML_ASSERT(dst->src[4]->type == GGML_TYPE_F32);
GGML_ASSERT(C % H == 0);
GGML_ASSERT(C / H == 64 || C / H == 128);
float scale;
memcpy(&scale, dst->op_params, sizeof(float));
float * dst_d = (float *) dst->data;
if (C / H == 64) {
gated_linear_attn_f32_kernel<64>(stream, B, T, C, H, scale, k_d, v_d, r_d, td_d, s_d, dst_d);
} else {
gated_linear_attn_f32_kernel<128>(stream, B, T, C, H, scale, k_d, v_d, r_d, td_d, s_d, dst_d);
}
}
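Note: per (batch, head) the kernel keeps a head_size x head_size state that decays row-wise by td and accumulates the outer product k * v^T, and the output is r^T * S scaled by scale, with each work-item owning one state column. A CPU reference sketch re-derived from the kernel body above (an interpretation of the code, not ggml's reference implementation):

#include <vector>
#include <cstdio>

static void gla_head_step(std::vector<float> & S, int head_size,
                          const float * k, const float * v, const float * r,
                          const float * td, float scale, float * y) {
    for (int c = 0; c < head_size; ++c) {           // one "column" per work-item
        float acc = 0.0f;
        for (int j = 0; j < head_size; ++j) {
            float & s = S[j*head_size + c];
            s = s*td[j] + k[j]*v[c];                 // per-row decay + outer-product update
            acc += r[j]*s;
        }
        y[c] = acc*scale;
    }
}

int main() {
    const int hs = 4;
    std::vector<float> S(hs*hs, 0.0f), y(hs);
    std::vector<float> k(hs, 0.5f), v(hs, 1.0f), r(hs, 1.0f), td(hs, 0.9f);
    gla_head_step(S, hs, k.data(), v.data(), r.data(), td.data(), 1.0f, y.data());
    std::printf("%f\n", y[0]); // 2.0 after one step: 4 * (0.5 * 1.0)
    return 0;
}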

View file

@ -0,0 +1,8 @@
#ifndef GGML_SYCL_GLA_HPP
#define GGML_SYCL_GLA_HPP
#include "common.hpp"
void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
#endif // GGML_SYCL_GLA_HPP

File diff suppressed because it is too large

View file

@ -1683,62 +1683,62 @@ const uint64_t get_rows_q8_0_len = 3704;
extern unsigned char get_rows_q8_0_f32_data[3688];
const uint64_t get_rows_q8_0_f32_len = 3688;
extern unsigned char mul_mat_vec_q2_k_f32_f32_data[19540];
const uint64_t mul_mat_vec_q2_k_f32_f32_len = 19540;
extern unsigned char mul_mat_vec_q2_k_f32_f32_data[29788];
const uint64_t mul_mat_vec_q2_k_f32_f32_len = 29788;
extern unsigned char mul_mat_vec_q2_k_f16_f32_data[19780];
const uint64_t mul_mat_vec_q2_k_f16_f32_len = 19780;
extern unsigned char mul_mat_vec_q2_k_f16_f32_data[30284];
const uint64_t mul_mat_vec_q2_k_f16_f32_len = 30284;
extern unsigned char mul_mat_vec_id_q2_k_f32_data[19276];
const uint64_t mul_mat_vec_id_q2_k_f32_len = 19276;
extern unsigned char mul_mat_vec_id_q2_k_f32_data[29556];
const uint64_t mul_mat_vec_id_q2_k_f32_len = 29556;
extern unsigned char dequant_q2_k_data[3960];
const uint64_t dequant_q2_k_len = 3960;
extern unsigned char mul_mat_vec_q3_k_f32_f32_data[26868];
const uint64_t mul_mat_vec_q3_k_f32_f32_len = 26868;
extern unsigned char mul_mat_vec_q3_k_f32_f32_data[32520];
const uint64_t mul_mat_vec_q3_k_f32_f32_len = 32520;
extern unsigned char mul_mat_vec_q3_k_f16_f32_data[27140];
const uint64_t mul_mat_vec_q3_k_f16_f32_len = 27140;
extern unsigned char mul_mat_vec_q3_k_f16_f32_data[33048];
const uint64_t mul_mat_vec_q3_k_f16_f32_len = 33048;
extern unsigned char mul_mat_vec_id_q3_k_f32_data[26604];
const uint64_t mul_mat_vec_id_q3_k_f32_len = 26604;
extern unsigned char mul_mat_vec_id_q3_k_f32_data[32304];
const uint64_t mul_mat_vec_id_q3_k_f32_len = 32304;
extern unsigned char dequant_q3_k_data[4828];
const uint64_t dequant_q3_k_len = 4828;
extern unsigned char mul_mat_vec_q4_k_f32_f32_data[18444];
const uint64_t mul_mat_vec_q4_k_f32_f32_len = 18444;
extern unsigned char mul_mat_vec_q4_k_f32_f32_data[17156];
const uint64_t mul_mat_vec_q4_k_f32_f32_len = 17156;
extern unsigned char mul_mat_vec_q4_k_f16_f32_data[18588];
const uint64_t mul_mat_vec_q4_k_f16_f32_len = 18588;
extern unsigned char mul_mat_vec_q4_k_f16_f32_data[17300];
const uint64_t mul_mat_vec_q4_k_f16_f32_len = 17300;
extern unsigned char mul_mat_vec_id_q4_k_f32_data[18180];
const uint64_t mul_mat_vec_id_q4_k_f32_len = 18180;
extern unsigned char mul_mat_vec_id_q4_k_f32_data[16892];
const uint64_t mul_mat_vec_id_q4_k_f32_len = 16892;
extern unsigned char dequant_q4_k_data[5984];
const uint64_t dequant_q4_k_len = 5984;
extern unsigned char mul_mat_vec_q5_k_f32_f32_data[19988];
const uint64_t mul_mat_vec_q5_k_f32_f32_len = 19988;
extern unsigned char mul_mat_vec_q5_k_f32_f32_data[18660];
const uint64_t mul_mat_vec_q5_k_f32_f32_len = 18660;
extern unsigned char mul_mat_vec_q5_k_f16_f32_data[20228];
const uint64_t mul_mat_vec_q5_k_f16_f32_len = 20228;
extern unsigned char mul_mat_vec_q5_k_f16_f32_data[18900];
const uint64_t mul_mat_vec_q5_k_f16_f32_len = 18900;
extern unsigned char mul_mat_vec_id_q5_k_f32_data[19724];
const uint64_t mul_mat_vec_id_q5_k_f32_len = 19724;
extern unsigned char mul_mat_vec_id_q5_k_f32_data[18396];
const uint64_t mul_mat_vec_id_q5_k_f32_len = 18396;
extern unsigned char dequant_q5_k_data[6032];
const uint64_t dequant_q5_k_len = 6032;
extern unsigned char mul_mat_vec_q6_k_f32_f32_data[19772];
const uint64_t mul_mat_vec_q6_k_f32_f32_len = 19772;
extern unsigned char mul_mat_vec_q6_k_f32_f32_data[24812];
const uint64_t mul_mat_vec_q6_k_f32_f32_len = 24812;
extern unsigned char mul_mat_vec_q6_k_f16_f32_data[19916];
const uint64_t mul_mat_vec_q6_k_f16_f32_len = 19916;
extern unsigned char mul_mat_vec_q6_k_f16_f32_data[25084];
const uint64_t mul_mat_vec_q6_k_f16_f32_len = 25084;
extern unsigned char mul_mat_vec_id_q6_k_f32_data[19508];
const uint64_t mul_mat_vec_id_q6_k_f32_len = 19508;
extern unsigned char mul_mat_vec_id_q6_k_f32_data[24580];
const uint64_t mul_mat_vec_id_q6_k_f32_len = 24580;
extern unsigned char dequant_q6_k_data[4264];
const uint64_t dequant_q6_k_len = 4264;

View file

@ -0,0 +1,15 @@
set(CMAKE_BUILD_TYPE Release)
set(CMAKE_C_FLAGS -O2)
set(CMAKE_CXX_FLAGS -O2)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER)
set(CMAKE_C_COMPILER @HOST_C_COMPILER@)
set(CMAKE_CXX_COMPILER @HOST_CXX_COMPILER@)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY @CMAKE_RUNTIME_OUTPUT_DIRECTORY@)
if("@CMAKE_C_COMPILER_ID@" STREQUAL "MSVC")
foreach(CONFIG IN ITEMS DEBUG RELEASE MINSIZEREL RELWITHDEBINFO)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_${CONFIG} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
endforeach()
endif()

View file

@ -5,63 +5,45 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset);
const uint num_blocks_per_row = p.ncols / QUANT_K;
// 16 threads are used to process each block
const uint it_size = gl_WorkGroupSize.x/16;
const uint tid = gl_LocalInvocationID.x;
const uint itid = tid%16; // 0...16
const uint ix = tid/16;
const uint step = 8;
const uint v_im = itid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const uint v_in = itid - step*v_im; // 0...15 or 0...7
const uint l0 = 2*v_in; // 0...15
const uint q_offset = 32*v_im + l0;
const uint s_offset = 8*v_im;
const uint y_offset = 128*v_im + l0;
shared FLOAT_TYPE sccache1[BLOCK_SIZE/16][16];
shared FLOAT_TYPE sccache2[BLOCK_SIZE/16][16];
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
temp[j][i] = FLOAT_TYPE(0);
}
}
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint v_im, const uint ix, const uint q_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
const uint y_idx = i * QUANT_K + y_offset;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
barrier();
if (!all_threads) { // when we don't have enough blocks to use all threads
if (i < num_blocks_per_row) {
const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]);
sccache1[ix][itid] = FLOAT_TYPE(scale & 0xF);
sccache2[ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF);
}
barrier();
if (i >= num_blocks_per_row)
continue;
} else {
const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]);
sccache1[ix][itid] = FLOAT_TYPE(scale & 0xF);
sccache2[ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF);
barrier();
}
const uint32_t qs_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]) << 16);
const vec4 qs_u32_0 = vec4(unpack8(qs_u32 & 0x03030303));
const vec4 qs_u32_2 = vec4(unpack8((qs_u32 >> 2) & 0x03030303));
const vec4 qs_u32_4 = vec4(unpack8((qs_u32 >> 4) & 0x03030303));
const vec4 qs_u32_6 = vec4(unpack8((qs_u32 >> 6) & 0x03030303));
vec2 d = vec2(data_a[ib0 + i].d);
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
uint32_t s0_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 0];
uint32_t s4_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 1];
uint32_t s0_lo4_u32 = s0_u32 & 0x0F0F0F0F;
uint32_t s0_hi4_u32 = (s0_u32 >> 4) & 0x0F0F0F0F;
uint32_t s4_lo4_u32 = s4_u32 & 0x0F0F0F0F;
uint32_t s4_hi4_u32 = (s4_u32 >> 4) & 0x0F0F0F0F;
uvec4 s0_lo4 = uvec4(unpack8(s0_lo4_u32));
uvec4 s4_lo4 = uvec4(unpack8(s4_lo4_u32));
uvec4 s0_hi4 = uvec4(unpack8(s0_hi4_u32));
uvec4 s4_hi4 = uvec4(unpack8(s4_hi4_u32));
uint16_t qs0_u16 = data_a_packed16[ib0 + i].qs[q_offset / 2 + 0];
uint16_t qs16_u16 = data_a_packed16[ib0 + i].qs[q_offset / 2 + 8];
uvec2 qs0 = uvec2(unpack8(qs0_u16));
uvec2 qs16 = uvec2(unpack8(qs16_u16));
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
vec2 b0 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 0]);
vec2 b16 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 8]);
@ -75,28 +57,60 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
[[unroll]] for (int l = 0; l < 2; ++l) {
sum1 = fma(FLOAT_TYPE(b0[l]), FLOAT_TYPE(s0_lo4[0]) * FLOAT_TYPE((qs0[l] >> 0) & 3),
fma(FLOAT_TYPE(b16[l]), FLOAT_TYPE(s0_lo4[1]) * FLOAT_TYPE((qs16[l] >> 0) & 3),
fma(FLOAT_TYPE(b32[l]), FLOAT_TYPE(s0_lo4[2]) * FLOAT_TYPE((qs0[l] >> 2) & 3),
fma(FLOAT_TYPE(b48[l]), FLOAT_TYPE(s0_lo4[3]) * FLOAT_TYPE((qs16[l] >> 2) & 3),
fma(FLOAT_TYPE(b64[l]), FLOAT_TYPE(s4_lo4[0]) * FLOAT_TYPE((qs0[l] >> 4) & 3),
fma(FLOAT_TYPE(b80[l]), FLOAT_TYPE(s4_lo4[1]) * FLOAT_TYPE((qs16[l] >> 4) & 3),
fma(FLOAT_TYPE(b96[l]), FLOAT_TYPE(s4_lo4[2]) * FLOAT_TYPE((qs0[l] >> 6) & 3),
fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_lo4[3]) * FLOAT_TYPE((qs16[l] >> 6) & 3), sum1))))))));
sum2 = fma(FLOAT_TYPE(b0[l]), FLOAT_TYPE(s0_hi4[0]),
fma(FLOAT_TYPE(b16[l]), FLOAT_TYPE(s0_hi4[1]),
fma(FLOAT_TYPE(b32[l]), FLOAT_TYPE(s0_hi4[2]),
fma(FLOAT_TYPE(b48[l]), FLOAT_TYPE(s0_hi4[3]),
fma(FLOAT_TYPE(b64[l]), FLOAT_TYPE(s4_hi4[0]),
fma(FLOAT_TYPE(b80[l]), FLOAT_TYPE(s4_hi4[1]),
fma(FLOAT_TYPE(b96[l]), FLOAT_TYPE(s4_hi4[2]),
fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_hi4[3]), sum2))))))));
sum1 = fma(FLOAT_TYPE(b0[l]), sccache1[ix][ 8*v_im] * qs_u32_0[l ],
fma(FLOAT_TYPE(b16[l]), sccache1[ix][1 + 8*v_im] * qs_u32_0[l+2],
fma(FLOAT_TYPE(b32[l]), sccache1[ix][2 + 8*v_im] * qs_u32_2[l ],
fma(FLOAT_TYPE(b48[l]), sccache1[ix][3 + 8*v_im] * qs_u32_2[l+2],
fma(FLOAT_TYPE(b64[l]), sccache1[ix][4 + 8*v_im] * qs_u32_4[l ],
fma(FLOAT_TYPE(b80[l]), sccache1[ix][5 + 8*v_im] * qs_u32_4[l+2],
fma(FLOAT_TYPE(b96[l]), sccache1[ix][6 + 8*v_im] * qs_u32_6[l ],
fma(FLOAT_TYPE(b112[l]), sccache1[ix][7 + 8*v_im] * qs_u32_6[l+2], sum1))))))));
sum2 = fma(FLOAT_TYPE(b0[l]), sccache2[ix][ 8*v_im],
fma(FLOAT_TYPE(b16[l]), sccache2[ix][1 + 8*v_im],
fma(FLOAT_TYPE(b32[l]), sccache2[ix][2 + 8*v_im],
fma(FLOAT_TYPE(b48[l]), sccache2[ix][3 + 8*v_im],
fma(FLOAT_TYPE(b64[l]), sccache2[ix][4 + 8*v_im],
fma(FLOAT_TYPE(b80[l]), sccache2[ix][5 + 8*v_im],
fma(FLOAT_TYPE(b96[l]), sccache2[ix][6 + 8*v_im],
fma(FLOAT_TYPE(b112[l]), sccache2[ix][7 + 8*v_im], sum2))))))));
}
temp[j][n] = fma(dall, sum1, fma(-dmin, sum2, temp[j][n]));
}
}
}
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset);
const uint num_blocks_per_row = p.ncols / QUANT_K;
// 16 threads are used to process each block
const uint it_size = gl_WorkGroupSize.x/16;
const uint tid = gl_LocalInvocationID.x;
const uint itid = tid%16; // 0...15
const uint ix = tid/16;
const uint v_im = itid/8; // 0 or 1. 0 computes 0..., 1 computes 128...
const uint v_in = itid - 8*v_im; // 0...7
const uint l0 = 2*v_in; // 0...15
const uint q_offset = 32*v_im + l0;
const uint y_offset = 128*v_im + l0;
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
temp[j][i] = FLOAT_TYPE(0);
}
}
const uint nbr_par_th = num_blocks_per_row%it_size;
const uint nbr_all_th = num_blocks_per_row - nbr_par_th;
uint i0 = 0;
[[unroll]] for (; i0 < nbr_all_th; i0 += it_size)
calc_superblock(a_offset, b_offset, itid, v_im, ix, q_offset, y_offset, i0 + ix, num_blocks_per_row, first_row, num_rows, true);
calc_superblock(a_offset, b_offset, itid, v_im, ix, q_offset, y_offset, i0 + ix, num_blocks_per_row, first_row, num_rows, false);
reduce_result(temp, d_offset, first_row, num_rows, tid);
}
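Note: the block loop is now split so that the first nbr_all_th iterations run with no bounds check (every lane of the workgroup owns a block, so the barriers around the scale-cache writes are unconditional), and only the ragged tail goes through the all_threads == false path. The same splitting pattern in plain C++ (sequential illustration only; the shader runs the lanes in parallel):

#include <cstdio>

static void process_block(int i, bool all_threads, int num_blocks) {
    if (!all_threads && i >= num_blocks) {
        return;                           // tail: this lane has no block left
    }
    std::printf("block %d\n", i);
}

int main() {
    const int it_size    = 8;             // lanes that advance together
    const int num_blocks = 21;            // not a multiple of it_size
    const int tail       = num_blocks % it_size;
    const int main_part  = num_blocks - tail;

    for (int lane = 0; lane < it_size; ++lane) {
        int i0 = 0;
        for (; i0 < main_part; i0 += it_size) {
            process_block(i0 + lane, /*all_threads=*/true,  num_blocks);   // no bounds check needed
        }
        process_block(i0 + lane, /*all_threads=*/false, num_blocks);       // ragged tail
    }
    return 0;
}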

View file

@ -5,61 +5,49 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset);
const uint num_blocks_per_row = p.ncols / QUANT_K;
// 16 threads are used to process each block
const uint it_size = gl_WorkGroupSize.x/16;
const uint tid = gl_LocalInvocationID.x;
const uint itid = tid%16; // 0...16
const uint ix = tid/16;
const uint step = 8;
const uint v_im = itid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const uint v_in = itid - step*v_im; // 0...15 or 0...7
const uint8_t m = uint8_t(1 << (4 * v_im));
const uint l0 = 2*v_in; // 0...15
const uint q_offset = 32*v_im + l0;
const uint y_offset = 128*v_im + l0;
shared FLOAT_TYPE sccache[BLOCK_SIZE/16][2][8];
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
temp[j][i] = FLOAT_TYPE(0);
}
}
const uint s_shift = 4 * v_im;
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, const uint itid8, const uint v_im, const uint v_im4, const uint v_in, const uint32_t hm_m[4], const uint q_offset, const uint y_offset, const uint s_shift, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
const uint y_idx = i * QUANT_K + y_offset;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
if (!all_threads) { // when we don't have enough blocks to use all threads
barrier();
if (i < num_blocks_per_row)
sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
barrier();
if (i >= num_blocks_per_row)
continue;
}
const uint32_t hmk = ~(uint32_t(data_a_packed16[ib0 + i].hmask[v_in]) | (uint32_t(data_a_packed16[ib0 + i].hmask[v_in + 8]) << 16));
const vec4 hmk_0 = vec4(unpack8(((hmk & hm_m[0]) >> ( v_im4)) << 2));
const vec4 hmk_1 = vec4(unpack8(((hmk & hm_m[1]) >> (1 + v_im4)) << 2));
const vec4 hmk_2 = vec4(unpack8(((hmk & hm_m[2]) >> (2 + v_im4)) << 2));
const vec4 hmk_3 = vec4(unpack8(((hmk & hm_m[3]) >> (3 + v_im4)) << 2));
// 0, 1, 16, 17
uint32_t qs_u32 = uint32_t(data_a[ib0 + i].qs[q_offset]) | (uint32_t(data_a[ib0 + i].qs[q_offset + 1]) << 8);
qs_u32 |= (uint32_t(data_a[ib0 + i].qs[q_offset + 16]) | (uint32_t(data_a[ib0 + i].qs[q_offset + 17]) << 8)) << 16;
const vec4 qs_u32_0 = vec4(unpack8(qs_u32 & 0x03030303));
const vec4 qs_u32_2 = vec4(unpack8((qs_u32 >> 2) & 0x03030303));
const vec4 qs_u32_4 = vec4(unpack8((qs_u32 >> 4) & 0x03030303));
const vec4 qs_u32_6 = vec4(unpack8((qs_u32 >> 6) & 0x03030303));
if (all_threads) {
barrier();
sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
barrier();
}
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
uint16_t s0_16 = data_a_packed16[ib0 + i].scales[0];
uint16_t s2_16 = data_a_packed16[ib0 + i].scales[1];
uint16_t s4_16 = data_a_packed16[ib0 + i].scales[2];
uint16_t s6_16 = data_a_packed16[ib0 + i].scales[3];
uint16_t s8_16 = data_a_packed16[ib0 + i].scales[4];
uint16_t s10_16 = data_a_packed16[ib0 + i].scales[5];
u8vec2 s0 = unpack8(s0_16);
u8vec2 s2 = unpack8(s2_16);
u8vec2 s4 = unpack8(s4_16);
u8vec2 s6 = unpack8(s6_16);
u8vec2 s8 = unpack8(s8_16);
u8vec2 s10 = unpack8(s10_16);
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
vec2 b0 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 0]);
vec2 b16 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 8]);
vec2 b32 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 16]);
@ -71,20 +59,61 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
[[unroll]] for (int l = 0; l < 2; ++l) {
sum = fma(FLOAT_TYPE(b0[l]) * FLOAT_TYPE(int8_t(((s0[0] >> s_shift) & 0xF) | ((s8[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(b32[l]) * FLOAT_TYPE(int8_t(((s2[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 1)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(b64[l]) * FLOAT_TYPE(int8_t(((s4[0] >> s_shift) & 0xF) | ((s8[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 2)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(b96[l]) * FLOAT_TYPE(int8_t(((s6[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 3)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(b16[l]) * FLOAT_TYPE(int8_t(((s0[1] >> s_shift) & 0xF) | ((s8[1] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(b48[l]) * FLOAT_TYPE(int8_t(((s2[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(b80[l]) * FLOAT_TYPE(int8_t(((s4[1] >> s_shift) & 0xF) | ((s8[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(b112[l]) * FLOAT_TYPE(int8_t(((s6[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)), sum))))))));
sum = fma(FLOAT_TYPE( b0[l]) * sccache[ix][v_im][0], qs_u32_0[l ] - hmk_0[l ],
fma(FLOAT_TYPE( b16[l]) * sccache[ix][v_im][1], qs_u32_0[l+2] - hmk_0[l+2],
fma(FLOAT_TYPE( b32[l]) * sccache[ix][v_im][2], qs_u32_2[l ] - hmk_1[l ],
fma(FLOAT_TYPE( b48[l]) * sccache[ix][v_im][3], qs_u32_2[l+2] - hmk_1[l+2],
fma(FLOAT_TYPE( b64[l]) * sccache[ix][v_im][4], qs_u32_4[l ] - hmk_2[l ],
fma(FLOAT_TYPE( b80[l]) * sccache[ix][v_im][5], qs_u32_4[l+2] - hmk_2[l+2],
fma(FLOAT_TYPE( b96[l]) * sccache[ix][v_im][6], qs_u32_6[l ] - hmk_3[l ],
fma(FLOAT_TYPE(b112[l]) * sccache[ix][v_im][7], qs_u32_6[l+2] - hmk_3[l+2], sum))))))));
}
temp[j][n] = fma(d, sum, temp[j][n]);
}
}
}
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset);
const uint num_blocks_per_row = p.ncols / QUANT_K;
// 16 threads are used to process each block
const uint it_size = gl_WorkGroupSize.x/16;
const uint tid = gl_LocalInvocationID.x;
const uint itid = tid%16; // 0...15
const uint ix = tid/16;
const uint itid8 = itid%8;
const uint v_im = itid/8; // 0 or 1. 0 computes 0..., 1 computes 128...
const uint v_im4 = v_im*4;
const uint v_in = itid - 8*v_im; // 0...7
const uint32_t m = 0x01010101 << (4 * v_im);
uint32_t hm_m[4];
[[unroll]] for (uint j = 0; j < 4; ++j)
hm_m[j] = m << j;
const uint l0 = 2*v_in; // 0...15
const uint q_offset = 32*v_im + l0;
const uint y_offset = 128*v_im + l0;
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
temp[j][i] = FLOAT_TYPE(0);
}
}
const uint s_shift = v_im4 + 2*(itid8/4);
const uint nbr_par_th = num_blocks_per_row%it_size;
const uint nbr_all_th = num_blocks_per_row - nbr_par_th;
uint i0 = 0;
[[unroll]] for (; i0 < nbr_all_th; i0 += it_size)
calc_superblock(a_offset, b_offset, ix, itid8, v_im, v_im4, v_in, hm_m, q_offset, y_offset, s_shift, i0 + ix, num_blocks_per_row, first_row, num_rows, true);
calc_superblock(a_offset, b_offset, ix, itid8, v_im, v_im4, v_in, hm_m, q_offset, y_offset, s_shift, i0 + ix, num_blocks_per_row, first_row, num_rows, false);
reduce_result(temp, d_offset, first_row, num_rows, tid);
}

View file

@ -6,40 +6,9 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset);
const uint num_blocks_per_row = p.ncols / QUANT_K;
// 16 threads are used to process each block
const uint it_size = gl_WorkGroupSize.x/16;
const uint tid = gl_LocalInvocationID.x;
const uint itid = tid%16; // 0...16
const uint ix = tid/16;
const uint step = 4;
const uint il = itid/step; // 0...3
const uint ir = itid - step*il; // 0...7 or 0...3
const uint n = 4;
const uint v_im = il / 2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
const uint v_in = il % 2;
const uint l0 = n * (2 * ir + v_in); // 0...15
const uint q_offset = 32*v_im + l0;
const uint y_offset = 64*v_im + l0;
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
temp[j][i] = FLOAT_TYPE(0);
}
}
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im, const uint q_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
const uint y1_idx = i * QUANT_K + y_offset;
const uint y2_idx = y1_idx + 128;
@ -49,51 +18,53 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ];
uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4];
uvec4 scale0 = uvec4(unpack8(scale0_u32));
uvec4 scale4 = uvec4(unpack8(scale4_u32));
uvec4 scale8 = uvec4(unpack8(scale8_u32));
const uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ];
const uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
const uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4];
const uint32_t sc0 = ( scale0.x & 0x3f);
const uint32_t sc1 = ( scale0.y & 0x3f);
const uint32_t sc2 = ( scale4.x & 0x3f);
const uint32_t sc3 = ( scale4.y & 0x3f);
const uint32_t sc4 = (( scale8.x & 0x0f) | ((scale0.x & 0xc0) >> 2));
const uint32_t sc5 = (( scale8.y & 0x0f) | ((scale0.y & 0xc0) >> 2));
const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2));
const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2));
const uint32_t scale_0_4_l = (scale4_u32 << 16) | scale0_u32;
const uint32_t scale_0_4_h = (scale_0_4_l & 0xC0C0C0C0) >> 2;
const vec4 scale_0_4_l_f = vec4(unpack8(scale_0_4_l & 0x3F3F3F3F));
const vec4 scale8_f = vec4(unpack8((((scale8_u32 << 12) | scale8_u32) & 0x0F0F0F0F) | scale_0_4_h));
uint32_t qs0_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4];
uint32_t qs64_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4 + 16];
const FLOAT_TYPE sc0 = scale_0_4_l_f.x;
const FLOAT_TYPE sc1 = scale_0_4_l_f.y;
const FLOAT_TYPE sc2 = scale_0_4_l_f.z;
const FLOAT_TYPE sc3 = scale_0_4_l_f.w;
const FLOAT_TYPE sc4 = scale8_f.x;
const FLOAT_TYPE sc5 = scale8_f.y;
const FLOAT_TYPE sc6 = scale8_f.z;
const FLOAT_TYPE sc7 = scale8_f.w;
uint32_t qs0_u32_lo4 = qs0_u32 & 0x0F0F0F0F;
uint32_t qs0_u32_hi4 = (qs0_u32 >> 4) & 0x0F0F0F0F;
uint32_t qs64_u32_lo4 = qs64_u32 & 0x0F0F0F0F;
uint32_t qs64_u32_hi4 = (qs64_u32 >> 4) & 0x0F0F0F0F;
const uint32_t qs0_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4];
const uint32_t qs64_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4 + 16];
uvec4 qs0_lo4 = uvec4(unpack8(qs0_u32_lo4));
uvec4 qs64_lo4 = uvec4(unpack8(qs64_u32_lo4));
uvec4 qs0_hi4 = uvec4(unpack8(qs0_u32_hi4));
uvec4 qs64_hi4 = uvec4(unpack8(qs64_u32_hi4));
const uint32_t qs0_u32_lo4 = qs0_u32 & 0x0F0F0F0F;
const uint32_t qs0_u32_hi4 = (qs0_u32 >> 4) & 0x0F0F0F0F;
const uint32_t qs64_u32_lo4 = qs64_u32 & 0x0F0F0F0F;
const uint32_t qs64_u32_hi4 = (qs64_u32 >> 4) & 0x0F0F0F0F;
const uint32_t q4_0 = qs0_lo4.x;
const uint32_t q4_1 = qs0_lo4.y;
const uint32_t q4_2 = qs0_lo4.z;
const uint32_t q4_3 = qs0_lo4.w;
const uint32_t q4_4 = qs0_hi4.x;
const uint32_t q4_5 = qs0_hi4.y;
const uint32_t q4_6 = qs0_hi4.z;
const uint32_t q4_7 = qs0_hi4.w;
const uint32_t q4_8 = qs64_lo4.x;
const uint32_t q4_9 = qs64_lo4.y;
const uint32_t q4_10 = qs64_lo4.z;
const uint32_t q4_11 = qs64_lo4.w;
const uint32_t q4_12 = qs64_hi4.x;
const uint32_t q4_13 = qs64_hi4.y;
const uint32_t q4_14 = qs64_hi4.z;
const uint32_t q4_15 = qs64_hi4.w;
const vec4 qs0_lo4 = vec4(unpack8(qs0_u32_lo4));
const vec4 qs64_lo4 = vec4(unpack8(qs64_u32_lo4));
const vec4 qs0_hi4 = vec4(unpack8(qs0_u32_hi4));
const vec4 qs64_hi4 = vec4(unpack8(qs64_u32_hi4));
const FLOAT_TYPE q4_0 = qs0_lo4.x;
const FLOAT_TYPE q4_1 = qs0_lo4.y;
const FLOAT_TYPE q4_2 = qs0_lo4.z;
const FLOAT_TYPE q4_3 = qs0_lo4.w;
const FLOAT_TYPE q4_4 = qs0_hi4.x;
const FLOAT_TYPE q4_5 = qs0_hi4.y;
const FLOAT_TYPE q4_6 = qs0_hi4.z;
const FLOAT_TYPE q4_7 = qs0_hi4.w;
const FLOAT_TYPE q4_8 = qs64_lo4.x;
const FLOAT_TYPE q4_9 = qs64_lo4.y;
const FLOAT_TYPE q4_10 = qs64_lo4.z;
const FLOAT_TYPE q4_11 = qs64_lo4.w;
const FLOAT_TYPE q4_12 = qs64_hi4.x;
const FLOAT_TYPE q4_13 = qs64_hi4.y;
const FLOAT_TYPE q4_14 = qs64_hi4.z;
const FLOAT_TYPE q4_15 = qs64_hi4.w;
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
vec4 by10 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4 ]);
@ -115,6 +86,38 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
}
}
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset);
const uint num_blocks_per_row = p.ncols / QUANT_K;
// 16 threads are used to process each block
const uint it_size = gl_WorkGroupSize.x/16;
const uint tid = gl_LocalInvocationID.x;
const uint itid = tid%16; // 0...15
const uint ix = tid/16;
const uint il = itid/4; // 0...3
const uint ir = itid - 4*il; // 0...3
const uint n = 4;
const uint v_im = il / 2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
const uint v_in = il % 2;
const uint l0 = n * (2 * ir + v_in); // 0...15
const uint q_offset = 32*v_im + l0;
const uint y_offset = 64*v_im + l0;
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
temp[j][i] = FLOAT_TYPE(0);
}
}
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size)
calc_superblock(a_offset, b_offset, v_im, q_offset, y_offset, i, num_blocks_per_row, first_row, num_rows);
reduce_result(temp, d_offset, first_row, num_rows, tid);
}
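Note: the eight 6-bit Q4_K scales are now decoded from two packed 32-bit words instead of byte by byte. A plain-C++ check (arbitrary test values) that the packed decode introduced above matches the per-byte extraction it replaces; scales0/4/8 stand in for the three 16-bit reads data_a_packed16[...].scales[v_im], [v_im + 2], [v_im + 4]:

#include <cstdint>
#include <cstdio>
#include <cassert>

int main() {
    const uint32_t scales0 = 0xB3A7;   // arbitrary test pattern
    const uint32_t scales4 = 0x59C2;
    const uint32_t scales8 = 0x8E4D;

    // per-byte extraction (the removed code path)
    const uint32_t b0x = scales0 & 0xFF, b0y = scales0 >> 8;
    const uint32_t b4x = scales4 & 0xFF, b4y = scales4 >> 8;
    const uint32_t b8x = scales8 & 0xFF, b8y = scales8 >> 8;
    const uint32_t ref[8] = {
        b0x & 0x3F, b0y & 0x3F, b4x & 0x3F, b4y & 0x3F,
        ( b8x       & 0x0F) | ((b0x & 0xC0) >> 2),
        ( b8y       & 0x0F) | ((b0y & 0xC0) >> 2),
        ((b8x >> 4) & 0x0F) | ((b4x & 0xC0) >> 2),
        ((b8y >> 4) & 0x0F) | ((b4y & 0xC0) >> 2),
    };

    // packed extraction (the new code path)
    const uint32_t lo = (scales4 << 16) | scales0;
    const uint32_t hi = (lo & 0xC0C0C0C0u) >> 2;
    const uint32_t l6 =  lo & 0x3F3F3F3Fu;
    const uint32_t s8 = (((scales8 << 12) | scales8) & 0x0F0F0F0Fu) | hi;
    for (int i = 0; i < 4; ++i) {
        assert(((l6 >> (8*i)) & 0xFF) == ref[i]);      // sc0..sc3
        assert(((s8 >> (8*i)) & 0xFF) == ref[4 + i]);  // sc4..sc7
    }
    std::puts("packed and per-byte scale decodes agree");
    return 0;
}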

View file

@ -6,37 +6,9 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset);
const uint num_blocks_per_row = p.ncols / QUANT_K;
// 16 threads are used to process each block
const uint it_size = gl_WorkGroupSize.x/16;
const uint tid = gl_LocalInvocationID.x;
const uint itid = tid%16; // 0...16
const uint ix = tid/16;
const uint il = itid/4; // 0...3
const uint ir = itid - 4*il; // 0...7 or 0...3
const uint v_im = il / 2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
const uint v_in = il % 2;
const uint l0 = 4*ir + 2*v_in; // 0...15
const uint q_offset = 32*v_im + l0;
const uint y_offset = 64*v_im + l0;
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
temp[j][i] = FLOAT_TYPE(0);
}
}
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im, const uint l0, const uint q_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
const uint y1_idx = i * QUANT_K + y_offset;
const uint y2_idx = y1_idx + 128;
@ -46,63 +18,65 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ];
uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4];
uvec4 scale0 = uvec4(unpack8(scale0_u32));
uvec4 scale4 = uvec4(unpack8(scale4_u32));
uvec4 scale8 = uvec4(unpack8(scale8_u32));
const uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ];
const uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
const uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4];
const uint32_t sc0 = ( scale0.x & 0x3f);
const uint32_t sc1 = ( scale0.y & 0x3f);
const uint32_t sc2 = ( scale4.x & 0x3f);
const uint32_t sc3 = ( scale4.y & 0x3f);
const uint32_t sc4 = (( scale8.x & 0x0f) | ((scale0.x & 0xc0) >> 2));
const uint32_t sc5 = (( scale8.y & 0x0f) | ((scale0.y & 0xc0) >> 2));
const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2));
const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2));
const uint32_t scale_0_4_l = (scale4_u32 << 16) | scale0_u32;
const uint32_t scale_0_4_h = (scale_0_4_l & 0xC0C0C0C0) >> 2;
const vec4 scale_0_4_l_f = vec4(unpack8(scale_0_4_l & 0x3F3F3F3F));
const vec4 scale8_f = vec4(unpack8((((scale8_u32 << 12) | scale8_u32) & 0x0F0F0F0F) | scale_0_4_h));
uint32_t qs0_16_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]) << 16);
uint32_t qs64_80_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 32]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 40]) << 16);
const FLOAT_TYPE sc0 = scale_0_4_l_f.x;
const FLOAT_TYPE sc1 = scale_0_4_l_f.y;
const FLOAT_TYPE sc2 = scale_0_4_l_f.z;
const FLOAT_TYPE sc3 = scale_0_4_l_f.w;
const FLOAT_TYPE sc4 = scale8_f.x;
const FLOAT_TYPE sc5 = scale8_f.y;
const FLOAT_TYPE sc6 = scale8_f.z;
const FLOAT_TYPE sc7 = scale8_f.w;
const uint32_t qs0_16_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]) << 16);
const uint32_t qs64_80_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 32]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 40]) << 16);
uint32_t qs0_16_u32_lo4 = qs0_16_u32 & 0x0F0F0F0F;
uint32_t qs0_16_u32_hi4 = (qs0_16_u32 >> 4) & 0x0F0F0F0F;
uint32_t qs64_80_u32_lo4 = qs64_80_u32 & 0x0F0F0F0F;
uint32_t qs64_80_u32_hi4 = (qs64_80_u32 >> 4) & 0x0F0F0F0F;
uint32_t qh = pack32(u16vec2(data_a_packed16[ib0 + i].qh[l0 / 2], data_a_packed16[ib0 + i].qh[l0 / 2 + 8]));
const uint32_t qh = pack32(u16vec2(data_a_packed16[ib0 + i].qh[l0 / 2], data_a_packed16[ib0 + i].qh[l0 / 2 + 8]));
uint32_t qs0_16_lo4_offset16 = ((qh >> (2*v_im)) & 0x01010101) << 4;
uint32_t qs0_16_hi4_offset16 = ((qh >> (2*v_im)) & 0x02020202) << 3;
uint32_t qs64_80_lo4_offset16 = ((qh >> (2*v_im)) & 0x10101010) << 0;
uint32_t qs64_80_hi4_offset16 = ((qh >> (2*v_im)) & 0x20202020) >> 1;
const uint32_t qs0_16_lo4_offset16 = ((qh >> (2*v_im)) & 0x01010101) << 4;
const uint32_t qs0_16_hi4_offset16 = ((qh >> (2*v_im)) & 0x02020202) << 3;
const uint32_t qs64_80_lo4_offset16 = ((qh >> (2*v_im)) & 0x10101010);
const uint32_t qs64_80_hi4_offset16 = ((qh >> (2*v_im)) & 0x20202020) >> 1;
qs0_16_u32_lo4 += qs0_16_lo4_offset16;
qs0_16_u32_hi4 += qs0_16_hi4_offset16;
qs64_80_u32_lo4 += qs64_80_lo4_offset16;
qs64_80_u32_hi4 += qs64_80_hi4_offset16;
uvec4 qs0_16_lo4 = uvec4(unpack8(qs0_16_u32_lo4));
uvec4 qs64_80_lo4 = uvec4(unpack8(qs64_80_u32_lo4));
uvec4 qs0_16_hi4 = uvec4(unpack8(qs0_16_u32_hi4));
uvec4 qs64_80_hi4 = uvec4(unpack8(qs64_80_u32_hi4));
const vec4 qs0_16_lo4 = vec4(unpack8(qs0_16_u32_lo4));
const vec4 qs64_80_lo4 = vec4(unpack8(qs64_80_u32_lo4));
const vec4 qs0_16_hi4 = vec4(unpack8(qs0_16_u32_hi4));
const vec4 qs64_80_hi4 = vec4(unpack8(qs64_80_u32_hi4));
const uint32_t q4_0 = qs0_16_lo4.x;
const uint32_t q4_1 = qs0_16_lo4.y;
const uint32_t q4_2 = qs0_16_lo4.z;
const uint32_t q4_3 = qs0_16_lo4.w;
const uint32_t q4_4 = qs0_16_hi4.x;
const uint32_t q4_5 = qs0_16_hi4.y;
const uint32_t q4_6 = qs0_16_hi4.z;
const uint32_t q4_7 = qs0_16_hi4.w;
const uint32_t q4_8 = qs64_80_lo4.x;
const uint32_t q4_9 = qs64_80_lo4.y;
const uint32_t q4_10 = qs64_80_lo4.z;
const uint32_t q4_11 = qs64_80_lo4.w;
const uint32_t q4_12 = qs64_80_hi4.x;
const uint32_t q4_13 = qs64_80_hi4.y;
const uint32_t q4_14 = qs64_80_hi4.z;
const uint32_t q4_15 = qs64_80_hi4.w;
const FLOAT_TYPE q4_0 = qs0_16_lo4.x;
const FLOAT_TYPE q4_1 = qs0_16_lo4.y;
const FLOAT_TYPE q4_2 = qs0_16_lo4.z;
const FLOAT_TYPE q4_3 = qs0_16_lo4.w;
const FLOAT_TYPE q4_4 = qs0_16_hi4.x;
const FLOAT_TYPE q4_5 = qs0_16_hi4.y;
const FLOAT_TYPE q4_6 = qs0_16_hi4.z;
const FLOAT_TYPE q4_7 = qs0_16_hi4.w;
const FLOAT_TYPE q4_8 = qs64_80_lo4.x;
const FLOAT_TYPE q4_9 = qs64_80_lo4.y;
const FLOAT_TYPE q4_10 = qs64_80_lo4.z;
const FLOAT_TYPE q4_11 = qs64_80_lo4.w;
const FLOAT_TYPE q4_12 = qs64_80_hi4.x;
const FLOAT_TYPE q4_13 = qs64_80_hi4.y;
const FLOAT_TYPE q4_14 = qs64_80_hi4.z;
const FLOAT_TYPE q4_15 = qs64_80_hi4.w;
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
vec2 by10 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 ]);
@ -144,6 +118,37 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
}
}
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset);
const uint num_blocks_per_row = p.ncols / QUANT_K;
// 16 threads are used to process each block
const uint it_size = gl_WorkGroupSize.x/16;
const uint tid = gl_LocalInvocationID.x;
const uint itid = tid%16; // 0...15
const uint ix = tid/16;
const uint il = itid/4; // 0...3
const uint ir = itid - 4*il; // 0...3
const uint v_im = il / 2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
const uint v_in = il % 2;
const uint l0 = 4*ir + 2*v_in; // 0...15
const uint q_offset = 32*v_im + l0;
const uint y_offset = 64*v_im + l0;
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
temp[j][i] = FLOAT_TYPE(0);
}
}
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size)
calc_superblock(a_offset, b_offset, v_im, l0, q_offset, y_offset, i, num_blocks_per_row, first_row, num_rows);
reduce_result(temp, d_offset, first_row, num_rows, tid);
}

View file

@ -6,7 +6,77 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
shared FLOAT_TYPE sccache[BLOCK_SIZE/16][16];
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint ix, const uint ql_offset, const uint qh_offset, const uint s_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
const uint y_idx = i * QUANT_K + y_offset;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
if (!all_threads) { // when we don't have enough blocks to use all threads
barrier();
if (i < num_blocks_per_row)
sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
barrier();
if (i >= num_blocks_per_row)
continue;
}
const uint32_t ql0_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 1]) << 16);
const uint32_t ql32_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 16]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 17]) << 16);
const uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F;
const uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F;
const uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F;
const uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F;
const uint32_t qh_u32 = uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2 + 1]) << 16);
const uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4;
const uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2;
const uint32_t qh4_u32 = (qh_u32 & 0x30303030);
const uint32_t qh6_u32 = (qh_u32 & 0xC0C0C0C0) >> 2;
const uint32_t q0_u32 = ql0_u32_lo4 | qh0_u32;
const uint32_t q1_u32 = ql32_u32_lo4 | qh2_u32;
const uint32_t q2_u32 = ql0_u32_hi4 | qh4_u32;
const uint32_t q3_u32 = ql32_u32_hi4 | qh6_u32;
const vec4 q0 = vec4(unpack8(q0_u32)) - 32;
const vec4 q1 = vec4(unpack8(q1_u32)) - 32;
const vec4 q2 = vec4(unpack8(q2_u32)) - 32;
const vec4 q3 = vec4(unpack8(q3_u32)) - 32;
if (all_threads) {
barrier();
sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
barrier();
}
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
vec4 by0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 ]);
vec4 by32 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 8]);
vec4 by64 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 16]);
vec4 by96 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 24]);
FLOAT_TYPE sum[4] = {0, 0, 0, 0};
[[unroll]] for (uint l = 0; l < 4; ++l) {
sum[0] = fma(FLOAT_TYPE(by0[l]), q0[l], sum[0]);
sum[1] = fma(FLOAT_TYPE(by32[l]), q1[l], sum[1]);
sum[2] = fma(FLOAT_TYPE(by64[l]), q2[l], sum[2]);
sum[3] = fma(FLOAT_TYPE(by96[l]), q3[l], sum[3]);
}
temp[j][n] = fma(fma(sum[0], sccache[ix][s_offset], fma(sum[1], sccache[ix][s_offset + 2], fma(sum[2], sccache[ix][s_offset + 4], sum[3] * sccache[ix][s_offset + 6]))), d, temp[j][n]);
}
}
}
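A standalone C++ sketch (with hypothetical packed words) of the 6-bit reconstruction done by the masked loads above: each Q6_K weight takes its low nibble from ql and two high bits from qh shifted into bit positions 4..5, then subtracts 32 to recentre into [-32, 31].

#include <cstdint>
#include <cstdio>

int main() {
    // hypothetical inputs: four low nibbles packed in one word, and the qh byte group
    // whose bits 0..1 hold the matching high bits (mirrors the qh0_u32 path above)
    uint32_t ql0_u32 = 0x0F0A0501u;
    uint32_t qh_u32  = 0x03020100u;
    uint32_t qh0_u32 = (qh_u32 & 0x03030303u) << 4;       // high 2 bits -> positions 4..5
    uint32_t q0_u32  = (ql0_u32 & 0x0F0F0F0Fu) | qh0_u32; // four packed 6-bit values
    for (int l = 0; l < 4; ++l) {
        int q = int((q0_u32 >> (8 * l)) & 0xFFu) - 32;    // recentre to [-32, 31]
        printf("q[%d] = %d\n", l, q);
    }
    return 0;
}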
void compute_outputs(const uint first_row, const uint num_rows) {
uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset);
@@ -15,13 +85,11 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
// 16 threads are used to process each block
const uint it_size = gl_WorkGroupSize.x/16;
const uint tid = gl_LocalInvocationID.x;
const uint itid = tid%16; // 0...16
const uint itid = tid%16; // 0...15
const uint ix = tid/16;
const uint step = 8;
const uint v_im = itid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const uint v_in = itid - step*v_im; // 0...15 or 0...7
const uint v_im = itid/8; // 0 or 1. 0 computes 0..., 1 computes 128...
const uint v_in = itid - 8*v_im; // 0...7
const uint l0 = 4 * v_in; // 0, 4, 8, ..., 28
const uint is = v_in / 4;
@@ -31,68 +99,18 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
const uint s_offset = 8*v_im + is;
const uint y_offset = 128*v_im + l0;
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
temp[j][i] = FLOAT_TYPE(0);
}
}
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
const uint y_idx = i * QUANT_K + y_offset;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
FLOAT_TYPE scales[4];
scales[0] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]);
scales[1] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]);
scales[2] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]);
scales[3] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]);
uint32_t ql0_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 1]) << 16);
uint32_t ql32_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 16]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 17]) << 16);
uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F;
uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F;
uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F;
uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F;
uint32_t qh_u32 = uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2 + 1]) << 16);
uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4;
uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2;
uint32_t qh4_u32 = (qh_u32 & 0x30303030) << 0;
uint32_t qh6_u32 = (qh_u32 & 0xC0C0C0C0) >> 2;
uint32_t q0_u32 = ql0_u32_lo4 | qh0_u32;
uint32_t q1_u32 = ql32_u32_lo4 | qh2_u32;
uint32_t q2_u32 = ql0_u32_hi4 | qh4_u32;
uint32_t q3_u32 = ql32_u32_hi4 | qh6_u32;
uvec4 q0 = uvec4(unpack8(q0_u32));
uvec4 q1 = uvec4(unpack8(q1_u32));
uvec4 q2 = uvec4(unpack8(q2_u32));
uvec4 q3 = uvec4(unpack8(q3_u32));
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
vec4 by0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 ]);
vec4 by32 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 8]);
vec4 by64 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 16]);
vec4 by96 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 24]);
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
[[unroll]] for (int l = 0; l < 4; ++l) {
sum = fma(FLOAT_TYPE(by0[l]) * scales[0], FLOAT_TYPE(int8_t(q0[l]) - 32),
fma(FLOAT_TYPE(by32[l]) * scales[1], FLOAT_TYPE(int8_t(q1[l]) - 32),
fma(FLOAT_TYPE(by64[l]) * scales[2], FLOAT_TYPE(int8_t(q2[l]) - 32),
fma(FLOAT_TYPE(by96[l]) * scales[3], FLOAT_TYPE(int8_t(q3[l]) - 32), sum))));
}
temp[j][n] += sum * d;
}
}
}
const uint nbr_par_th = num_blocks_per_row%it_size;
const uint nbr_all_th = num_blocks_per_row - nbr_par_th;
uint i0 = 0;
[[unroll]] for (; i0 < nbr_all_th; i0 += it_size)
calc_superblock(a_offset, b_offset, itid, ix, ql_offset, qh_offset, s_offset, y_offset, i0 + ix, num_blocks_per_row, first_row, num_rows, true);
calc_superblock(a_offset, b_offset, itid, ix, ql_offset, qh_offset, s_offset, y_offset, i0 + ix, num_blocks_per_row, first_row, num_rows, false);
reduce_result(temp, d_offset, first_row, num_rows, tid);
}
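The loop restructuring above separates passes where every lane owns a valid block (no bounds check, all_threads = true) from a single tail pass that re-checks i against num_blocks_per_row. A host-side C++ sketch of the same control flow, with hypothetical sizes:

#include <cstdio>

static void calc(unsigned i, unsigned num_blocks, bool all_threads) {
    if (!all_threads && i >= num_blocks) return; // tail pass: some lanes have no block left
    printf("block %u processed (all_threads=%d)\n", i, all_threads ? 1 : 0);
}

int main() {
    const unsigned num_blocks_per_row = 10; // hypothetical
    const unsigned it_size = 4;             // blocks consumed per pass (threads / 16)
    const unsigned ix = 1;                  // this lane's slot within a pass
    const unsigned nbr_all_th = num_blocks_per_row - num_blocks_per_row % it_size;
    unsigned i0 = 0;
    for (; i0 < nbr_all_th; i0 += it_size)
        calc(i0 + ix, num_blocks_per_row, true);  // full passes: no per-iteration check
    calc(i0 + ix, num_blocks_per_row, false);     // one tail pass with the check enabled
    return 0;
}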

View file

@@ -31,8 +31,6 @@
#include <fcntl.h>
#endif
#include <vulkan/vulkan_core.h>
#define ASYNCIO_CONCURRENCY 64
#ifndef GGML_VULKAN_COOPMAT_GLSLC_SUPPORT
#define GGML_VULKAN_COOPMAT_GLSLC_SUPPORT

View file

@@ -3708,7 +3708,7 @@ void ggml_rope_yarn_corr_dims(
// ggml_rope_back
struct ggml_tensor * ggml_rope_back(
struct ggml_tensor * ggml_rope_ext_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
@@ -3722,29 +3722,32 @@ struct ggml_tensor * ggml_rope_back(
float attn_factor,
float beta_fast,
float beta_slow) {
GGML_ASSERT(ggml_is_vector(b));
GGML_ASSERT(b->type == GGML_TYPE_I32);
GGML_ASSERT(a->ne[2] == b->ne[0]);
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
memcpy(params + 5, &freq_base, sizeof(float));
memcpy(params + 6, &freq_scale, sizeof(float));
memcpy(params + 7, &ext_factor, sizeof(float));
memcpy(params + 8, &attn_factor, sizeof(float));
memcpy(params + 9, &beta_fast, sizeof(float));
memcpy(params + 10, &beta_slow, sizeof(float));
ggml_set_op_params(result, params, sizeof(params));
struct ggml_tensor * result = ggml_rope_ext(
ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
result->op = GGML_OP_ROPE_BACK;
result->src[0] = a;
result->src[1] = b;
result->src[2] = c;
return result;
}
struct ggml_tensor * ggml_rope_multi_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int sections[4],
int mode,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow) {
struct ggml_tensor * result = ggml_rope_multi(
ctx, a, b, c, n_dims, sections, mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
result->op = GGML_OP_ROPE_BACK;
return result;
}
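Both backward builders now just reuse the forward constructors and retag the resulting node as GGML_OP_ROPE_BACK. A hedged usage sketch, with hypothetical hyperparameters, of how a caller might build the gradient node with the renamed helper:

#include "ggml.h"

// hypothetical helper: build the RoPE gradient node from the upstream gradient and positions
static struct ggml_tensor * rope_grad_example(struct ggml_context * ctx,
                                              struct ggml_tensor * grad,  // dL/d(rope output)
                                              struct ggml_tensor * pos) { // I32 positions
    return ggml_rope_ext_back(ctx, grad, pos, /*c=*/NULL,
                              /*n_dims=*/128, /*mode=*/0, /*n_ctx_orig=*/0,
                              /*freq_base=*/10000.0f, /*freq_scale=*/1.0f,
                              /*ext_factor=*/0.0f, /*attn_factor=*/1.0f,
                              /*beta_fast=*/32.0f, /*beta_slow=*/1.0f);
}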
// ggml_clamp
struct ggml_tensor * ggml_clamp(
@@ -5607,6 +5610,7 @@ static void ggml_compute_backward(
//const int n_ctx = ((int32_t *) tensor->op_params)[3];
const int n_ctx_orig = ((const int32_t *) tensor->op_params)[4];
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
int sections[4] = {0, 0, 0, 0};
memcpy(&freq_base, (const float *) tensor->op_params + 5, sizeof(float));
memcpy(&freq_scale, (const float *) tensor->op_params + 6, sizeof(float));
@@ -5614,10 +5618,14 @@ static void ggml_compute_backward(
memcpy(&attn_factor, (const float *) tensor->op_params + 8, sizeof(float));
memcpy(&beta_fast, (const float *) tensor->op_params + 9, sizeof(float));
memcpy(&beta_slow, (const float *) tensor->op_params + 10, sizeof(float));
memcpy(&sections, tensor->op_params + 11, sizeof(sections));
ggml_add_or_set(ctx, cgraph, isrc0,
ggml_rope_back(ctx, grad, src1, src2, n_dims, mode, n_ctx_orig, freq_base,
freq_scale, ext_factor, attn_factor, beta_fast, beta_slow));
struct ggml_tensor * rope_back = grad->ne[2] == src1->ne[0] ?
ggml_rope_ext_back(ctx, grad, src1, src2, n_dims,
mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow) :
ggml_rope_multi_back(ctx, grad, src1, src2, n_dims, sections,
mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
ggml_add_or_set(ctx, cgraph, isrc0, rope_back);
}
GGML_ASSERT((!src2 || !src2_needs_grads) && "gradients for freq factors not implemented");
} break;

View file

@@ -184,7 +184,6 @@ class Keys:
UNK_ID = "tokenizer.ggml.unknown_token_id"
SEP_ID = "tokenizer.ggml.seperator_token_id"
PAD_ID = "tokenizer.ggml.padding_token_id"
CLS_ID = "tokenizer.ggml.cls_token_id"
MASK_ID = "tokenizer.ggml.mask_token_id"
ADD_BOS = "tokenizer.ggml.add_bos_token"
ADD_EOS = "tokenizer.ggml.add_eos_token"
@@ -1837,7 +1836,6 @@ KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID
KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID
KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID
KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV

View file

@@ -857,9 +857,6 @@ class GGUFWriter:
def add_pad_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.PAD_ID, id)
def add_cls_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.CLS_ID, id)
def add_mask_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.MASK_ID, id)

View file

@@ -939,7 +939,6 @@ extern "C" {
LLAMA_API llama_token llama_vocab_bos(const struct llama_vocab * vocab); // beginning-of-sentence
LLAMA_API llama_token llama_vocab_eos(const struct llama_vocab * vocab); // end-of-sentence
LLAMA_API llama_token llama_vocab_eot(const struct llama_vocab * vocab); // end-of-turn
LLAMA_API llama_token llama_vocab_cls(const struct llama_vocab * vocab); // classification
LLAMA_API llama_token llama_vocab_sep(const struct llama_vocab * vocab); // sentence separator
LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line
LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding
@@ -975,6 +974,10 @@ extern "C" {
DEPRECATED(LLAMA_API llama_token llama_token_fim_rep(const struct llama_vocab * vocab), "use llama_vocab_fim_rep instead");
DEPRECATED(LLAMA_API llama_token llama_token_fim_sep(const struct llama_vocab * vocab), "use llama_vocab_fim_sep instead");
// CLS is equivalent to BOS
DEPRECATED(LLAMA_API llama_token llama_vocab_cls(const struct llama_vocab * vocab), // classification
"use llama_vocab_bos instead");
//
// Tokenization
//

View file

@@ -178,7 +178,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
{ LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat.template" },
{ LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
{ LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
{ LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
{ LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },

View file

@@ -1443,7 +1443,6 @@ struct llama_vocab::impl {
llama_token special_unk_id = 0;
llama_token special_sep_id = LLAMA_TOKEN_NULL;
llama_token special_pad_id = LLAMA_TOKEN_NULL;
llama_token special_cls_id = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
llama_token special_mask_id = LLAMA_TOKEN_NULL;
llama_token linefeed_id = 13;
@@ -1577,14 +1576,14 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
special_unk_id = LLAMA_TOKEN_NULL;
special_sep_id = LLAMA_TOKEN_NULL;
special_pad_id = LLAMA_TOKEN_NULL;
special_cls_id = LLAMA_TOKEN_NULL;
special_mask_id = LLAMA_TOKEN_NULL;
linefeed_id = LLAMA_TOKEN_NULL;
// read vocab size from metadata
uint32_t n_tokens = 0;
if (!ml.get_key(LLM_KV_VOCAB_SIZE, n_tokens, false)) {
LLAMA_LOG_WARN("%s: there is no vocab_size in metadata\n", __func__);
if (ml.get_key(LLM_KV_VOCAB_SIZE, n_tokens, false)) {
LLAMA_LOG_WARN("%s: adding %u dummy tokens\n", __func__, n_tokens);
id_to_token.resize(n_tokens);
}
return;
@@ -1599,18 +1598,16 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
special_unk_id = 0;
special_sep_id = LLAMA_TOKEN_NULL;
special_pad_id = LLAMA_TOKEN_NULL;
special_cls_id = LLAMA_TOKEN_NULL;
special_mask_id = LLAMA_TOKEN_NULL;
} else if (tokenizer_model == "bert") {
type = LLAMA_VOCAB_TYPE_WPM;
// default special tokens
special_bos_id = LLAMA_TOKEN_NULL;
special_bos_id = 101;
special_eos_id = LLAMA_TOKEN_NULL;
special_unk_id = 100;
special_sep_id = 102;
special_pad_id = 0;
special_cls_id = 101;
special_mask_id = 103;
} else if (tokenizer_model == "gpt2") {
type = LLAMA_VOCAB_TYPE_BPE;
@@ -1655,7 +1652,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
special_unk_id = LLAMA_TOKEN_NULL;
special_sep_id = LLAMA_TOKEN_NULL;
special_pad_id = LLAMA_TOKEN_NULL;
special_cls_id = LLAMA_TOKEN_NULL;
special_mask_id = LLAMA_TOKEN_NULL;
} else if (tokenizer_model == "t5") {
type = LLAMA_VOCAB_TYPE_UGM;
@@ -1666,7 +1662,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
special_unk_id = 2;
special_sep_id = LLAMA_TOKEN_NULL;
special_pad_id = 0;
special_cls_id = LLAMA_TOKEN_NULL;
special_mask_id = LLAMA_TOKEN_NULL;
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
@@ -1950,7 +1945,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
{ LLM_KV_TOKENIZER_UNK_ID, special_unk_id },
{ LLM_KV_TOKENIZER_SEP_ID, special_sep_id },
{ LLM_KV_TOKENIZER_PAD_ID, special_pad_id },
{ LLM_KV_TOKENIZER_CLS_ID, special_cls_id },
{ LLM_KV_TOKENIZER_MASK_ID, special_mask_id },
{ LLM_KV_TOKENIZER_FIM_PRE_ID, special_fim_pre_id },
{ LLM_KV_TOKENIZER_FIM_SUF_ID, special_fim_suf_id },
@@ -1974,7 +1968,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
continue;
}
if (new_id >= id_to_token.size()) {
LLAMA_LOG_WARN("%s: bad special token: '%s' = %ud, using default id %d\n",
LLAMA_LOG_WARN("%s: bad special token: '%s' = %u, using default id %d\n",
__func__, key.c_str(), new_id, id);
} else {
id = new_id;
@@ -2673,8 +2667,8 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
case LLAMA_VOCAB_TYPE_WPM:
{
if (add_special) {
GGML_ASSERT(special_cls_id != LLAMA_TOKEN_NULL);
output.push_back(special_cls_id);
GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
output.push_back(special_bos_id);
}
llm_tokenizer_wpm_session session(vocab);
@@ -2971,7 +2965,6 @@ void llama_vocab::impl::print_info() const {
if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token[special_unk_id].text.c_str() ); }
if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token[special_sep_id].text.c_str() ); }
if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token[special_pad_id].text.c_str() ); }
if (special_cls_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, special_cls_id, id_to_token[special_cls_id].text.c_str() ); }
if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token[special_mask_id].text.c_str() ); }
if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token[linefeed_id].text.c_str() ); }
@@ -3106,7 +3099,7 @@ llama_token_attr llama_vocab::token_get_attr(llama_token id) const {
}
llama_token llama_vocab::token_bos() const {
return pimpl->type != LLAMA_VOCAB_TYPE_WPM ? pimpl->special_bos_id : pimpl->special_cls_id;
return pimpl->special_bos_id;
}
llama_token llama_vocab::token_eos() const {
@@ -3125,10 +3118,6 @@ llama_token llama_vocab::token_unk() const {
return pimpl->special_unk_id;
}
llama_token llama_vocab::token_cls() const {
return pimpl->special_cls_id;
}
llama_token llama_vocab::token_sep() const {
return pimpl->special_sep_id;
}
@@ -3348,8 +3337,9 @@ llama_token llama_vocab_eot(const struct llama_vocab * vocab) {
return vocab->token_eot();
}
// deprecated
llama_token llama_vocab_cls(const struct llama_vocab * vocab) {
return vocab->token_cls();
return vocab->token_bos();
}
llama_token llama_vocab_sep(const struct llama_vocab * vocab) {
@@ -3438,7 +3428,8 @@ llama_token llama_token_eot(const struct llama_vocab * vocab) {
// deprecated
llama_token llama_token_cls(const struct llama_vocab * vocab) {
return llama_vocab_cls(vocab);
//return llama_vocab_cls(vocab);
return llama_vocab_bos(vocab); // avoid deprecation warning
}
// deprecated

View file

@@ -54,7 +54,6 @@ struct llama_vocab {
llama_token token_eot() const;
llama_token token_eom() const;
llama_token token_unk() const;
llama_token token_cls() const;
llama_token token_sep() const;
llama_token token_nl () const;
llama_token token_pad() const;

View file

@@ -4678,7 +4678,7 @@ struct llm_build_context {
0);
cb(v_states, "v_states", il);
q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
q_pe = ggml_rope_ext(
ctx0, q_pe, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -4687,7 +4687,7 @@
cb(q_pe, "q_pe", il);
// shared RoPE key
k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
k_pe = ggml_rope_ext(
ctx0, k_pe, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -6532,7 +6532,7 @@ struct llm_build_context {
0);
cb(v_states, "v_states", il);
q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
q_pe = ggml_rope_ext(
ctx0, q_pe, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -6541,7 +6541,7 @@
cb(q_pe, "q_pe", il);
// shared RoPE key
k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
k_pe = ggml_rope_ext(
ctx0, k_pe, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,