diff --git a/Makefile b/Makefile
index 40e2a67c4..8b1982823 100644
--- a/Makefile
+++ b/Makefile
@@ -235,7 +235,7 @@ else
HCC := $(ROCM_PATH)/llvm/bin/clang
HCXX := $(ROCM_PATH)/llvm/bin/clang++
endif
- HIPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA -DSD_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
+ HIPFLAGS += -DGGML_USE_HIP -DGGML_HIP_NO_VMM -DGGML_USE_CUDA -DSD_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
HIPLDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
HIPLDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
HIPLDFLAGS += -lhipblas -lamdhip64 -lrocblas
diff --git a/colab.ipynb b/colab.ipynb
index 95d420885..4960bad68 100644
--- a/colab.ipynb
+++ b/colab.ipynb
@@ -48,7 +48,7 @@
"source": [
"#@title v-- Enter your model below and then click this to start Koboldcpp\n",
"\n",
- "Model = \"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\" #@param [\"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\",\"https://huggingface.co/KoboldAI/LLaMA2-13B-Estopia-GGUF/resolve/main/LLaMA2-13B-Estopia.Q4_K_S.gguf\",\"https://huggingface.co/mradermacher/Fimbulvetr-11B-v2-GGUF/resolve/main/Fimbulvetr-11B-v2.Q4_K_S.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-13B-GGUF/resolve/main/mythomax-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/ReMM-SLERP-L2-13B-GGUF/resolve/main/remm-slerp-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Xwin-LM-13B-v0.2-GGUF/resolve/main/xwin-lm-13b-v0.2.Q4_K_M.gguf\",\"https://huggingface.co/mradermacher/mini-magnum-12b-v1.1-GGUF/resolve/main/mini-magnum-12b-v1.1.Q4_K_S.gguf\",\"https://huggingface.co/TheBloke/Stheno-L2-13B-GGUF/resolve/main/stheno-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-Kimiko-v2-13B-GGUF/resolve/main/mythomax-l2-kimiko-v2-13b.Q4_K_M.gguf\",\"https://huggingface.co/bartowski/Rocinante-12B-v1.1-GGUF/resolve/main/Rocinante-12B-v1.1-Q4_K_S.gguf\",\"https://huggingface.co/KoboldAI/Llama-3.1-8B-BookAdventures-GGUF/resolve/main/Llama-3.1-8B-BookAdventures.Q4_K_S.gguf\",\"https://huggingface.co/TheBloke/MistRP-Airoboros-7B-GGUF/resolve/main/mistrp-airoboros-7b.Q4_K_S.gguf\",\"https://huggingface.co/TheBloke/airoboros-mistral2.2-7B-GGUF/resolve/main/airoboros-mistral2.2-7b.Q4_K_S.gguf\",\"https://huggingface.co/concedo/KobbleTinyV2-1.1B-GGUF/resolve/main/KobbleTiny-Q4_K.gguf\",\"https://huggingface.co/grimjim/kukulemon-7B-GGUF/resolve/main/kukulemon-7B.Q8_0.gguf\",\"https://huggingface.co/mradermacher/LemonKunoichiWizardV3-GGUF/resolve/main/LemonKunoichiWizardV3.Q4_K_M.gguf\",\"https://huggingface.co/Lewdiculous/Kunoichi-DPO-v2-7B-GGUF-Imatrix/resolve/main/Kunoichi-DPO-v2-7B-Q4_K_M-imatrix.gguf\",\"https://huggingface.co/mradermacher/L3-8B-Stheno-v3.2-i1-GGUF/resolve/main/L3-8B-Stheno-v3.2.i1-Q4_K_M.gguf\",\"https://huggingface.co/Lewdiculous/Llama-3-Lumimaid-8B-v0.1-OAS-GGUF-IQ-Imatrix/resolve/main/v2-Llama-3-Lumimaid-8B-v0.1-OAS-Q4_K_M-imat.gguf\",\"https://huggingface.co/bartowski/NeuralDaredevil-8B-abliterated-GGUF/resolve/main/NeuralDaredevil-8B-abliterated-Q4_K_M.gguf\",\"https://huggingface.co/bartowski/L3-8B-Lunaris-v1-GGUF/resolve/main/L3-8B-Lunaris-v1-Q4_K_M.gguf\",\"https://huggingface.co/mradermacher/L3-Umbral-Mind-RP-v2.0-8B-GGUF/resolve/main/L3-Umbral-Mind-RP-v2.0-8B.Q4_K_M.gguf\",\"https://huggingface.co/bartowski/TheDrummer_Cydonia-24B-v2-GGUF/resolve/main/TheDrummer_Cydonia-24B-v2-Q4_K_S.gguf\"]{allow-input: true}\n",
+ "Model = \"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\" #@param [\"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\",\"https://huggingface.co/KoboldAI/LLaMA2-13B-Estopia-GGUF/resolve/main/LLaMA2-13B-Estopia.Q4_K_S.gguf\",\"https://huggingface.co/mradermacher/Fimbulvetr-11B-v2-GGUF/resolve/main/Fimbulvetr-11B-v2.Q4_K_S.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-13B-GGUF/resolve/main/mythomax-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/ReMM-SLERP-L2-13B-GGUF/resolve/main/remm-slerp-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Xwin-LM-13B-v0.2-GGUF/resolve/main/xwin-lm-13b-v0.2.Q4_K_M.gguf\",\"https://huggingface.co/mradermacher/mini-magnum-12b-v1.1-GGUF/resolve/main/mini-magnum-12b-v1.1.Q4_K_S.gguf\",\"https://huggingface.co/TheBloke/Stheno-L2-13B-GGUF/resolve/main/stheno-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-Kimiko-v2-13B-GGUF/resolve/main/mythomax-l2-kimiko-v2-13b.Q4_K_M.gguf\",\"https://huggingface.co/bartowski/Rocinante-12B-v1.1-GGUF/resolve/main/Rocinante-12B-v1.1-Q4_K_S.gguf\",\"https://huggingface.co/KoboldAI/Llama-3.1-8B-BookAdventures-GGUF/resolve/main/Llama-3.1-8B-BookAdventures.Q4_K_S.gguf\",\"https://huggingface.co/TheBloke/MistRP-Airoboros-7B-GGUF/resolve/main/mistrp-airoboros-7b.Q4_K_S.gguf\",\"https://huggingface.co/TheBloke/airoboros-mistral2.2-7B-GGUF/resolve/main/airoboros-mistral2.2-7b.Q4_K_S.gguf\",\"https://huggingface.co/concedo/KobbleTinyV2-1.1B-GGUF/resolve/main/KobbleTiny-Q4_K.gguf\",\"https://huggingface.co/grimjim/kukulemon-7B-GGUF/resolve/main/kukulemon-7B.Q8_0.gguf\",\"https://huggingface.co/mradermacher/LemonKunoichiWizardV3-GGUF/resolve/main/LemonKunoichiWizardV3.Q4_K_M.gguf\",\"https://huggingface.co/Lewdiculous/Kunoichi-DPO-v2-7B-GGUF-Imatrix/resolve/main/Kunoichi-DPO-v2-7B-Q4_K_M-imatrix.gguf\",\"https://huggingface.co/mradermacher/L3-8B-Stheno-v3.2-i1-GGUF/resolve/main/L3-8B-Stheno-v3.2.i1-Q4_K_M.gguf\",\"https://huggingface.co/Lewdiculous/Llama-3-Lumimaid-8B-v0.1-OAS-GGUF-IQ-Imatrix/resolve/main/v2-Llama-3-Lumimaid-8B-v0.1-OAS-Q4_K_M-imat.gguf\",\"https://huggingface.co/bartowski/NeuralDaredevil-8B-abliterated-GGUF/resolve/main/NeuralDaredevil-8B-abliterated-Q4_K_M.gguf\",\"https://huggingface.co/bartowski/L3-8B-Lunaris-v1-GGUF/resolve/main/L3-8B-Lunaris-v1-Q4_K_M.gguf\",\"https://huggingface.co/mradermacher/L3-Umbral-Mind-RP-v2.0-8B-GGUF/resolve/main/L3-Umbral-Mind-RP-v2.0-8B.Q4_K_M.gguf\",\"https://huggingface.co/bartowski/TheDrummer_Cydonia-24B-v2-GGUF/resolve/main/TheDrummer_Cydonia-24B-v2-Q4_K_S.gguf\",\"https://huggingface.co/bartowski/PocketDoc_Dans-PersonalityEngine-V1.2.0-24b-GGUF/resolve/main/PocketDoc_Dans-PersonalityEngine-V1.2.0-24b-IQ4_XS.gguf\"]{allow-input: true}\n",
"Layers = 99 #@param [99]{allow-input: true}\n",
"ContextSize = 4096 #@param [4096,8192] {allow-input: true}\n",
"FlashAttention = True #@param {type:\"boolean\"}\n",
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 8b7c75d85..6358a94e9 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -699,6 +699,9 @@ class Model:
if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
# ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
res = "deepseek-r1-qwen"
+ if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e":
+ # ref: https://huggingface.co/Xenova/gpt-4o
+ res = "gpt-4o"
if res is None:
logger.warning("\n")
@@ -2512,7 +2515,8 @@ class Phi3MiniModel(Model):
rms_eps = self.find_hparam(["rms_norm_eps"])
max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
- rope_dims = n_embd // n_head
+ rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
+ rope_dims = int(rot_pct * n_embd) // n_head
self.gguf_writer.add_context_length(max_pos_embds)
self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
@@ -2536,7 +2540,8 @@ class Phi3MiniModel(Model):
n_head = self.find_hparam(["num_attention_heads", "n_head"])
max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
- rope_dims = n_embd // n_head
+ rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
+ rope_dims = int(rot_pct * n_embd) // n_head
# write rope scaling for long context (128k) model
rope_scaling = self.find_hparam(['rope_scaling'], True)
@@ -2565,7 +2570,7 @@ class Phi3MiniModel(Model):
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
- raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+ raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}. long_factors = {len(long_factors)}, short_factors = {len(short_factors)}.')
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index fa4989a80..07d3ce0e4 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -109,6 +109,7 @@ models = [
{"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
{"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
{"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
+ {"name": "gpt-4o", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", },
]
@@ -131,6 +132,10 @@ def download_model(model):
files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
+ if name == "gpt-4o":
+ # Xenova/gpt-4o is tokenizer-only, it does not contain config.json
+ files = ["tokenizer.json", "tokenizer_config.json"]
+
if tokt == TOKENIZER_TYPE.SPM:
files.append("tokenizer.model")
diff --git a/docs/function-calling.md b/docs/function-calling.md
new file mode 100644
index 000000000..92cb6531a
--- /dev/null
+++ b/docs/function-calling.md
@@ -0,0 +1,390 @@
+# Function Calling
+
+[chat.h](../common/chat.h) (https://github.com/ggml-org/llama.cpp/pull/9639) adds support for [OpenAI-style function calling](https://platform.openai.com/docs/guides/function-calling) and is used in:
+- `llama-server` when started w/ `--jinja` flag
+- `llama-cli` (WIP: https://github.com/ggml-org/llama.cpp/pull/11556)
+
+## Universal support w/ Native & Generic handlers
+
+Function calling is supported for all models (see https://github.com/ggml-org/llama.cpp/pull/9639):
+
+- Native tool call formats supported:
+ - Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2
+ - Functionary v3.1 / v3.2
+ - Hermes 2/3, Qwen 2.5
+ - Qwen 2.5 Coder (WIP: https://github.com/ggml-org/llama.cpp/pull/12034)
+ - Mistral Nemo
+ - Firefunction v2
+ - Command R7B
+ - DeepSeek R1 (WIP / seems reluctant to call any tools?)
+
+- Generic tool call is supported when the template isn't recognized by native format handlers (you'll see `Chat format: Generic` in the logs).
+ - Use `--chat-template-file` to override the template when appropriate (see examples below)
+ - Generic support may consume more tokens and be less efficient than a model's native format.
+
+
+The table below lists some common templates and the format handler each one maps to:
+
+| Template | Format |
+|----------|--------|
+| Almawave-Velvet-14B.jinja | Hermes 2 Pro |
+| AtlaAI-Selene-1-Mini-Llama-3.1-8B.jinja | Llama 3.x |
+| CohereForAI-aya-expanse-8b.jinja | Generic |
+| CohereForAI-c4ai-command-r-plus-default.jinja | Generic |
+| CohereForAI-c4ai-command-r-plus-rag.jinja | Generic |
+| CohereForAI-c4ai-command-r-plus-tool_use.jinja | Generic |
+| CohereForAI-c4ai-command-r7b-12-2024-default.jinja | Command R7B (extract reasoning) |
+| CohereForAI-c4ai-command-r7b-12-2024-rag.jinja | Command R7B (extract reasoning) |
+| CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja | Command R7B (extract reasoning) |
+| CohereForAI-c4ai-command-r7b-12-2024.jinja | Generic |
+| DavieLion-Llama-3.2-1B-SPIN-iter3.jinja | Generic |
+| Delta-Vector-Rei-12B.jinja | Mistral Nemo |
+| EpistemeAI-Mistral-Nemo-Instruct-12B-Philosophy-Math.jinja | Mistral Nemo |
+| FlofloB-83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit.jinja | Hermes 2 Pro |
+| FlofloB-test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit.jinja | Generic |
+| HelpingAI-HAI-SER.jinja | Generic |
+| HuggingFaceTB-SmolLM2-1.7B-Instruct.jinja | Generic |
+| HuggingFaceTB-SmolLM2-135M-Instruct.jinja | Generic |
+| HuggingFaceTB-SmolLM2-360M-Instruct.jinja | Generic |
+| INSAIT-Institute-BgGPT-Gemma-2-27B-IT-v1.0.jinja | Generic |
+| Ihor-Text2Graph-R1-Qwen2.5-0.5b.jinja | Hermes 2 Pro |
+| Infinigence-Megrez-3B-Instruct.jinja | Generic |
+| Josephgflowers-TinyLlama_v1.1_math_code-world-test-1.jinja | Generic |
+| LGAI-EXAONE-EXAONE-3.5-2.4B-Instruct.jinja | Generic |
+| LGAI-EXAONE-EXAONE-3.5-7.8B-Instruct.jinja | Generic |
+| LatitudeGames-Wayfarer-12B.jinja | Generic |
+| Magpie-Align-Llama-3-8B-Magpie-Align-v0.1.jinja | Generic |
+| Magpie-Align-Llama-3.1-8B-Magpie-Align-v0.1.jinja | Generic |
+| MaziyarPanahi-calme-3.2-instruct-78b.jinja | Generic |
+| MiniMaxAI-MiniMax-Text-01.jinja | Generic |
+| MiniMaxAI-MiniMax-VL-01.jinja | Generic |
+| NaniDAO-deepseek-r1-qwen-2.5-32B-ablated.jinja | DeepSeek R1 (extract reasoning) |
+| NexaAIDev-Octopus-v2.jinja | Generic |
+| NousResearch-Hermes-2-Pro-Llama-3-8B-default.jinja | Generic |
+| NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja | Hermes 2 Pro |
+| NousResearch-Hermes-2-Pro-Mistral-7B-default.jinja | Generic |
+| NousResearch-Hermes-2-Pro-Mistral-7B-tool_use.jinja | Hermes 2 Pro |
+| NousResearch-Hermes-3-Llama-3.1-70B-default.jinja | Generic |
+| NousResearch-Hermes-3-Llama-3.1-70B-tool_use.jinja | Hermes 2 Pro |
+| NovaSky-AI-Sky-T1-32B-Flash.jinja | Hermes 2 Pro |
+| NovaSky-AI-Sky-T1-32B-Preview.jinja | Hermes 2 Pro |
+| OnlyCheeini-greesychat-turbo.jinja | Generic |
+| Orenguteng-Llama-3.1-8B-Lexi-Uncensored-V2.jinja | Llama 3.x |
+| OrionStarAI-Orion-14B-Chat.jinja | Generic |
+| PowerInfer-SmallThinker-3B-Preview.jinja | Generic |
+| PrimeIntellect-INTELLECT-1-Instruct.jinja | Generic |
+| Qwen-QVQ-72B-Preview.jinja | Generic |
+| Qwen-QwQ-32B-Preview.jinja | Hermes 2 Pro |
+| Qwen-Qwen1.5-7B-Chat.jinja | Generic |
+| Qwen-Qwen2-7B-Instruct.jinja | Generic |
+| Qwen-Qwen2-VL-72B-Instruct.jinja | Generic |
+| Qwen-Qwen2-VL-7B-Instruct.jinja | Generic |
+| Qwen-Qwen2.5-0.5B.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-1.5B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-14B-Instruct-1M.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-14B.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-32B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-32B.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-3B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-72B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-7B-Instruct-1M.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-7B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-7B.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-Coder-32B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-Coder-7B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-Math-1.5B.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-Math-7B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-VL-3B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-VL-72B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-VL-7B-Instruct.jinja | Hermes 2 Pro |
+| RWKV-Red-Team-ARWKV-7B-Preview-0.1.jinja | Hermes 2 Pro |
+| SakanaAI-TinySwallow-1.5B-Instruct.jinja | Hermes 2 Pro |
+| SakanaAI-TinySwallow-1.5B.jinja | Hermes 2 Pro |
+| Sao10K-70B-L3.3-Cirrus-x1.jinja | Llama 3.x |
+| SentientAGI-Dobby-Mini-Leashed-Llama-3.1-8B.jinja | Llama 3.x |
+| SentientAGI-Dobby-Mini-Unhinged-Llama-3.1-8B.jinja | Llama 3.x |
+| Steelskull-L3.3-Damascus-R1.jinja | Llama 3.x |
+| Steelskull-L3.3-MS-Nevoria-70b.jinja | Llama 3.x |
+| Steelskull-L3.3-Nevoria-R1-70b.jinja | Llama 3.x |
+| THUDM-glm-4-9b-chat.jinja | Generic |
+| THUDM-glm-edge-1.5b-chat.jinja | Generic |
+| Tarek07-Progenitor-V1.1-LLaMa-70B.jinja | Llama 3.x |
+| TheBloke-FusionNet_34Bx2_MoE-AWQ.jinja | Generic |
+| TinyLlama-TinyLlama-1.1B-Chat-v1.0.jinja | Generic |
+| UCLA-AGI-Mistral7B-PairRM-SPPO-Iter3.jinja | Generic |
+| ValiantLabs-Llama3.1-8B-Enigma.jinja | Llama 3.x |
+| abacusai-Fewshot-Metamath-OrcaVicuna-Mistral.jinja | Generic |
+| ai21labs-AI21-Jamba-1.5-Large.jinja | Generic |
+| allenai-Llama-3.1-Tulu-3-405B-SFT.jinja | Generic |
+| allenai-Llama-3.1-Tulu-3-405B.jinja | Generic |
+| allenai-Llama-3.1-Tulu-3-8B.jinja | Generic |
+| arcee-ai-Virtuoso-Lite.jinja | Hermes 2 Pro |
+| arcee-ai-Virtuoso-Medium-v2.jinja | Hermes 2 Pro |
+| arcee-ai-Virtuoso-Small-v2.jinja | Hermes 2 Pro |
+| avemio-GRAG-NEMO-12B-ORPO-HESSIAN-AI.jinja | Generic |
+| bespokelabs-Bespoke-Stratos-7B.jinja | Hermes 2 Pro |
+| bfuzzy1-acheron-m1a-llama.jinja | Generic |
+| bofenghuang-vigogne-2-70b-chat.jinja | Generic |
+| bytedance-research-UI-TARS-72B-DPO.jinja | Generic |
+| bytedance-research-UI-TARS-7B-DPO.jinja | Generic |
+| bytedance-research-UI-TARS-7B-SFT.jinja | Generic |
+| carsenk-phi3.5_mini_exp_825_uncensored.jinja | Generic |
+| cyberagent-DeepSeek-R1-Distill-Qwen-14B-Japanese.jinja | DeepSeek R1 (extract reasoning) |
+| cyberagent-DeepSeek-R1-Distill-Qwen-32B-Japanese.jinja | DeepSeek R1 (extract reasoning) |
+| databricks-dbrx-instruct.jinja | Generic |
+| deepseek-ai-DeepSeek-Coder-V2-Instruct.jinja | Generic |
+| deepseek-ai-DeepSeek-Coder-V2-Lite-Base.jinja | Generic |
+| deepseek-ai-DeepSeek-Coder-V2-Lite-Instruct.jinja | Generic |
+| deepseek-ai-DeepSeek-R1-Distill-Llama-70B.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-R1-Distill-Qwen-14B.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-R1-Distill-Qwen-7B.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-R1-Zero.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-R1.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-V2-Lite.jinja | Generic |
+| deepseek-ai-DeepSeek-V2.5.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-V3.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-deepseek-coder-33b-instruct.jinja | Generic |
+| deepseek-ai-deepseek-coder-6.7b-instruct.jinja | Generic |
+| deepseek-ai-deepseek-coder-7b-instruct-v1.5.jinja | Generic |
+| deepseek-ai-deepseek-llm-67b-chat.jinja | Generic |
+| deepseek-ai-deepseek-llm-7b-chat.jinja | Generic |
+| dicta-il-dictalm2.0-instruct.jinja | Generic |
+| ehristoforu-Falcon3-8B-Franken-Basestruct.jinja | Hermes 2 Pro |
+| fireworks-ai-llama-3-firefunction-v2.jinja | FireFunction v2 |
+| godlikehhd-alpaca_data_sampled_ifd_new_5200.jinja | Hermes 2 Pro |
+| godlikehhd-alpaca_data_score_max_0.7_2600.jinja | Hermes 2 Pro |
+| google-gemma-2-27b-it.jinja | Generic |
+| google-gemma-2-2b-it.jinja | Generic |
+| google-gemma-2-2b-jpn-it.jinja | Generic |
+| google-gemma-7b-it.jinja | Generic |
+| huihui-ai-DeepSeek-R1-Distill-Llama-70B-abliterated.jinja | DeepSeek R1 (extract reasoning) |
+| huihui-ai-DeepSeek-R1-Distill-Llama-8B-abliterated.jinja | DeepSeek R1 (extract reasoning) |
+| huihui-ai-DeepSeek-R1-Distill-Qwen-14B-abliterated-v2.jinja | DeepSeek R1 (extract reasoning) |
+| huihui-ai-DeepSeek-R1-Distill-Qwen-32B-abliterated.jinja | DeepSeek R1 (extract reasoning) |
+| huihui-ai-DeepSeek-R1-Distill-Qwen-7B-abliterated-v2.jinja | DeepSeek R1 (extract reasoning) |
+| huihui-ai-Qwen2.5-14B-Instruct-1M-abliterated.jinja | Hermes 2 Pro |
+| ibm-granite-granite-3.1-8b-instruct.jinja | Generic |
+| indischepartij-MiniCPM-3B-OpenHermes-2.5-v2.jinja | Generic |
+| inflatebot-MN-12B-Mag-Mell-R1.jinja | Generic |
+| jinaai-ReaderLM-v2.jinja | Generic |
+| kms7530-chemeng_qwen-math-7b_24_1_100_1_nonmath.jinja | Hermes 2 Pro |
+| knifeayumu-Cydonia-v1.3-Magnum-v4-22B.jinja | Mistral Nemo |
+| langgptai-qwen1.5-7b-chat-sa-v0.1.jinja | Generic |
+| lightblue-DeepSeek-R1-Distill-Qwen-7B-Japanese.jinja | DeepSeek R1 (extract reasoning) |
+| mattshumer-Reflection-Llama-3.1-70B.jinja | Generic |
+| meetkai-functionary-medium-v3.1.jinja | Functionary v3.1 Llama 3.1 |
+| meetkai-functionary-medium-v3.2.jinja | Functionary v3.2 |
+| meta-llama-Llama-2-7b-chat-hf.jinja | Generic |
+| meta-llama-Llama-3.1-8B-Instruct.jinja | Llama 3.x |
+| meta-llama-Llama-3.2-11B-Vision-Instruct.jinja | Llama 3.x |
+| meta-llama-Llama-3.2-1B-Instruct.jinja | Llama 3.x |
+| meta-llama-Llama-3.2-3B-Instruct.jinja | Llama 3.x |
+| meta-llama-Llama-3.3-70B-Instruct.jinja | Llama 3.x |
+| meta-llama-Meta-Llama-3-8B-Instruct.jinja | Generic |
+| meta-llama-Meta-Llama-3.1-8B-Instruct.jinja | Llama 3.x |
+| microsoft-Phi-3-medium-4k-instruct.jinja | Generic |
+| microsoft-Phi-3-mini-4k-instruct.jinja | Generic |
+| microsoft-Phi-3-small-8k-instruct.jinja | Generic |
+| microsoft-Phi-3.5-mini-instruct.jinja | Generic |
+| microsoft-Phi-3.5-vision-instruct.jinja | Generic |
+| microsoft-phi-4.jinja | Generic |
+| migtissera-Tess-3-Mistral-Nemo-12B.jinja | Generic |
+| ministral-Ministral-3b-instruct.jinja | Generic |
+| mistralai-Codestral-22B-v0.1.jinja | Generic |
+| mistralai-Mistral-7B-Instruct-v0.1.jinja | Generic |
+| mistralai-Mistral-7B-Instruct-v0.2.jinja | Generic |
+| mistralai-Mistral-7B-Instruct-v0.3.jinja | Mistral Nemo |
+| mistralai-Mistral-Large-Instruct-2407.jinja | Mistral Nemo |
+| mistralai-Mistral-Large-Instruct-2411.jinja | Generic |
+| mistralai-Mistral-Nemo-Instruct-2407.jinja | Mistral Nemo |
+| mistralai-Mistral-Small-24B-Instruct-2501.jinja | Generic |
+| mistralai-Mixtral-8x7B-Instruct-v0.1.jinja | Generic |
+| mkurman-Qwen2.5-14B-DeepSeek-R1-1M.jinja | Hermes 2 Pro |
+| mlabonne-AlphaMonarch-7B.jinja | Generic |
+| mlx-community-Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32.jinja | Hermes 2 Pro |
+| mlx-community-Qwen2.5-VL-7B-Instruct-8bit.jinja | Hermes 2 Pro |
+| mobiuslabsgmbh-DeepSeek-R1-ReDistill-Qwen-1.5B-v1.1.jinja | DeepSeek R1 (extract reasoning) |
+| netcat420-MFANNv0.20.jinja | Generic |
+| netcat420-MFANNv0.24.jinja | Generic |
+| netease-youdao-Confucius-o1-14B.jinja | Hermes 2 Pro |
+| nvidia-AceMath-7B-RM.jinja | Hermes 2 Pro |
+| nvidia-Eagle2-1B.jinja | Hermes 2 Pro |
+| nvidia-Eagle2-9B.jinja | Hermes 2 Pro |
+| nvidia-Llama-3.1-Nemotron-70B-Instruct-HF.jinja | Llama 3.x |
+| onnx-community-DeepSeek-R1-Distill-Qwen-1.5B-ONNX.jinja | DeepSeek R1 (extract reasoning) |
+| open-thoughts-OpenThinker-7B.jinja | Hermes 2 Pro |
+| openchat-openchat-3.5-0106.jinja | Generic |
+| pankajmathur-orca_mini_v6_8b.jinja | Generic |
+| princeton-nlp-Mistral-7B-Base-SFT-RDPO.jinja | Generic |
+| princeton-nlp-Mistral-7B-Instruct-DPO.jinja | Generic |
+| princeton-nlp-Mistral-7B-Instruct-RDPO.jinja | Generic |
+| prithivMLmods-Bellatrix-Tiny-1.5B-R1.jinja | Hermes 2 Pro |
+| prithivMLmods-Bellatrix-Tiny-1B-R1.jinja | Llama 3.x |
+| prithivMLmods-Bellatrix-Tiny-1B-v3.jinja | Generic |
+| prithivMLmods-Bellatrix-Tiny-3B-R1.jinja | Llama 3.x |
+| prithivMLmods-Blaze-14B-xElite.jinja | Generic |
+| prithivMLmods-Calcium-Opus-14B-Elite2-R1.jinja | Hermes 2 Pro |
+| prithivMLmods-Calme-Ties-78B.jinja | Generic |
+| prithivMLmods-Calme-Ties2-78B.jinja | Generic |
+| prithivMLmods-Calme-Ties3-78B.jinja | Generic |
+| prithivMLmods-ChemQwen2-vL.jinja | Generic |
+| prithivMLmods-GWQ2b.jinja | Generic |
+| prithivMLmods-LatexMind-2B-Codec.jinja | Generic |
+| prithivMLmods-Llama-3.2-6B-AlgoCode.jinja | Llama 3.x |
+| prithivMLmods-Megatron-Opus-14B-Exp.jinja | Hermes 2 Pro |
+| prithivMLmods-Megatron-Opus-14B-Stock.jinja | Hermes 2 Pro |
+| prithivMLmods-Megatron-Opus-7B-Exp.jinja | Hermes 2 Pro |
+| prithivMLmods-Omni-Reasoner-Merged.jinja | Hermes 2 Pro |
+| prithivMLmods-Omni-Reasoner4-Merged.jinja | Hermes 2 Pro |
+| prithivMLmods-Primal-Opus-14B-Optimus-v1.jinja | Hermes 2 Pro |
+| prithivMLmods-QwQ-Math-IO-500M.jinja | Hermes 2 Pro |
+| prithivMLmods-Qwen-7B-Distill-Reasoner.jinja | DeepSeek R1 (extract reasoning) |
+| prithivMLmods-Qwen2.5-1.5B-DeepSeek-R1-Instruct.jinja | Hermes 2 Pro |
+| prithivMLmods-Qwen2.5-14B-DeepSeek-R1-1M.jinja | Hermes 2 Pro |
+| prithivMLmods-Qwen2.5-32B-DeepSeek-R1-Instruct.jinja | Hermes 2 Pro |
+| prithivMLmods-Qwen2.5-7B-DeepSeek-R1-1M.jinja | Hermes 2 Pro |
+| prithivMLmods-Triangulum-v2-10B.jinja | Hermes 2 Pro |
+| qingy2024-Falcon3-2x10B-MoE-Instruct.jinja | Hermes 2 Pro |
+| rubenroy-Zurich-14B-GCv2-5m.jinja | Hermes 2 Pro |
+| rubenroy-Zurich-7B-GCv2-5m.jinja | Hermes 2 Pro |
+| silma-ai-SILMA-Kashif-2B-Instruct-v1.0.jinja | Generic |
+| simplescaling-s1-32B.jinja | Hermes 2 Pro |
+| sometimesanotion-Lamarck-14B-v0.7.jinja | Hermes 2 Pro |
+| sonthenguyen-zephyr-sft-bnb-4bit-DPO-mtbr-180steps.jinja | Generic |
+| sthenno-tempesthenno-icy-0130.jinja | Generic |
+| sumink-qwft.jinja | Hermes 2 Pro |
+| teknium-OpenHermes-2.5-Mistral-7B.jinja | Generic |
+| thirdeyeai-elevate360m.jinja | Generic |
+| tiiuae-Falcon3-10B-Instruct.jinja | Hermes 2 Pro |
+| unsloth-DeepSeek-R1-Distill-Llama-8B-unsloth-bnb-4bit.jinja | DeepSeek R1 (extract reasoning) |
+| unsloth-DeepSeek-R1-Distill-Llama-8B.jinja | DeepSeek R1 (extract reasoning) |
+| unsloth-DeepSeek-R1.jinja | DeepSeek R1 (extract reasoning) |
+| unsloth-Mistral-Small-24B-Instruct-2501-unsloth-bnb-4bit.jinja | Generic |
+| upstage-solar-pro-preview-instruct.jinja | Generic |
+| whyhow-ai-PatientSeek.jinja | Generic |
+| xwen-team-Xwen-72B-Chat.jinja | Hermes 2 Pro |
+| xwen-team-Xwen-7B-Chat.jinja | Hermes 2 Pro |
+
+This table can be generated with:
+
+```bash
+./build/bin/test-chat ../minja/build/tests/*.jinja 2>/dev/null
+```
+
+
+## Usage - requires a tool-aware Jinja template
+
+First, start a server with any model, but make sure it has a tools-enabled template: you can verify this by inspecting the `chat_template` or `chat_template_tool_use` properties in `http://localhost:8080/props`.
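+
+For example, here is a minimal sketch (standard library only) that checks the `/props` endpoint; it assumes the server is already running on the default `localhost:8080`:
+
+```python
+# Sketch: confirm the loaded template is tool-aware before sending tool calls.
+import json
+import urllib.request
+
+with urllib.request.urlopen("http://localhost:8080/props") as resp:
+    props = json.load(resp)
+
+# Either a dedicated tool_use template or a default template that mentions tools works.
+has_tool_template = bool(props.get("chat_template_tool_use")) or "tools" in props.get("chat_template", "")
+print("tool-aware template:", has_tool_template)
+```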
+
+Here are some models known to work (w/ chat template override when needed):
+
+```shell
+# Native support:
+
+llama-server --jinja -fa -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M
+llama-server --jinja -fa -hf bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q6_K_L
+llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M
+llama-server --jinja -fa -hf bartowski/Llama-3.3-70B-Instruct-GGUF:Q4_K_M
+
+# Native support for DeepSeek R1 works best w/ our own template (official template buggy)
+
+llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q6_K_L \
+--chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
+
+llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M \
+--chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
+
+# Native support requires the right template for these GGUFs:
+
+llama-server --jinja -fa -hf bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M \
+--chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-2-Pro-Llama-3-8B tool_use )
+
+llama-server --jinja -fa -hf bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M \
+--chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use )
+
+llama-server --jinja -fa -hf bartowski/firefunction-v2-GGUF -hff firefunction-v2-IQ1_M.gguf \
+--chat-template-file <( python scripts/get_chat_template.py fireworks-ai/llama-3-firefunction-v2 tool_use )
+
+llama-server --jinja -fa -hf bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L \
+--chat-template-file <( python scripts/get_chat_template.py CohereForAI/c4ai-command-r7b-12-2024 tool_use )
+
+# Generic format support
+llama-server --jinja -fa -hf bartowski/phi-4-GGUF:Q4_0
+llama-server --jinja -fa -hf bartowski/gemma-2-2b-it-GGUF:Q8_0
+llama-server --jinja -fa -hf bartowski/c4ai-command-r-v01-GGUF:Q2_K
+```
+
+> [!TIP]
+> If there is no official `tool_use` Jinja template, you may want to set `--chat-template chatml` to use a default that works with many models (YMMV!), or write your own (e.g. we provide a custom [llama-cpp-deepseek-r1.jinja](../models/templates/llama-cpp-deepseek-r1.jinja) for DeepSeek R1 distills)
+
+Test from the CLI with `curl` (or with any library or software that can talk to OpenAI-compatible API backends):
+
+```bash
+curl http://localhost:8080/v1/chat/completions -d '{
+"model": "gpt-3.5-turbo",
+"tools": [
+ {
+ "type":"function",
+ "function":{
+ "name":"python",
+ "description":"Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
+ "parameters":{
+ "type":"object",
+ "properties":{
+ "code":{
+ "type":"string",
+ "description":"The code to run in the ipython interpreter."
+ }
+ },
+ "required":["code"]
+ }
+ }
+ }
+],
+"messages": [
+ {
+ "role": "user",
+ "content": "Print a hello world message with python."
+ }
+]
+}'
+```
+
+Example output:
+
+```json
+{
+"choices": [
+ {
+ "finish_reason": "tool",
+ "index": 0,
+ "message": {
+ "content": null,
+ "tool_calls": [
+ {
+ "name": "python",
+ "arguments": "{\"code\":\" \\nprint(\\\"Hello, World!\\\")\"}"
+ }
+ ],
+ "role": "assistant"
+ }
+ }
+],
+"created": 1727287211,
+"model": "gpt-3.5-turbo",
+"object": "chat.completion",
+"usage": {
+ "completion_tokens": 16,
+ "prompt_tokens": 44,
+ "total_tokens": 60
+},
+"id": "chatcmpl-Htbgh9feMmGM0LEH2hmQvwsCxq3c6Ni8"
+}
+```
+
+
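+The same request can be sent from any OpenAI-compatible client library. Below is a minimal sketch using the official `openai` Python package, assuming the server runs on the default `http://localhost:8080`; the API key is a placeholder, since `llama-server` only enforces one when started with `--api-key`:
+
+```python
+# Sketch: the curl request above, issued via the openai Python client.
+from openai import OpenAI
+
+client = OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")
+
+tools = [{
+    "type": "function",
+    "function": {
+        "name": "python",
+        "description": "Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "code": {"type": "string", "description": "The code to run in the ipython interpreter."}
+            },
+            "required": ["code"],
+        },
+    },
+}]
+
+response = client.chat.completions.create(
+    model="gpt-3.5-turbo",  # model name is not used to select a model on a single-model llama-server
+    messages=[{"role": "user", "content": "Print a hello world message with python."}],
+    tools=tools,
+)
+
+for tool_call in (response.choices[0].message.tool_calls or []):
+    print(tool_call.function.name, tool_call.function.arguments)
+```
+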
diff --git a/examples/llava/README-granitevision.md b/examples/llava/README-granitevision.md
new file mode 100644
index 000000000..f08a21cc1
--- /dev/null
+++ b/examples/llava/README-granitevision.md
@@ -0,0 +1,190 @@
+# Granite Vision
+
+Download the model and point your `GRANITE_MODEL` environment variable to the path.
+
+```bash
+$ git clone https://huggingface.co/ibm-granite/granite-vision-3.2-2b
+$ export GRANITE_MODEL=./granite-vision-3.2-2b
+```
+
+
+### 1. Running llava surgery v2
+First, we need to run the llava surgery script as shown below:
+
+`python llava_surgery_v2.py -C -m $GRANITE_MODEL`
+
+You should see two new files (`llava.clip` and `llava.projector`) written into your model's directory, as shown below.
+
+```bash
+$ ls $GRANITE_MODEL | grep -i llava
+llava.clip
+llava.projector
+```
+
+You should see that the projector and visual encoder have been split out into the llava files. Here is a quick check to make sure they aren't empty:
+```python
+import os
+import torch
+
+MODEL_PATH = os.getenv("GRANITE_MODEL")
+if not MODEL_PATH:
+ raise ValueError("env var GRANITE_MODEL is unset!")
+
+encoder_tensors = torch.load(os.path.join(MODEL_PATH, "llava.clip"))
+projector_tensors = torch.load(os.path.join(MODEL_PATH, "llava.projector"))
+
+assert len(encoder_tensors) > 0
+assert len(projector_tensors) > 0
+```
+
+If you actually inspect the `.keys()` of the loaded tensors, you should see a lot of `vision_model` tensors in the `encoder_tensors`, and 5 tensors (`'multi_modal_projector.linear_1.bias'`, `'multi_modal_projector.linear_1.weight'`, `'multi_modal_projector.linear_2.bias'`, `'multi_modal_projector.linear_2.weight'`, `'image_newline'`) in the multimodal `projector_tensors`.
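+
+For instance, reusing the tensors loaded in the snippet above, a quick optional way to list what was split out is:
+
+```python
+# Optional sanity check: reuses encoder_tensors / projector_tensors from the snippet above.
+vision_keys = [k for k in encoder_tensors.keys() if "vision_model" in k]
+print(f"{len(vision_keys)} vision_model tensors, e.g. {vision_keys[:3]}")
+print("projector tensors:", sorted(projector_tensors.keys()))
+```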
+
+
+### 2. Creating the Visual Component GGUF
+Next, create a new directory to hold the visual components, and copy the llava.clip/projector files, as shown below.
+
+```bash
+$ ENCODER_PATH=$PWD/visual_encoder
+$ mkdir $ENCODER_PATH
+
+$ cp $GRANITE_MODEL/llava.clip $ENCODER_PATH/pytorch_model.bin
+$ cp $GRANITE_MODEL/llava.projector $ENCODER_PATH/
+```
+
+Now, we need to write a config for the visual encoder. In order to convert the model, be sure to use the correct `image_grid_pinpoints`, as these may vary based on the model. You can find the `image_grid_pinpoints` in `$GRANITE_MODEL/config.json`.
+
+```json
+{
+ "_name_or_path": "siglip-model",
+ "architectures": [
+ "SiglipVisionModel"
+ ],
+ "image_grid_pinpoints": [
+ [384,384],
+ [384,768],
+ [384,1152],
+ [384,1536],
+ [384,1920],
+ [384,2304],
+ [384,2688],
+ [384,3072],
+ [384,3456],
+ [384,3840],
+ [768,384],
+ [768,768],
+ [768,1152],
+ [768,1536],
+ [768,1920],
+ [1152,384],
+ [1152,768],
+ [1152,1152],
+ [1536,384],
+ [1536,768],
+ [1920,384],
+ [1920,768],
+ [2304,384],
+ [2688,384],
+ [3072,384],
+ [3456,384],
+ [3840,384]
+ ],
+ "mm_patch_merge_type": "spatial_unpad",
+ "hidden_size": 1152,
+ "image_size": 384,
+ "intermediate_size": 4304,
+ "model_type": "siglip_vision_model",
+ "num_attention_heads": 16,
+ "num_hidden_layers": 27,
+ "patch_size": 14,
+ "layer_norm_eps": 1e-6,
+ "hidden_act": "gelu_pytorch_tanh",
+ "projection_dim": 0,
+ "vision_feature_layer": [-24, -20, -12, -1]
+}
+```
+
+At this point you should have something like this:
+```bash
+$ ls $ENCODER_PATH
+config.json llava.projector pytorch_model.bin
+```
+
+Now convert the components to GGUF. Note that we also override the image mean/std dev to `[0.5, 0.5, 0.5]` since we use the SigLIP visual encoder; in the transformers model, you can find these numbers in the `preprocessor_config.json`.
+```bash
+$ python convert_image_encoder_to_gguf.py \
+ -m $ENCODER_PATH \
+ --llava-projector $ENCODER_PATH/llava.projector \
+ --output-dir $ENCODER_PATH \
+ --clip-model-is-vision \
+ --clip-model-is-siglip \
+ --image-mean 0.5 0.5 0.5 \
+ --image-std 0.5 0.5 0.5
+```
+
+This will create the first GGUF file at `$ENCODER_PATH/mmproj-model-f16.gguf`; we will refer to the absolute path of this file as `$VISUAL_GGUF_PATH`.
+
+
+### 3. Creating the LLM GGUF
+The granite vision model contains a granite LLM as its language model. For now, the easiest way to get the GGUF for the LLM is by loading the composite model in `transformers` and exporting the LLM so that it can be directly converted with the normal conversion path.
+
+First, set `LLM_EXPORT_PATH` to the path where the `transformers` LLM should be exported.
+```bash
+$ export LLM_EXPORT_PATH=$PWD/granite_vision_llm
+```
+
+```python
+import os
+import transformers
+
+MODEL_PATH = os.getenv("GRANITE_MODEL")
+if not MODEL_PATH:
+ raise ValueError("env var GRANITE_MODEL is unset!")
+
+LLM_EXPORT_PATH = os.getenv("LLM_EXPORT_PATH")
+if not LLM_EXPORT_PATH:
+ raise ValueError("env var LLM_EXPORT_PATH is unset!")
+
+tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
+
+# NOTE: granite vision support was added to transformers very recently (4.49);
+# if you get size mismatches, your version is too old.
+# If you are running with an older version, set `ignore_mismatched_sizes=True`
+# as shown below; it won't be loaded correctly, but the LLM part of the model that
+# we are exporting will be loaded correctly.
+model = transformers.AutoModelForImageTextToText.from_pretrained(MODEL_PATH, ignore_mismatched_sizes=True)
+
+tokenizer.save_pretrained(LLM_EXPORT_PATH)
+model.language_model.save_pretrained(LLM_EXPORT_PATH)
+```
+
+Now you can convert the exported LLM to GGUF with the normal converter in the root of the llama.cpp project.
+```bash
+$ LLM_GGUF_PATH=$LLM_EXPORT_PATH/granite_llm.gguf
+...
+$ python convert_hf_to_gguf.py --outfile $LLM_GGUF_PATH $LLM_EXPORT_PATH
+```
+
+
+### 4. Quantization
+If you want to quantize the LLM, you can do so with `llama-quantize` as you would any other LLM. For example:
+```bash
+$ ./build/bin/llama-quantize $LLM_EXPORT_PATH/granite_llm.gguf $LLM_EXPORT_PATH/granite_llm_q4_k_m.gguf Q4_K_M
+$ LLM_GGUF_PATH=$LLM_EXPORT_PATH/granite_llm_q4_k_m.gguf
+```
+
+Note that currently you cannot quantize the visual encoder because granite vision models use SigLIP as the visual encoder, which has tensor dimensions that are not divisible by 32.
+
+
+### 5. Running the Model in llama.cpp
+Build llama.cpp normally; you should have a target binary named `llama-llava-cli`, to which you can pass the two GGUF files built above. As an example, we pass the llama.cpp banner.
+
+```bash
+$ ./build/bin/llama-llava-cli -m $LLM_GGUF_PATH \
+ --mmproj $VISUAL_GGUF_PATH \
+ --image ./media/llama0-banner.png \
+ -c 16384 \
+ -p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n<|user|>\n\\nWhat does the text in this image say?\n<|assistant|>\n" \
+ --temp 0
+```
+
+Sample output: `The text in the image reads "LLAMA C++ Can it run DOOM Llama?"`
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 3ddd4ce5b..ef5de2df1 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -43,6 +43,7 @@
#include