Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	CMakePresets.json
#	README.md
#	common/CMakeLists.txt
#	ggml/src/ggml-cann/ggml-cann.cpp
#	ggml/src/ggml-opencl/CMakeLists.txt
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	scripts/sync-ggml.last
#	tests/test-backend-ops.cpp
#	tools/run/CMakeLists.txt
Commit 8cebec5128 by Concedo, 2025-07-13 23:39:41 +08:00
41 changed files with 28682 additions and 366 deletions

.github/workflows/update-ops-docs.yml (new file, 40 lines)

@@ -0,0 +1,40 @@
name: Update Operations Documentation
on:
push:
paths:
- 'docs/ops/**'
- 'scripts/create_ops_docs.py'
pull_request:
paths:
- 'docs/ops/**'
- 'scripts/create_ops_docs.py'
jobs:
update-ops-docs:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.x'
- name: Generate operations documentation to temporary file
run: |
mkdir -p /tmp/ops_check
./scripts/create_ops_docs.py /tmp/ops_check/ops.md
- name: Check if docs/ops.md matches generated version
run: |
if ! diff -q docs/ops.md /tmp/ops_check/ops.md; then
echo "Operations documentation (docs/ops.md) is not up to date with the backend CSV files."
echo "To fix: run ./scripts/create_ops_docs.py and commit the updated docs/ops.md along with your changes"
echo "Differences found:"
diff docs/ops.md /tmp/ops_check/ops.md || true
exit 1
fi
echo "Operations documentation is up to date."


@@ -300,6 +300,7 @@ class ModelBase:
gguf.MODEL_TENSOR.POS_EMBD,
gguf.MODEL_TENSOR.TOKEN_TYPES,
gguf.MODEL_TENSOR.SSM_CONV1D,
gguf.MODEL_TENSOR.SHORTCONV_CONV,
gguf.MODEL_TENSOR.TIME_MIX_FIRST,
gguf.MODEL_TENSOR.TIME_MIX_W1,
gguf.MODEL_TENSOR.TIME_MIX_W2,
@@ -833,6 +834,12 @@ class TextModel(ModelBase):
if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b":
# ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base
res = "falcon-h1"
if chkhsh == "f6791d196f87ce6b56a7d234be618e0d58f8cda3549416635b2bebcd22cd95c4":
# ref: https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct
res = "midm-2.0"
if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
# ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
res = "lfm2"
if res is None:
logger.warning("\n")
@@ -4890,6 +4897,9 @@ class Mamba2Model(TextModel):
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
super().__init__(dir_model, *args, hparams=hparams, **kwargs)
self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model
self.n_group = self.find_hparam(["n_groups"], optional=True) or 1
def set_vocab(self):
vocab_size = self.hparams["vocab_size"]
@@ -4912,12 +4922,9 @@ class Mamba2Model(TextModel):
self._set_vocab_builtin("gpt-neox", vocab_size)
def set_gguf_parameters(self):
- d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
- d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * d_model
d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128
head_dim = self.find_hparam(["mamba_d_head", "head_dim"], optional=True) or 64
- n_group = self.find_hparam(["n_groups"], optional=True) or 1
rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
@@ -4925,19 +4932,19 @@ class Mamba2Model(TextModel):
# TODO: does this really matter?
# skip the assertion for FalconH1 Model
if self.model_arch != gguf.MODEL_ARCH.FALCON_H1:
- assert d_inner == 2 * d_model
assert self.d_inner == 2 * self.d_model
- assert d_inner % head_dim == 0
assert self.d_inner % head_dim == 0
self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
- self.gguf_writer.add_embedding_length(d_model)
self.gguf_writer.add_embedding_length(self.d_model)
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
self.gguf_writer.add_block_count(self.block_count)
self.gguf_writer.add_ssm_conv_kernel(d_conv)
- self.gguf_writer.add_ssm_inner_size(d_inner)
self.gguf_writer.add_ssm_inner_size(self.d_inner)
self.gguf_writer.add_ssm_state_size(d_state)
- self.gguf_writer.add_ssm_time_step_rank(d_inner // head_dim)
self.gguf_writer.add_ssm_time_step_rank(self.d_inner // head_dim)
- self.gguf_writer.add_ssm_group_count(n_group)
self.gguf_writer.add_ssm_group_count(self.n_group)
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
self.gguf_writer.add_file_type(self.ftype)
@@ -4962,10 +4969,7 @@ class Mamba2Model(TextModel):
# (D is also unsqueezed, but for more straightforward broadcast internally)
data_torch = data_torch.reshape((*data_torch.shape, 1))
elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid):
- d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
- d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * d_model
- n_group = self.hparams.get("n_groups", 1)
- data_torch = data_torch.reshape((n_group, d_inner // n_group))
data_torch = data_torch.reshape((self.n_group, self.d_inner // self.n_group))
if name.endswith(".A_log"):
logger.debug("A_log --> A ==> " + new_name)
@@ -6452,18 +6456,148 @@ class GraniteMoeModel(GraniteModel):
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
]
has_experts = bool(self.hparams.get('num_local_experts'))
if name.endswith("shared_mlp.input_linear.weight"):
ffn_dim = self.hparams["shared_intermediate_size"]
assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size"
gate, up = data_torch.split(ffn_dim, dim=-2)
if has_experts:
return [
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
]
return [
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), gate),
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), up),
]
if not has_experts and name.endswith("shared_mlp.output_linear.weight"):
return [
(self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), data_torch)
]
return super().modify_tensors(data_torch, name, bid)
@ModelBase.register("GraniteMoeHybridForCausalLM", "BambaForCausalLM")
class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
"""GraniteHybrid is a hybrid SSM + Attention model that uses Mamba2 SSM
layers and optionally uses MoE w/ a shared expert"""
model_arch = gguf.MODEL_ARCH.GRANITE_HYBRID
undo_permute = True
def __init__(self, *args, **kwargs):
# Hybrid mamba models use a prefix for the mamba-specific params.
# TODO: Extend this if the prefix(es) need to be configurable
self.hparam_prefixes = ["mamba"]
super().__init__(*args, **kwargs)
# Lists of which layers use ssm vs attention
self._attn_layers = self.get_attn_layers()
self._ssm_layers = [
i for i in range(self.block_count)
if i not in self._attn_layers
]
# n_group and d_inner are used during reshape_tensors for mamba2
self.d_model = self.find_hparam(["hidden_size", "d_model"])
self.n_group = self.find_hparam(["n_groups"])
self.d_inner = self.find_hparam(["expand"]) * self.d_model
def get_attn_layers(self):
# Explicit list of layer type names
if layer_types := self.hparams.get("layer_types"):
return [
i for i, typ in enumerate(layer_types)
if typ == "attention"
]
# Layer types indicated by index or period
attn_layers = self.hparams.get("attn_layer_indices", [])
if not attn_layers:
attn_period = self.hparams.get("attn_layer_period")
assert attn_period, "Didn't find attn_layer_indices or attn_layer_period"
attn_offset = self.hparams.get("attn_layer_offset")
assert attn_offset is not None, "No attention layer offset set with attn_layer_period"
attn_layers = [
i for i in range(self.block_count)
if i % attn_period == attn_offset
]
return attn_layers
def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any:
prefixed = []
for pfx in self.hparam_prefixes:
prefixed.extend(
"_".join([pfx, k])
for k in keys
)
keys = list(keys) + prefixed
return Mamba2Model.find_hparam(self, keys, *args, **kwargs)
def modify_tensors(
self, data_torch: Tensor, name: str, bid: int | None
) -> Iterable[tuple[str, Tensor]]:
if (
name.endswith("block_sparse_moe.input_linear.weight")
or "shared_mlp" in name
):
return GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
# Determine whether this is a mamba layer or an attention layer
if bid in self._ssm_layers:
return Mamba2Model.modify_tensors(self, data_torch, name, bid)
elif bid in self._attn_layers:
return GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
return [(self.map_tensor_name(name), data_torch)]
def set_gguf_parameters(self):
"""This method merges params from both parents and some that are
specific to this model. The result is some duplication of how the params
get set. The following warnings are expected during conversion:
WARNING:Duplicated key name 'granitehybrid.attention.head_count_kv'
WARNING:Duplicated key name 'granitehybrid.context_length'
"""
GraniteMoeModel.set_gguf_parameters(self)
## Mamba mixer params ##
self.gguf_writer.add_ssm_conv_kernel(self.find_hparam(["conv_kernel", "d_conv"]))
self.gguf_writer.add_ssm_state_size(self.find_hparam(["state_size", "d_state"]))
self.gguf_writer.add_ssm_group_count(self.n_group)
self.gguf_writer.add_ssm_inner_size(self.d_inner)
# NOTE: The mamba_dt_rank is _not_ the right field for how this is used
# in llama.cpp
self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["n_heads"]))
## Attention params ##
head_count_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
head_count_kv_vec = [
head_count_kv if i in self._attn_layers else 0 for i in range(self.block_count)
]
if rope_dim := self.hparams.get("attn_rotary_emb"):
self.gguf_writer.add_rope_dimension_count(rope_dim)
self.gguf_writer.add_head_count_kv(head_count_kv_vec)
## If Bamba, use rope, otherwise don't
use_rope = "BambaForCausalLM" in self.hparams["architectures"]
self.gguf_writer.add_rope_scaling_finetuned(use_rope)
if not use_rope:
self.gguf_writer.add_context_length(2**20)
## Validation ##
d_head = self.find_hparam(["d_head"], optional=True) or 64
assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported"
assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}"
def set_vocab(self):
self.hparams["pad_vocab_size_multiple"] = 8
Mamba2Model.set_vocab(self)
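To make the attention/SSM layer split in get_attn_layers concrete, here is a minimal sketch of the attn_layer_period/attn_layer_offset fallback with made-up hyperparameters (the real values come from the model's config.json):

# Hypothetical values, for illustration only.
block_count = 24
attn_layer_period, attn_layer_offset = 6, 5

attn_layers = [i for i in range(block_count) if i % attn_layer_period == attn_layer_offset]
ssm_layers = [i for i in range(block_count) if i not in attn_layers]

print(attn_layers)  # [5, 11, 17, 23] -> attention blocks; every other block uses a Mamba2 SSM layer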
@ModelBase.register("BailingMoeForCausalLM") @ModelBase.register("BailingMoeForCausalLM")
class BailingMoeModel(TextModel): class BailingMoeModel(TextModel):
model_arch = gguf.MODEL_ARCH.BAILINGMOE model_arch = gguf.MODEL_ARCH.BAILINGMOE
@ -6687,7 +6821,7 @@ class FalconH1Model(Mamba2Model):
# Use Llama conversion for attention # Use Llama conversion for attention
self._transformer_model_class = LlamaModel self._transformer_model_class = LlamaModel
# n_group and d_inner are used during reshape_tensors for mamaba2 # n_group and d_inner are used during reshape_tensors for mamba2
self.n_group = self.find_hparam(["n_groups"]) self.n_group = self.find_hparam(["n_groups"])
self.d_inner = self.find_hparam(["mamba_d_ssm"]) self.d_inner = self.find_hparam(["mamba_d_ssm"])
self.d_head = self.find_hparam(["d_head"]) self.d_head = self.find_hparam(["d_head"])
@ -6943,6 +7077,50 @@ class SmolLM3Model(LlamaModel):
chat_template = tokenizer.chat_template.replace("[:]", "") chat_template = tokenizer.chat_template.replace("[:]", "")
self.gguf_writer.add_chat_template(chat_template) self.gguf_writer.add_chat_template(chat_template)
@ModelBase.register("Lfm2ForCausalLM")
@ModelBase.register("LFM2ForCausalLM")
class LFM2Model(TextModel):
model_arch = gguf.MODEL_ARCH.LFM2
def _add_feed_forward_length(self):
ff_dim = self.hparams["block_ff_dim"]
auto_adjust_ff_dim = self.hparams["block_auto_adjust_ff_dim"]
ff_dim = self.hparams["block_ff_dim"]
ffn_dim_multiplier = self.hparams["block_ffn_dim_multiplier"]
multiple_of = self.hparams["block_multiple_of"]
if auto_adjust_ff_dim:
ff_dim = int(2 * ff_dim / 3)
# custom dim factor multiplier
if ffn_dim_multiplier is not None:
ff_dim = int(ffn_dim_multiplier * ff_dim)
ff_dim = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of)
self.gguf_writer.add_feed_forward_length(ff_dim)
def set_gguf_parameters(self):
# set num_key_value_heads only for attention layers
self.hparams["num_key_value_heads"] = [
self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0
for layer_type in self.hparams["layer_types"]
]
super().set_gguf_parameters()
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["norm_eps"])
self._add_feed_forward_length()
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# conv op requires 2d tensor
if 'conv.conv' in name:
data_torch = data_torch.squeeze(1)
return [(self.map_tensor_name(name), data_torch)]
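For intuition, the feed-forward sizing in _add_feed_forward_length above can be reproduced in isolation; the hparam values below are hypothetical, not taken from a real LFM2 config:

# Sketch of LFM2's block_auto_adjust_ff_dim rounding, with made-up inputs.
def adjusted_ff_dim(ff_dim: int, auto_adjust: bool, multiplier, multiple_of: int) -> int:
    if auto_adjust:
        ff_dim = int(2 * ff_dim / 3)                 # shrink to 2/3 (LLaMA-style FFN sizing)
        if multiplier is not None:
            ff_dim = int(multiplier * ff_dim)        # optional custom dim factor
        ff_dim = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of)  # round up to a multiple
    return ff_dim

print(adjusted_ff_dim(7168, True, None, 256))  # 4864 -> value passed to add_feed_forward_length()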
###### CONVERSION LOGIC ######


@@ -129,6 +129,8 @@ models = [
{"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
{"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
{"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
{"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
{"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
]
# some models are known to be broken upstream, so we will skip them as exceptions

docs/ops.md (new file, 95 lines)

@@ -0,0 +1,95 @@
# GGML Operations
List of GGML operations and backend support status.
Legend:
- ✅ Fully supported by this backend
- 🟡 Partially supported by this backend
- ❌ Not supported by this backend
| Operation | BLAS | CPU | CUDA | Metal |
|-----------|------|------|------|------|
| ABS | ❌ | ✅ | 🟡 | ❌ |
| ACC | ❌ | ✅ | ✅ | ✅ |
| ADD | ❌ | ✅ | ✅ | 🟡 |
| ADD1 | ❌ | ✅ | ✅ | ❌ |
| ARANGE | ❌ | ✅ | ✅ | ✅ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ |
| CLAMP | ❌ | ✅ | ✅ | 🟡 |
| CONCAT | ❌ | ✅ | 🟡 | ✅ |
| CONT | ❌ | ✅ | 🟡 | ✅ |
| CONV_2D_DW | ❌ | ✅ | ✅ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ |
| CONV_TRANSPOSE_2D | ❌ | ✅ | ✅ | ❌ |
| COS | ❌ | ✅ | ✅ | 🟡 |
| COUNT_EQUAL | ❌ | ✅ | ✅ | ❌ |
| CPY | ❌ | 🟡 | 🟡 | 🟡 |
| CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ❌ |
| CROSS_ENTROPY_LOSS_BACK | ❌ | ✅ | ✅ | ❌ |
| DIAG_MASK_INF | ❌ | ✅ | ✅ | 🟡 |
| DIV | ❌ | ✅ | ✅ | 🟡 |
| DUP | ❌ | ✅ | 🟡 | 🟡 |
| ELU | ❌ | ✅ | ❌ | 🟡 |
| EXP | ❌ | ✅ | 🟡 | ❌ |
| FLASH_ATTN_EXT | ❌ | ✅ | 🟡 | 🟡 |
| GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ❌ |
| GEGLU | ❌ | ✅ | ✅ | 🟡 |
| GEGLU_ERF | ❌ | ✅ | ✅ | 🟡 |
| GEGLU_QUICK | ❌ | ✅ | ✅ | 🟡 |
| GELU | ❌ | ✅ | 🟡 | 🟡 |
| GELU_ERF | ❌ | ✅ | 🟡 | 🟡 |
| GELU_QUICK | ❌ | ✅ | 🟡 | 🟡 |
| GET_ROWS | ❌ | ✅ | 🟡 | ✅ |
| GET_ROWS_BACK | ❌ | 🟡 | 🟡 | ❌ |
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ |
| HARDSIGMOID | ❌ | ✅ | 🟡 | ❌ |
| HARDSWISH | ❌ | ✅ | 🟡 | ❌ |
| IM2COL | ❌ | ✅ | ✅ | 🟡 |
| L2_NORM | ❌ | ✅ | ✅ | ✅ |
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ |
| LOG | ❌ | ✅ | ✅ | ❌ |
| MEAN | ❌ | ✅ | ✅ | ✅ |
| MUL | ❌ | ✅ | ✅ | 🟡 |
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 |
| MUL_MAT_ID | ❌ | ✅ | ✅ | ✅ |
| NEG | ❌ | ✅ | 🟡 | 🟡 |
| NORM | ❌ | ✅ | ✅ | 🟡 |
| OPT_STEP_ADAMW | ❌ | ✅ | ✅ | ❌ |
| OUT_PROD | 🟡 | 🟡 | 🟡 | ❌ |
| PAD | ❌ | ✅ | ✅ | ✅ |
| PAD_REFLECT_1D | ❌ | ✅ | ❌ | ✅ |
| POOL_2D | ❌ | ✅ | ✅ | ✅ |
| REGLU | ❌ | ✅ | ✅ | 🟡 |
| RELU | ❌ | ✅ | 🟡 | 🟡 |
| REPEAT | ❌ | ✅ | 🟡 | ✅ |
| REPEAT_BACK | ❌ | ✅ | ✅ | ❌ |
| RMS_NORM | ❌ | ✅ | ✅ | 🟡 |
| RMS_NORM_BACK | ❌ | ✅ | ✅ | ❌ |
| RMS_NORM_MUL | ❌ | ✅ | ✅ | ✅ |
| ROPE | ❌ | ✅ | ✅ | ✅ |
| ROPE_BACK | ❌ | ✅ | ✅ | ❌ |
| RWKV_WKV6 | ❌ | ✅ | ✅ | ✅ |
| RWKV_WKV7 | ❌ | ✅ | ✅ | ✅ |
| SCALE | ❌ | ✅ | ✅ | ✅ |
| SET | ❌ | ✅ | ❌ | ✅ |
| SET_ROWS | ❌ | 🟡 | ❌ | 🟡 |
| SGN | ❌ | ✅ | 🟡 | ❌ |
| SIGMOID | ❌ | ✅ | 🟡 | 🟡 |
| SILU | ❌ | ✅ | 🟡 | 🟡 |
| SILU_BACK | ❌ | ✅ | ✅ | ❌ |
| SIN | ❌ | ✅ | ✅ | 🟡 |
| SOFT_MAX | ❌ | ✅ | ✅ | ✅ |
| SOFT_MAX_BACK | ❌ | 🟡 | 🟡 | ❌ |
| SQR | ❌ | ✅ | ✅ | 🟡 |
| SQRT | ❌ | ✅ | ✅ | 🟡 |
| SSM_CONV | ❌ | ✅ | ✅ | ✅ |
| SSM_SCAN | ❌ | ✅ | ✅ | ✅ |
| STEP | ❌ | ✅ | 🟡 | ❌ |
| SUB | ❌ | ✅ | ✅ | 🟡 |
| SUM | ❌ | ✅ | ✅ | ❌ |
| SUM_ROWS | ❌ | ✅ | ✅ | ✅ |
| SWIGLU | ❌ | ✅ | ✅ | 🟡 |
| TANH | ❌ | ✅ | 🟡 | 🟡 |
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ |
| UPSCALE | ❌ | ✅ | ✅ | 🟡 |
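The table above is generated from the per-backend CSV files under docs/ops/ by scripts/create_ops_docs.py (enforced by the CI workflow at the top of this commit). As a rough, hypothetical sketch of the idea only -- not the actual script, and the CSV column names ("op", "supported") are assumed here -- a generator could look like:

# Sketch only; the real column layout of docs/ops/<BACKEND>.csv may differ.
import csv
from pathlib import Path

SYMBOL = {"yes": "✅", "partial": "🟡", "no": "❌"}

def load_backend(path: Path) -> dict:
    with path.open(newline="") as f:
        return {row["op"]: SYMBOL.get(row["supported"], "❌") for row in csv.DictReader(f)}

def build_table(ops_dir: Path) -> str:
    backends = {p.stem: load_backend(p) for p in sorted(ops_dir.glob("*.csv"))}
    ops = sorted({op for table in backends.values() for op in table})
    header = "| Operation | " + " | ".join(backends) + " |"
    sep = "|-----------|" + "|".join("------" for _ in backends) + "|"
    rows = ["| " + op + " | " + " | ".join(t.get(op, "❌") for t in backends.values()) + " |" for op in ops]
    return "\n".join([header, sep, *rows])

print(build_table(Path("docs/ops")))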

docs/ops/BLAS.csv (new file, 6534 lines; diff too large to show)
docs/ops/CPU.csv (new file, 6534 lines; diff too large to show)
docs/ops/CUDA.csv (new file, 6534 lines; diff too large to show)
docs/ops/Metal.csv (new file, 6534 lines; diff too large to show)


@@ -45,6 +45,7 @@ bool g_mul_mat_q = true;
#include "ggml-cuda/upscale.cuh"
#include "ggml-cuda/wkv.cuh"
#include "ggml-cuda/gla.cuh"
#include "ggml-cuda/set-rows.cuh"
#include "ggml.h"
#include <algorithm>
@@ -2235,6 +2236,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_GET_ROWS_BACK:
ggml_cuda_op_get_rows_back(ctx, dst);
break;
case GGML_OP_SET_ROWS:
ggml_cuda_op_set_rows(ctx, dst);
break;
case GGML_OP_DUP:
ggml_cuda_dup(ctx, dst);
break;
@@ -2304,6 +2308,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_UNARY_OP_EXP:
ggml_cuda_op_exp(ctx, dst);
break;
case GGML_UNARY_OP_ELU:
ggml_cuda_op_elu(ctx, dst);
break;
default:
return false;
}
@@ -3117,6 +3124,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_TANH:
case GGML_UNARY_OP_EXP:
case GGML_UNARY_OP_ELU:
return ggml_is_contiguous(op->src[0]);
default:
return false;
@@ -3221,6 +3229,13 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
{
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
} break;
case GGML_OP_SET_ROWS:
{
#pragma message("TODO: implement Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, IQ4_NL support (https://github.com/ggml-org/llama.cpp/pull/14661)")
return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16) &&
op->src[0]->type == GGML_TYPE_F32 &&
op->src[1]->type == GGML_TYPE_I64;
} break;
case GGML_OP_CPY:
{
ggml_type src0_type = op->src[0]->type;


@@ -0,0 +1,145 @@
#include "set-rows.cuh"
typedef void (*set_rows_kernel_t)(const char * src, char * dst);
template<typename src_t, typename dst_t>
__device__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {}
template<>
__device__ __forceinline__ void set_rows_1<float, half>(const float * src_f, half * dst_h) {
*dst_h = __float2half(*src_f);
}
template<>
__device__ __forceinline__ void set_rows_1<float, nv_bfloat16>(const float * src_f, nv_bfloat16 * dst_b) {
*dst_b = *src_f;
}
template<>
__device__ __forceinline__ void set_rows_1<float, float>(const float * src_f, float * dst_f) {
*dst_f = *src_f;
}
template<typename src_t, typename dst_t>
static __global__ void k_set_rows(
const src_t * __restrict__ src0, const int64_t * __restrict__ src1, dst_t * __restrict__ dst,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t s10, const int64_t s11, const int64_t s12,
const int64_t s1, const int64_t s2, const int64_t s3) {
const int64_t i = int64_t(blockDim.x) * blockIdx.x + threadIdx.x;
const int64_t ne_total = ne00 * ne01 * ne02 * ne03;
if (i >= ne_total) {
return;
}
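// unflatten the linear thread index i into src0 element coordinates (i03, i02, i01, i00)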
const int64_t i03 = i / (ne00 * ne01 * ne02);
const int64_t i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
const int64_t i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01) / ne00;
const int64_t i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01 - i01 * ne00;
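// src1 holds the destination row indices; it is broadcast over the two outer dims via the modulo below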
const int64_t i12 = i03 % ne12;
const int64_t i11 = i02 % ne11;
const int64_t i10 = i01;
const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
const src_t * src0_row = src0 + i01*s01 + i02*s02 + i03*s03;
dst_t * dst_row_ptr = dst + dst_row*s1 + i02*s2 + i03*s3;
const src_t* src_elem = src0_row + i00;
dst_t* dst_elem = dst_row_ptr + i00;
set_rows_1(src_elem, dst_elem);
}
template<typename src_t, typename dst_t>
static void set_rows_cuda(
const src_t * src0_d, const int64_t * src1_d, dst_t * dst_d,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
const size_t nb01, const size_t nb02, const size_t nb03,
const size_t nb10, const size_t nb11, const size_t nb12,
const size_t nb1, const size_t nb2, const size_t nb3,
cudaStream_t stream) {
const int64_t ne_total = ne00 * ne01 * ne02 * ne03;
const int num_blocks = (ne_total + CUDA_SET_ROWS_BLOCK_SIZE - 1) / CUDA_SET_ROWS_BLOCK_SIZE;
const dim3 block_size(CUDA_SET_ROWS_BLOCK_SIZE);
const dim3 grid_size(num_blocks);
const int64_t s01 = nb01/sizeof(src_t);
const int64_t s02 = nb02/sizeof(src_t);
const int64_t s03 = nb03/sizeof(src_t);
const int64_t s10 = nb10/sizeof(int64_t);
const int64_t s11 = nb11/sizeof(int64_t);
const int64_t s12 = nb12/sizeof(int64_t);
const int64_t s1 = nb1/sizeof(dst_t);
const int64_t s2 = nb2/sizeof(dst_t);
const int64_t s3 = nb3/sizeof(dst_t);
if (ne_total > 0) {
k_set_rows<<<grid_size, block_size, 0, stream>>>(
src0_d, src1_d, dst_d,
ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13,
s01, s02, s03,
s10, s11, s12,
s1, s2, s3);
}
}
void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_I64);
GGML_TENSOR_BINARY_OP_LOCALS
const float * src0_d = (const float *)src0->data;
const int64_t * src1_d = (const int64_t *)src1->data;
cudaStream_t stream = ctx.stream();
if (dst->type == GGML_TYPE_F32) {
set_rows_cuda(
src0_d, src1_d, (float*)dst->data,
ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13,
nb01, nb02, nb03,
nb10, nb11, nb12,
nb1, nb2, nb3,
stream
);
} else if (dst->type == GGML_TYPE_F16) {
set_rows_cuda(
src0_d, src1_d, (half*)dst->data,
ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13,
nb01, nb02, nb03,
nb10, nb11, nb12,
nb1, nb2, nb3,
stream
);
} else if (dst->type == GGML_TYPE_BF16) {
set_rows_cuda(
src0_d, src1_d, (nv_bfloat16*)dst->data,
ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13,
nb01, nb02, nb03,
nb10, nb11, nb12,
nb1, nb2, nb3,
stream
);
} else {
GGML_ABORT("unsupported type");
}
}


@@ -0,0 +1,7 @@
#pragma once
#include "common.cuh"
#define CUDA_SET_ROWS_BLOCK_SIZE 256
void ggml_cuda_op_set_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -107,8 +107,11 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int
if (nc == 4) {
ssm_conv_f32<threads, 4><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
dst, dst_nb0, dst_nb1, dst_nb2, n_t);
} else if (nc == 3) {
ssm_conv_f32<threads, 3><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
dst, dst_nb0, dst_nb1, dst_nb2, n_t);
} else {
- GGML_ABORT("Only support kernel size = 4 now.");
GGML_ABORT("Only support kernel size = 3 or size = 4 right now.");
}
} else {
if (nc == 4) {
@@ -116,8 +119,13 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int
dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
ssm_conv_long_token_f32<threads, 4, split_n_t><<<blocks, threads, 0, stream>>>(
src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
} else if (nc == 3) {
const int64_t split_n_t = 32;
dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
ssm_conv_long_token_f32<threads, 3, split_n_t><<<blocks, threads, 0, stream>>>(
src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
} else {
- GGML_ABORT("Only support kernel size = 4 right now.");
GGML_ABORT("Only support kernel size = 3 or size = 4 right now.");
}
}
}


@@ -83,6 +83,10 @@ static __device__ __forceinline__ float op_log(float x) {
return logf(x);
}
static __device__ __forceinline__ float op_elu(float x) {
return (x > 0.f) ? x : expm1f(x);
}
template <float (*op)(float), typename T>
static __global__ void unary_op_kernel(const T * x, T * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -196,6 +200,9 @@ void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_op_unary<op_log>(ctx, dst);
}
void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
ggml_cuda_op_unary<op_elu>(ctx, dst);
}
/* gated ops */
template <float (*op)(float), typename T>


@@ -59,6 +59,8 @@ void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_reglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_geglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);


@@ -10,9 +10,6 @@
#include "rocblas/rocblas.h"
#endif // __HIP_PLATFORM_AMD__
- #define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
- #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
- #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N HIPBLAS_OP_N
@@ -30,7 +27,6 @@
#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
- #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
#define cublasDestroy hipblasDestroy
#define cublasGemmEx hipblasGemmEx
@@ -42,7 +38,6 @@
#define cublasSgemm hipblasSgemm
#define cublasStatus_t hipblasStatus_t
#define cublasOperation_t hipblasOperation_t
- #define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
@@ -144,6 +139,20 @@
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION >= 70000000
#define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F
#define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_COMPUTE_32F_FAST_16F
#define cublasComputeType_t hipblasComputeType_t
#define cudaDataType_t hipDataType
#else
#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
#define cublasComputeType_t hipblasDatatype_t
#define cudaDataType_t hipblasDatatype_t
#endif
#define __CUDA_ARCH__ 1300
#if defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__)


@@ -173,6 +173,12 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_SILU,
GGML_METAL_KERNEL_TYPE_SILU_4,
GGML_METAL_KERNEL_TYPE_ELU,
GGML_METAL_KERNEL_TYPE_ABS,
GGML_METAL_KERNEL_TYPE_SGN,
GGML_METAL_KERNEL_TYPE_STEP,
GGML_METAL_KERNEL_TYPE_HARDSWISH,
GGML_METAL_KERNEL_TYPE_HARDSIGMOID,
GGML_METAL_KERNEL_TYPE_EXP,
GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16,
GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4,
GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32,
@@ -1155,6 +1161,12 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU, silu, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU_4, silu_4, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ELU, elu, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ABS, abs, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SGN, sgn, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_STEP, step, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_HARDSWISH, hardswish, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_HARDSIGMOID, hardsigmoid, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_EXP, exp, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16, soft_max_f16, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4, soft_max_f16_4, has_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32, soft_max_f32, has_simdgroup_reduction);
@@ -1688,6 +1700,12 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_ELU:
case GGML_UNARY_OP_NEG:
case GGML_UNARY_OP_ABS:
case GGML_UNARY_OP_SGN:
case GGML_UNARY_OP_STEP:
case GGML_UNARY_OP_HARDSWISH:
case GGML_UNARY_OP_HARDSIGMOID:
case GGML_UNARY_OP_EXP:
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
default:
return false;
@@ -2439,6 +2457,78 @@ static bool ggml_metal_encode_node(
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_UNARY_OP_ABS:
{
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ABS].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_UNARY_OP_SGN:
{
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SGN].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_UNARY_OP_STEP:
{
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_STEP].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_UNARY_OP_HARDSWISH:
{
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_HARDSWISH].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_UNARY_OP_HARDSIGMOID:
{
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_HARDSIGMOID].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_UNARY_OP_EXP:
{
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_EXP].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
const int64_t n = ggml_nelements(dst);
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
default:
{
GGML_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));


@@ -1199,6 +1199,51 @@ kernel void kernel_neg(
dst[tpig] = -src0[tpig];
}
kernel void kernel_abs(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = fabs(src0[tpig]);
}
kernel void kernel_sgn(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
device const float & x = src0[tpig];
dst[tpig] = (x > 0.0f) ? 1.0f : ((x < 0.0f) ? -1.0f : 0.0f);
}
kernel void kernel_step(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] > 0.0f ? 1.0f : 0.0f;
}
kernel void kernel_hardswish(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
device const float & x = src0[tpig];
dst[tpig] = x * fmin(1.0f, fmax(0.0f, (x + 3.0f) / 6.0f));
}
kernel void kernel_hardsigmoid(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
device const float & x = src0[tpig];
dst[tpig] = fmin(1.0f, fmax(0.0f, (x + 3.0f) / 6.0f));
}
kernel void kernel_exp(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = exp(src0[tpig]);
}
kernel void kernel_reglu(
device const char * src0,
device const char * src1,


@@ -0,0 +1,130 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#if defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#else
#define REQD_SUBGROUP_SIZE_128
#endif
#define OPWM 64
#define OPWN 64
#define CPWK 8
#define OPTM 4
#define OPTN 8
#define WG_M (OPWM / OPTM)
#define WG_N (OPWN / OPTN)
#define VEC_K (CPWK / 4)
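// Tiling scheme implied by the defines above: each work-group computes an OPWM x OPWN = 64x64
// tile of C with WG_M x WG_N = 16x8 = 128 work-items; each work-item accumulates an
// OPTM x OPTN = 4x8 sub-tile in registers, and K is consumed in CPWK = 8 element chunks
// staged through local memory as half4/float4 vectors (VEC_K = 2 vectors per chunk).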
REQD_SUBGROUP_SIZE_128
__kernel void mul_mat_f16_f32(
const int M, const int N, const int K,
__global const void* A_void, ulong A_offset,
__global const void* B_void, ulong B_offset,
__global void* C_void, ulong C_offset) {
__global const half* A = (__global const half* )((__global const char*)A_void + A_offset);
__global const float* B = (__global const float*)((__global const char*)B_void + B_offset);
__global float* C = (__global float*)((__global char*)C_void + C_offset);
const int lidm = get_local_id(0);
const int lidn = get_local_id(1);
const int lid = lidn * WG_M + lidm;
const int offsetM = get_group_id(0) * OPWM;
const int offsetN = get_group_id(1) * OPWN;
__local half4 Alocal[OPWM][VEC_K];
__local float4 Blocal[OPWN][VEC_K];
float sum[OPTM][OPTN];
for (int wm = 0; wm < OPTM; wm++) {
for (int wn = 0; wn < OPTN; wn++) {
sum[wm][wn] = 0.0f;
}
}
const int numTiles = (K + CPWK - 1) / CPWK;
const int load_row_a = lid % OPWM;
const int load_vec_k_a = lid / OPWM;
const int global_row_a = offsetM + load_row_a;
const int load_row_b = lid % OPWN;
const int load_vec_k_b = lid / OPWN;
const int global_row_b = offsetN + load_row_b;
for (int t = 0; t < numTiles; t++) {
const int k_start = t * CPWK;
const int k_vec_start_a = k_start + load_vec_k_a * 4;
const int k_vec_start_b = k_start + load_vec_k_b * 4;
if (global_row_a < M && k_vec_start_a < K) {
if (k_vec_start_a + 3 < K) {
Alocal[load_row_a][load_vec_k_a] = vload4(0, A + global_row_a * K + k_vec_start_a);
} else {
half4 tempA = (half4)(0.0h);
if (k_vec_start_a < K) tempA.s0 = A[global_row_a * K + k_vec_start_a];
if (k_vec_start_a + 1 < K) tempA.s1 = A[global_row_a * K + k_vec_start_a + 1];
if (k_vec_start_a + 2 < K) tempA.s2 = A[global_row_a * K + k_vec_start_a + 2];
Alocal[load_row_a][load_vec_k_a] = tempA;
}
} else {
Alocal[load_row_a][load_vec_k_a] = (half4)(0.0h);
}
if (global_row_b < N && k_vec_start_b < K) {
if (k_vec_start_b + 3 < K) {
Blocal[load_row_b][load_vec_k_b] = vload4(0, B + global_row_b * K + k_vec_start_b);
} else {
float4 tempB = (float4)(0.0f);
if (k_vec_start_b < K) tempB.s0 = B[global_row_b * K + k_vec_start_b];
if (k_vec_start_b + 1 < K) tempB.s1 = B[global_row_b * K + k_vec_start_b + 1];
if (k_vec_start_b + 2 < K) tempB.s2 = B[global_row_b * K + k_vec_start_b + 2];
Blocal[load_row_b][load_vec_k_b] = tempB;
}
} else {
Blocal[load_row_b][load_vec_k_b] = (float4)(0.0f);
}
barrier(CLK_LOCAL_MEM_FENCE);
#pragma unroll
for (int k_vec = 0; k_vec < VEC_K; k_vec++) {
float4 a_fvecs[OPTM];
int current_row_a = lidm;
for (int wm = 0; wm < OPTM; wm++) {
a_fvecs[wm] = convert_float4(Alocal[current_row_a][k_vec]);
current_row_a += WG_M;
}
float4 b_fvecs[OPTN];
int current_row_b = lidn;
for (int wn = 0; wn < OPTN; wn++) {
b_fvecs[wn] = Blocal[current_row_b][k_vec];
current_row_b += WG_N;
}
for (int wm = 0; wm < OPTM; wm++) {
for (int wn = 0; wn < OPTN; wn++) {
sum[wm][wn] += dot(a_fvecs[wm], b_fvecs[wn]);
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
for (int wm = 0; wm < OPTM; wm++) {
int globalRow = offsetM + lidm + wm * WG_M;
if (globalRow < M) {
for (int wn = 0; wn < OPTN; wn++) {
int globalCol = offsetN + lidn + wn * WG_N;
if (globalCol < N) {
C[globalCol * M + globalRow] = sum[wm][wn];
}
}
}
}
}


@@ -0,0 +1,95 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
kernel void kernel_set_rows_f32(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global char * dst,
ulong offsetd,
int ne01,
ulong nb01,
ulong nb02,
ulong nb03,
int ne11,
int ne12,
ulong nb10,
ulong nb11,
ulong nb12,
int nblk0,
ulong nb1,
ulong nb2,
ulong nb3
) {
src0 = src0 + offset0;
src1 = src1 + offset1;
dst = dst + offsetd;
int i03 = get_group_id(2);
int i02 = get_group_id(1);
int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
if (i01 >= ne01) {
return;
}
int i12 = i03%ne12;
int i11 = i02%ne11;
int i10 = i01;
long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
global float * dst_row = (global float *) (dst + i1*nb1 + i02*nb2 + i03*nb3);
global float * src_row = (global float *) (src0 + i01*nb01 + i02*nb02 + i03*nb03);
for (int ind = get_local_id(0); ind < nblk0; ind += get_local_size(0)) {
dst_row[ind] = (float)src_row[ind];
}
}
kernel void kernel_set_rows_f16(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global char * dst,
ulong offsetd,
int ne01,
ulong nb01,
ulong nb02,
ulong nb03,
int ne11,
int ne12,
ulong nb10,
ulong nb11,
ulong nb12,
int nblk0,
ulong nb1,
ulong nb2,
ulong nb3
) {
src0 = src0 + offset0;
src1 = src1 + offset1;
dst = dst + offsetd;
int i03 = get_group_id(2);
int i02 = get_group_id(1);
int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
if (i01 >= ne01) {
return;
}
int i12 = i03%ne12;
int i11 = i02%ne11;
int i10 = i01;
long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
global half * dst_row = (global half *) (dst + i1*nb1 + i02*nb2 + i03*nb3);
global float * src_row = (global float *) (src0 + i01*nb01 + i02*nb02 + i03*nb03);
for (int ind = get_local_id(0); ind < nblk0; ind += get_local_size(0)) {
dst_row[ind] = src_row[ind];
}
}


@@ -441,18 +441,20 @@ struct vk_device_struct {
vk_pipeline pipeline_div_norepeat[2][2][2];
vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32;
- vk_pipeline pipeline_upscale_f32;
vk_pipeline pipeline_upscale_nearest_f32, pipeline_upscale_bilinear_f32, pipeline_upscale_bilinear_ac_f32;
vk_pipeline pipeline_scale_f32;
vk_pipeline pipeline_sqr_f32;
vk_pipeline pipeline_sin_f32;
vk_pipeline pipeline_cos_f32;
vk_pipeline pipeline_clamp_f32;
vk_pipeline pipeline_pad_f32;
vk_pipeline pipeline_roll_f32;
vk_pipeline pipeline_repeat_f32, pipeline_repeat_back_f32;
vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16, pipeline_cpy_f16_f32, pipeline_cpy_f32_bf16;
vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16;
vk_pipeline pipeline_cpy_f32_quant[GGML_TYPE_COUNT];
vk_pipeline pipeline_cpy_quant_f32[GGML_TYPE_COUNT];
vk_pipeline pipeline_set_rows[GGML_TYPE_COUNT];
vk_pipeline pipeline_norm_f32;
vk_pipeline pipeline_group_norm_f32;
vk_pipeline pipeline_rms_norm_f32;
@@ -709,6 +711,37 @@ struct vk_op_unary_push_constants {
};
static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128");
static vk_op_unary_push_constants vk_op_unary_push_constants_init(const ggml_tensor * src0, const ggml_tensor * dst, int64_t ne = 0) {
GGML_ASSERT(ne != 0 || (ggml_nelements(src0) == ggml_nelements(dst)));
ne = ne != 0 ? ne : ggml_nelements(dst);
GGML_ASSERT(ne <= (int64_t)std::numeric_limits<uint32_t>::max());
vk_op_unary_push_constants p{};
p.ne = (uint32_t)ne;
size_t src0_tsize = ggml_type_size(src0->type);
p.ne00 = (uint32_t)src0->ne[0];
p.ne01 = (uint32_t)src0->ne[1];
p.ne02 = (uint32_t)src0->ne[2];
p.ne03 = (uint32_t)src0->ne[3];
p.nb00 = (uint32_t)(src0->nb[0] / src0_tsize);
p.nb01 = (uint32_t)(src0->nb[1] / src0_tsize);
p.nb02 = (uint32_t)(src0->nb[2] / src0_tsize);
p.nb03 = (uint32_t)(src0->nb[3] / src0_tsize);
size_t dst_tsize = ggml_type_size(dst->type);
p.ne10 = (uint32_t)dst->ne[0];
p.ne11 = (uint32_t)dst->ne[1];
p.ne12 = (uint32_t)dst->ne[2];
p.ne13 = (uint32_t)dst->ne[3];
p.nb10 = (uint32_t)(dst->nb[0] / dst_tsize);
p.nb11 = (uint32_t)(dst->nb[1] / dst_tsize);
p.nb12 = (uint32_t)(dst->nb[2] / dst_tsize);
p.nb13 = (uint32_t)(dst->nb[3] / dst_tsize);
return p; // fastdiv values and offsets are initialized later in ggml_vk_op
}
// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
// Precompute mp (m' in the paper) and L such that division
// can be computed using a multiply (high 32b of 64b result)
@@ -878,6 +911,7 @@ struct vk_op_conv2d_dw_push_constants {
struct vk_op_upscale_push_constants {
uint32_t ne; uint32_t a_offset; uint32_t d_offset;
uint32_t ne00; uint32_t ne01;
uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
float sf0; float sf1; float sf2; float sf3;
@@ -1751,7 +1785,14 @@ static FaHeadSizes fa_get_head_sizes(uint32_t hsk, uint32_t hsv) {
// number of rows/cols for flash attention shader
static constexpr uint32_t flash_attention_num_small_rows = 32;
static constexpr uint32_t scalar_flash_attention_num_small_rows = 1;
- static constexpr uint32_t scalar_flash_attention_num_large_rows = 8;
static uint32_t get_fa_scalar_num_large_rows(uint32_t hsv) {
if (hsv >= 512) {
return 2;
} else {
return 8;
}
}
// The FA coopmat1 shader assumes 16x16x16 matrix multiply support.
// 128 threads split into four subgroups, each subgroup does 1/4
@@ -1776,7 +1817,7 @@ static std::array<uint32_t, 2> fa_rows_cols(FaCodePath path, uint32_t hsk, uint3
if (small_rows) {
return {scalar_flash_attention_num_small_rows, 64};
} else {
- return {scalar_flash_attention_num_large_rows, 32};
return {get_fa_scalar_num_large_rows(hsv), 32};
}
}
@@ -1795,8 +1836,12 @@ static std::array<uint32_t, 2> fa_rows_cols(FaCodePath path, uint32_t hsk, uint3
// small cols to reduce register count
if (ggml_is_quantized(type) || hsk >= 256) {
if (hsk >= 512) {
return {32, 32};
} else {
return {64, 32};
}
}
return {64, 64};
}
@@ -1837,7 +1882,7 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
const uint32_t warps = warptile[0] / warptile[10];
const uint32_t load_bufs = (warptile[1] + warptile[2]) * (warptile[3] + bank_conflict_offset) * type_size;
- const uint32_t mmid_row_ids = mul_mat_id ? 4096 * sizeof(uint32_t) : 0;
const uint32_t mmid_row_ids = mul_mat_id ? (4096 * sizeof(uint32_t) + 4/*_ne1*/) : 0;
const uint32_t coopmat_stage = device->coopmat_support ? warptile[7] * warptile[8] / warps * sizeof(float) : 0;
const uint32_t total_size = load_bufs + mmid_row_ids + coopmat_stage + lut_size;
@@ -1962,10 +2007,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
s_mmq_wg_denoms_k = { 32, 32, 1 };
// spec constants and tile sizes for quant matmul_id
- l_warptile_mmqid = { 256, 128, 64, 16, 0 };
l_warptile_mmqid = { 256, 128, 128, 16, 0 };
m_warptile_mmqid = { 256, 128, 64, 16, 0 };
s_warptile_mmqid = { 256, 128, 64, 16, 0 };
- l_mmqid_wg_denoms = { 128, 64, 1 };
l_mmqid_wg_denoms = { 128, 128, 1 };
m_mmqid_wg_denoms = { 128, 64, 1 };
s_mmqid_wg_denoms = { 128, 64, 1 };
@@ -2754,19 +2799,41 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_bf16,"contig_cpy_f32_bf16",contig_cpy_f32_bf16_len,contig_cpy_f32_bf16_data,"main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
if (device->float_controls_rte_fp16) {
- ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_rte_len, cpy_f32_q4_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_rte_len, cpy_f32_q4_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_rte_len, cpy_f32_q4_1_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_rte_len, cpy_f32_q4_1_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_rte_len, cpy_f32_q5_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_rte_len, cpy_f32_q5_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_rte_len, cpy_f32_q5_1_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_rte_len, cpy_f32_q5_1_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_rte_len, cpy_f32_q8_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_rte_len, cpy_f32_q8_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_rte_len, cpy_f32_iq4_nl_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_rte_len, cpy_f32_iq4_nl_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
} else {
- ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_len, cpy_f32_q5_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_len, cpy_f32_q5_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_len, cpy_f32_q8_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_len, cpy_f32_q8_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
- ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_len, cpy_f32_iq4_nl_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_len, cpy_f32_iq4_nl_data, "main", 2, sizeof(vk_op_unary_push_constants), {32, 1, 1}, {}, 1);
}
}
if (device->float_controls_rte_fp16) {
ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_F32], "set_rows_f32", set_rows_f32_rte_len, set_rows_f32_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_F16], "set_rows_f16", set_rows_f16_rte_len, set_rows_f16_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_BF16], "set_rows_bf16", set_rows_bf16_rte_len, set_rows_bf16_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q4_0], "set_rows_q4_0", set_rows_q4_0_rte_len, set_rows_q4_0_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q4_1], "set_rows_q4_1", set_rows_q4_1_rte_len, set_rows_q4_1_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q5_0], "set_rows_q5_0", set_rows_q5_0_rte_len, set_rows_q5_0_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q5_1], "set_rows_q5_1", set_rows_q5_1_rte_len, set_rows_q5_1_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q8_0], "set_rows_q8_0", set_rows_q8_0_rte_len, set_rows_q8_0_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_IQ4_NL], "set_rows_iq4_nl", set_rows_iq4_nl_rte_len, set_rows_iq4_nl_rte_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
} else {
ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_F32], "set_rows_f32", set_rows_f32_len, set_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_F16], "set_rows_f16", set_rows_f16_len, set_rows_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_BF16], "set_rows_bf16", set_rows_bf16_len, set_rows_bf16_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q4_0], "set_rows_q4_0", set_rows_q4_0_len, set_rows_q4_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q4_1], "set_rows_q4_1", set_rows_q4_1_len, set_rows_q4_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q5_0], "set_rows_q5_0", set_rows_q5_0_len, set_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q5_1], "set_rows_q5_1", set_rows_q5_1_len, set_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_Q8_0], "set_rows_q8_0", set_rows_q8_0_len, set_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_set_rows[GGML_TYPE_IQ4_NL], "set_rows_iq4_nl", set_rows_iq4_nl_len, set_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {1}, 1, true);
} }
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_0], "cpy_q4_0_f32", cpy_q4_0_f32_len, cpy_q4_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_0], "cpy_q4_0_f32", cpy_q4_0_f32_len, cpy_q4_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
@ -2806,7 +2873,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_concat_f16, "concat_f16", concat_f16_len, concat_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_concat_f16, "concat_f16", concat_f16_len, concat_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_concat_i32, "concat_i32", concat_i32_len, concat_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_concat_i32, "concat_i32", concat_i32_len, concat_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_upscale_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_upscale_nearest_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_NEAREST}, 1);
ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR}, 1);
ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_ac_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS}, 1);
ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
@ -2818,6 +2887,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_roll_f32, "roll_f32", roll_f32_len, roll_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_repeat_f32, "repeat_f32", repeat_f32_len, repeat_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_repeat_f32, "repeat_f32", repeat_f32_len, repeat_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_repeat_back_f32, "repeat_back_f32", repeat_back_f32_len, repeat_back_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_repeat_back_f32, "repeat_back_f32", repeat_back_f32_len, repeat_back_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
@ -6072,7 +6143,7 @@ static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, con
// Needs to be kept up to date on shader changes // Needs to be kept up to date on shader changes
GGML_UNUSED(hsv); GGML_UNUSED(hsv);
const uint32_t wg_size = scalar_flash_attention_workgroup_size; const uint32_t wg_size = scalar_flash_attention_workgroup_size;
const uint32_t Br = scalar_flash_attention_num_large_rows; const uint32_t Br = get_fa_scalar_num_large_rows(hsv);
const uint32_t Bc = scalar_flash_attention_Bc; const uint32_t Bc = scalar_flash_attention_Bc;
const uint32_t tmpsh = wg_size * sizeof(float); const uint32_t tmpsh = wg_size * sizeof(float);
@ -6197,7 +6268,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
case FA_SCALAR: case FA_SCALAR:
case FA_COOPMAT1: case FA_COOPMAT1:
// We may switch from coopmat1 to scalar, so use the scalar limit for both // We may switch from coopmat1 to scalar, so use the scalar limit for both
max_gqa = scalar_flash_attention_num_large_rows; max_gqa = get_fa_scalar_num_large_rows(HSV);
break; break;
case FA_COOPMAT2: case FA_COOPMAT2:
max_gqa = get_fa_num_small_rows(FA_COOPMAT2); max_gqa = get_fa_num_small_rows(FA_COOPMAT2);
@ -6492,8 +6563,16 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
} }
return nullptr; return nullptr;
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && dst->op_params[0] == GGML_SCALE_MODE_NEAREST) { if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_upscale_f32; int mode = ggml_get_op_params_i32(dst, 0);
switch (mode) {
case GGML_SCALE_MODE_NEAREST:
return ctx->device->pipeline_upscale_nearest_f32;
case GGML_SCALE_MODE_BILINEAR:
return ctx->device->pipeline_upscale_bilinear_f32;
case GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS:
return ctx->device->pipeline_upscale_bilinear_ac_f32;
}
} }
return nullptr; return nullptr;
case GGML_OP_SCALE: case GGML_OP_SCALE:
@ -6526,6 +6605,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_pad_f32; return ctx->device->pipeline_pad_f32;
} }
return nullptr; return nullptr;
case GGML_OP_ROLL:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_roll_f32;
}
return nullptr;
case GGML_OP_REPEAT: case GGML_OP_REPEAT:
if (ggml_type_size(src0->type) == sizeof(float) && ggml_type_size(dst->type) == sizeof(float)) { if (ggml_type_size(src0->type) == sizeof(float) && ggml_type_size(dst->type) == sizeof(float)) {
return ctx->device->pipeline_repeat_f32; return ctx->device->pipeline_repeat_f32;
@ -6540,6 +6624,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
case GGML_OP_CONT: case GGML_OP_CONT:
case GGML_OP_DUP: case GGML_OP_DUP:
return ggml_vk_get_cpy_pipeline(ctx, src0, dst, dst->type); return ggml_vk_get_cpy_pipeline(ctx, src0, dst, dst->type);
case GGML_OP_SET_ROWS:
return ctx->device->pipeline_set_rows[dst->type];
case GGML_OP_SILU_BACK: case GGML_OP_SILU_BACK:
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_silu_back_f32; return ctx->device->pipeline_silu_back_f32;
@ -6778,6 +6864,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM:
case GGML_OP_CONV_2D_DW: case GGML_OP_CONV_2D_DW:
case GGML_OP_IM2COL: case GGML_OP_IM2COL:
case GGML_OP_SET_ROWS:
return true; return true;
default: default:
return false; return false;
@ -7072,6 +7159,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
case GGML_OP_COS: case GGML_OP_COS:
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_ROLL:
case GGML_OP_REPEAT: case GGML_OP_REPEAT:
case GGML_OP_REPEAT_BACK: case GGML_OP_REPEAT_BACK:
case GGML_OP_CPY: case GGML_OP_CPY:
@ -7091,6 +7179,12 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
ne *= ggml_type_size(src0->type) / 2; ne *= ggml_type_size(src0->type) / 2;
} }
} }
// copy_to_quant has block size of 32, and each thread does QUANT_K elements.
// Splitting into 512x512xZ wouldn't work well since each workgroup does 1024 elements.
// So divide by block size here before splitting into 512x512 groups.
if (op == GGML_OP_CPY && !ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
ne = CEIL_DIV(ne, ggml_blck_size(dst->type));
}
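// Worked example (illustrative, assuming Q4_0's block size of 32): copying
// 1,048,576 F32 values to Q4_0 gives ne = 1048576 / 32 = 32768 blocks, so
// elements = { 512, CEIL_DIV(32768, 512), 1 } = { 512, 64, 1 }. With the {32,1,1}
// wg_denoms used for the cpy_f32_quant pipelines above, that is a 16x64 grid of
// 32-thread workgroups, each quantizing 32 blocks (1024 source values), which
// matches the "each workgroup does 1024 elements" note above.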
if (ne > 262144) { if (ne > 262144) {
elements = { 512, 512, CEIL_DIV(ne, 262144) }; elements = { 512, 512, CEIL_DIV(ne, 262144) };
} else if (ne > 512) { } else if (ne > 512) {
@ -7099,6 +7193,25 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
elements = { ne, 1, 1 }; elements = { ne, 1, 1 };
} }
} break; } break;
case GGML_OP_SET_ROWS:
{
uint32_t ne = ggml_nelements(src0);
if (ggml_is_quantized(dst->type)) {
// quants run 32 threads each doing QUANT_K elements
ne = CEIL_DIV(ne, 32 * ggml_blck_size(dst->type));
} else {
// scalar types do one element per thread, running 512 threads
ne = CEIL_DIV(ne, 512);
}
if (ne > 262144) {
elements = { 512, 512, CEIL_DIV(ne, 262144) };
} else if (ne > 512) {
elements = { 512, CEIL_DIV(ne, 512), 1 };
} else {
elements = { ne, 1, 1 };
}
}
break;
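// Worked example (illustrative): for a src0 of 1,048,576 F32 elements, an F16 destination
// takes the scalar path: ne = CEIL_DIV(1048576, 512) = 2048, so elements = { 512, 4, 1 },
// i.e. 2048 workgroups of 512 threads, one element per thread. A Q8_0 destination takes
// the quant path: ne = CEIL_DIV(1048576, 32 * 32) = 1024, so elements = { 512, 2, 1 },
// i.e. 1024 workgroups of 32 threads, each thread quantizing one 32-element block.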
default: default:
elements = { (uint32_t)ggml_nelements(src0), 1, 1 }; elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
break; break;
@ -7508,14 +7621,21 @@ static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, co
static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t mode = (uint32_t)ggml_get_op_params_i32(dst, 0);
const float sf0 = (float)dst->ne[0] / src0->ne[0]; float sf0 = (float)dst->ne[0] / src0->ne[0];
const float sf1 = (float)dst->ne[1] / src0->ne[1]; float sf1 = (float)dst->ne[1] / src0->ne[1];
const float sf2 = (float)dst->ne[2] / src0->ne[2]; float sf2 = (float)dst->ne[2] / src0->ne[2];
const float sf3 = (float)dst->ne[3] / src0->ne[3]; float sf3 = (float)dst->ne[3] / src0->ne[3];
if (mode & GGML_SCALE_FLAG_ALIGN_CORNERS) {
sf0 = (float)(dst->ne[0] - 1) / (src0->ne[0] - 1);
sf1 = (float)(dst->ne[1] - 1) / (src0->ne[1] - 1);
}
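// Illustrative example: upscaling ne0 = 2 to ne0 = 4 gives sf0 = 2.0 by default, but with
// GGML_SCALE_FLAG_ALIGN_CORNERS it becomes (4 - 1) / (2 - 1) = 3.0. The align-corners
// shader path samples at i10 / sf0, so output column 0 maps to source column 0 and output
// column 3 maps exactly to source column 1 (first and last samples coincide with the corners).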
ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, { ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
(uint32_t)ggml_nelements(dst), 0, 0, (uint32_t)ggml_nelements(dst), 0, 0,
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1],
(uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3], (uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3],
sf0, sf1, sf2, sf3, sf0, sf1, sf2, sf3,
@ -7523,123 +7643,64 @@ static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, c
} }
static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
float * op_params = (float *)dst->op_params; vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
const uint32_t src0_type_size = ggml_type_size(src0->type); p.param1 = ggml_get_op_params_f32(dst, 0);
const uint32_t dst_type_size = ggml_type_size(dst->type); p.param2 = ggml_get_op_params_f32(dst, 1);
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SCALE, { ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SCALE, std::move(p), dryrun);
(uint32_t)ggml_nelements(src0),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
op_params[0], op_params[1],
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}, dryrun);
} }
static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type); ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, vk_op_unary_push_constants_init(src0, dst), dryrun);
const uint32_t dst_type_size = ggml_type_size(dst->type);
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, {
(uint32_t)ggml_nelements(src0),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}, dryrun);
} }
static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type); ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, vk_op_unary_push_constants_init(src0, dst), dryrun);
const uint32_t dst_type_size = ggml_type_size(dst->type);
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, {
(uint32_t)ggml_nelements(src0),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}, dryrun);
} }
static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type); ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_COS, vk_op_unary_push_constants_init(src0, dst), dryrun);
const uint32_t dst_type_size = ggml_type_size(dst->type);
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_COS, {
(uint32_t)ggml_nelements(src0),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}, dryrun);
} }
static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
float * op_params = (float *)dst->op_params; vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
const uint32_t src0_type_size = ggml_type_size(src0->type); p.param1 = ggml_get_op_params_f32(dst, 0);
const uint32_t dst_type_size = ggml_type_size(dst->type); p.param2 = ggml_get_op_params_f32(dst, 1);
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CLAMP, { ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CLAMP, std::move(p), dryrun);
(uint32_t)ggml_nelements(src0),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
op_params[0], op_params[1],
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}, dryrun);
} }
static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type); vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
const uint32_t dst_type_size = ggml_type_size(dst->type); ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_PAD, std::move(p), dryrun);
}
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_PAD, { static void ggml_vk_roll(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
(uint32_t)ggml_nelements(dst), const int32_t s0 = ggml_get_op_params_i32(dst, 0);
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, const int32_t s1 = ggml_get_op_params_i32(dst, 1);
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, const int32_t s2 = ggml_get_op_params_i32(dst, 2);
0, const int32_t s3 = ggml_get_op_params_i32(dst, 3);
0.0f, 0.0f, const uint32_t s01_packed = ((s0 + 0x8000) << 16) | (s1 + 0x8000);
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, const uint32_t s23_packed = ((s2 + 0x8000) << 16) | (s3 + 0x8000);
}, dryrun);
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
memcpy(&p.param1, &s01_packed, sizeof(float));
memcpy(&p.param2, &s23_packed, sizeof(float));
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ROLL, std::move(p), dryrun);
} }
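// Round-trip sketch (illustrative only): a shift of s0 = -3, s1 = 5 packs as
//   s01_packed = ((-3 + 0x8000) << 16) | (5 + 0x8000) = 0x7FFD8005
// and roll.comp recovers it with
//   int(s01_packed >> 16)    - 0x8000 = -3
//   int(s01_packed & 0xFFFF) - 0x8000 =  5
// The 0x8000 bias keeps both halves non-negative 16-bit values; the memcpy into
// param1/param2 only reinterprets the bits as float, no numeric conversion happens.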
static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type); vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
const uint32_t dst_type_size = ggml_type_size(dst->type); ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, std::move(p), dryrun);
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, {
(uint32_t)ggml_nelements(dst),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}, dryrun);
} }
static void ggml_vk_repeat_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { static void ggml_vk_repeat_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type); vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
const uint32_t dst_type_size = ggml_type_size(dst->type); ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, std::move(p), dryrun);
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, {
(uint32_t)ggml_nelements(dst),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}, dryrun);
} }
static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
uint32_t ne = (uint32_t)ggml_nelements(src0); uint32_t ne = (uint32_t)ggml_nelements(src0);
if (ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) { if (ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
// Convert from number of logical elements to 2- or 4-byte units. // Convert from number of logical elements to 2- or 4-byte units.
@ -7651,13 +7712,22 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const
} }
} }
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, { vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ne);
ne, ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, std::move(p), dryrun);
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, }
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t src1_type_size = ggml_type_size(src1->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SET_ROWS, {
(uint32_t)ggml_nelements(src0),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0, 0,
0.0f, 0.0f, 0.0f, 0.0f, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}, dryrun); }, dryrun);
} }
@ -8980,7 +9050,9 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
case GGML_OP_COS: case GGML_OP_COS:
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_ROLL:
case GGML_OP_CPY: case GGML_OP_CPY:
case GGML_OP_SET_ROWS:
case GGML_OP_CONT: case GGML_OP_CONT:
case GGML_OP_DUP: case GGML_OP_DUP:
case GGML_OP_SILU_BACK: case GGML_OP_SILU_BACK:
@ -9047,6 +9119,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_CPY: case GGML_OP_CPY:
case GGML_OP_SET_ROWS:
case GGML_OP_CONT: case GGML_OP_CONT:
case GGML_OP_DUP: case GGML_OP_DUP:
case GGML_OP_SILU_BACK: case GGML_OP_SILU_BACK:
@ -9149,12 +9222,20 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
case GGML_OP_PAD: case GGML_OP_PAD:
ggml_vk_pad(ctx, compute_ctx, src0, node, dryrun); ggml_vk_pad(ctx, compute_ctx, src0, node, dryrun);
break;
case GGML_OP_ROLL:
ggml_vk_roll(ctx, compute_ctx, src0, node, dryrun);
break; break;
case GGML_OP_CPY: case GGML_OP_CPY:
case GGML_OP_CONT: case GGML_OP_CONT:
case GGML_OP_DUP: case GGML_OP_DUP:
ggml_vk_cpy(ctx, compute_ctx, src0, node, dryrun); ggml_vk_cpy(ctx, compute_ctx, src0, node, dryrun);
break;
case GGML_OP_SET_ROWS:
ggml_vk_set_rows(ctx, compute_ctx, src0, src1, node, dryrun);
break; break;
case GGML_OP_SILU_BACK: case GGML_OP_SILU_BACK:
ggml_vk_silu_back(ctx, compute_ctx, src0, src1, node, dryrun); ggml_vk_silu_back(ctx, compute_ctx, src0, src1, node, dryrun);
@ -9369,7 +9450,9 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
case GGML_OP_COS: case GGML_OP_COS:
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_ROLL:
case GGML_OP_CPY: case GGML_OP_CPY:
case GGML_OP_SET_ROWS:
case GGML_OP_CONT: case GGML_OP_CONT:
case GGML_OP_DUP: case GGML_OP_DUP:
case GGML_OP_SILU_BACK: case GGML_OP_SILU_BACK:
@ -10435,9 +10518,20 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
} break; } break;
case GGML_OP_SET_ROWS: case GGML_OP_SET_ROWS:
{ {
// TODO: add support switch (op->type) {
// ref: https://github.com/ggml-org/llama.cpp/pull/14274 case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_BF16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_IQ4_NL:
return true;
default:
return false; return false;
}
} break; } break;
case GGML_OP_CONT: case GGML_OP_CONT:
case GGML_OP_CPY: case GGML_OP_CPY:
@ -10523,13 +10617,12 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
return op->src[0]->type == GGML_TYPE_F32; return op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_UPSCALE: case GGML_OP_UPSCALE:
return op->op_params[0] == GGML_SCALE_MODE_NEAREST;
case GGML_OP_ACC: case GGML_OP_ACC:
case GGML_OP_CONCAT: case GGML_OP_CONCAT:
case GGML_OP_SCALE: case GGML_OP_SCALE:
case GGML_OP_PAD: case GGML_OP_PAD:
case GGML_OP_ROLL:
case GGML_OP_DIAG_MASK_INF: case GGML_OP_DIAG_MASK_INF:
return true;
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
case GGML_OP_SOFT_MAX_BACK: case GGML_OP_SOFT_MAX_BACK:
case GGML_OP_ARGSORT: case GGML_OP_ARGSORT:
@ -11052,6 +11145,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
} else { } else {
tensor_clone = ggml_cpy(ggml_ctx, src_clone[0], src_clone[1]); tensor_clone = ggml_cpy(ggml_ctx, src_clone[0], src_clone[1]);
} }
} else if (tensor->op == GGML_OP_SET_ROWS) {
tensor_clone = ggml_set_rows(ggml_ctx, src_clone[0], src_clone[1]);
} else if (tensor->op == GGML_OP_CONT) { } else if (tensor->op == GGML_OP_CONT) {
tensor_clone = ggml_cont_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); tensor_clone = ggml_cont_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
} else if (tensor->op == GGML_OP_RESHAPE) { } else if (tensor->op == GGML_OP_RESHAPE) {
@ -6,17 +6,25 @@ spirv_execution_mode(capabilities = [4467], 4462, 16); // RoundingModeRTE, 16 bi
#endif // RTE16 #endif // RTE16
#include "types.comp" #include "types.comp"
#include "generic_unary_head.comp"
#if defined(DATA_A_IQ4_NL) #if defined(SET_ROWS) && QUANT_K == 1
// 16 invocations needed for init_iq4nl_shmem layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout(local_size_x = 16, local_size_y = 1, local_size_z = 1) in; const uint BLOCK_SIZE = 512;
#else #else
layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in; layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
const uint BLOCK_SIZE = 32;
#endif #endif
layout (binding = 0) readonly buffer S {float data_s[];}; layout (binding = 0) readonly buffer S {float data_s[];};
#if defined(SET_ROWS)
#include "generic_binary_head.comp"
layout (binding = 1) readonly buffer C {uvec2 data_i[];};
layout (binding = 2) writeonly buffer Q {A_TYPE data_q[];};
#else
#include "generic_unary_head.comp"
layout (binding = 1) writeonly buffer Q {A_TYPE data_q[];}; layout (binding = 1) writeonly buffer Q {A_TYPE data_q[];};
#endif
#if defined(DATA_A_Q4_0) #if defined(DATA_A_Q4_0)
void quantize(uint dst_idx, uint src_idx) void quantize(uint dst_idx, uint src_idx)
@ -221,15 +229,56 @@ void quantize(uint dst_idx, uint src_idx)
} }
#endif #endif
#if defined(DATA_A_F32) || defined(DATA_A_F16)
void quantize(uint dst_idx, uint src_idx)
{
data_q[dst_idx] = A_TYPE(data_s[src_idx]);
}
#endif
#if defined(DATA_A_BF16)
void quantize(uint dst_idx, uint src_idx)
{
data_q[dst_idx] = A_TYPE(fp32_to_bf16(data_s[src_idx]));
}
#endif
#if defined(SET_ROWS)
void main() { void main() {
#ifdef NEEDS_INIT_IQ_SHMEM #ifdef NEEDS_INIT_IQ_SHMEM
init_iq_shmem(gl_WorkGroupSize); init_iq_shmem(gl_WorkGroupSize);
if (gl_LocalInvocationIndex.x != 0) {
return;
}
#endif #endif
const uint idx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x * QUANT_K; const uint idx = ((gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x) * BLOCK_SIZE + gl_LocalInvocationID.x) * QUANT_K;
if (idx >= p.ne) {
return;
}
uint i00, i01, i02, i03;
get_indices(idx, i00, i01, i02, i03);
uint i12 = fastmod(i03, p.ne12);
uint i11 = fastmod(i02, p.ne11);
uint i10 = i01;
uint i1 = data_i[src1_idx(i10, i11, i12, 0) + get_boffset()].x;
uint src0_idx = src0_idx(i00, i01, i02, i03) + get_aoffset();
uint dst_idx = dst_idx(i00 / QUANT_K, i1, i02, i03) + get_doffset();
quantize(dst_idx, src0_idx);
}
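// Rough walk-through (illustrative, assuming a Q4_0 destination with QUANT_K = 32 and
// ne00 = 128): the thread whose idx lands at i00 = 96 in source row i01 = 2 looks up the
// destination row i1 = data_i[src1_idx(2, i11, i12, 0) + get_boffset()].x and quantizes
// source elements 96..127 of that row into block 96 / 32 = 3 of destination row i1.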
#else
void main() {
#ifdef NEEDS_INIT_IQ_SHMEM
init_iq_shmem(gl_WorkGroupSize);
#endif
const uint idx = (gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x * 32 + gl_LocalInvocationID.x) * QUANT_K;
if (idx >= p.ne) { if (idx >= p.ne) {
return; return;
@ -240,3 +289,5 @@ void main() {
quantize(dst_idx, src_idx); quantize(dst_idx, src_idx);
} }
#endif
@ -18,6 +18,7 @@
#extension GL_KHR_cooperative_matrix : enable #extension GL_KHR_cooperative_matrix : enable
#extension GL_KHR_memory_scope_semantics : enable #extension GL_KHR_memory_scope_semantics : enable
#extension GL_KHR_shader_subgroup_basic : enable #extension GL_KHR_shader_subgroup_basic : enable
#extension GL_KHR_shader_subgroup_ballot : enable
#endif #endif
#ifdef MUL_MAT_ID #ifdef MUL_MAT_ID
@ -104,6 +105,10 @@ shared FLOAT_TYPE buf_b[BN * SHMEM_STRIDE];
#ifdef MUL_MAT_ID #ifdef MUL_MAT_ID
shared u16vec2 row_ids[4096]; shared u16vec2 row_ids[4096];
uint _ne1;
#ifdef COOPMAT
shared uint _ne1_sh;
#endif
#endif // MUL_MAT_ID #endif // MUL_MAT_ID
#define NUM_WARPS (BLOCK_SIZE / WARP) #define NUM_WARPS (BLOCK_SIZE / WARP)
@ -172,7 +177,47 @@ void main() {
const uint loadstride_b = gl_WorkGroupSize.x * LOAD_VEC_B / BK; const uint loadstride_b = gl_WorkGroupSize.x * LOAD_VEC_B / BK;
#ifdef MUL_MAT_ID #ifdef MUL_MAT_ID
uint _ne1 = 0; #ifdef COOPMAT
// Spread the search across all elements in the first subgroup
if (gl_SubgroupID == 0) {
_ne1 = 0;
uint num_elements = p.nei1 * p.nei0;
uint ids[16];
uint iter = 0;
for (uint j = 0; j < num_elements; j += gl_SubgroupSize) {
// prefetch up to 16 elements
if (iter == 0) {
[[unroll]] for (uint k = 0; k < 16; ++k) {
uint i = j + gl_SubgroupInvocationID + k*gl_SubgroupSize;
bool in_range = i < num_elements;
uint ii1 = i / p.nei0;
uint ii0 = i % p.nei0;
ids[k] = in_range ? data_ids[ii1*p.nbi1 + ii0] : 0;
}
}
uint i = j + gl_SubgroupInvocationID;
bool in_range = i < num_elements;
uint ii1 = i / p.nei0;
uint ii0 = i % p.nei0;
uint id = ids[iter++];
uvec4 ballot = subgroupBallot(in_range && id == expert_idx);
uint idx = subgroupBallotExclusiveBitCount(ballot);
if (in_range && id == expert_idx) {
row_ids[_ne1 + idx] = u16vec2(ii0, ii1);
}
_ne1 += subgroupBallotBitCount(ballot);
iter &= 15;
}
_ne1_sh = _ne1;
}
barrier();
_ne1 = _ne1_sh;
#else
_ne1 = 0;
for (uint ii1 = 0; ii1 < p.nei1; ii1++) { for (uint ii1 = 0; ii1 < p.nei1; ii1++) {
for (uint ii0 = 0; ii0 < p.nei0; ii0++) { for (uint ii0 = 0; ii0 < p.nei0; ii0++) {
if (data_ids[ii1*p.nbi1 + ii0] == expert_idx) { if (data_ids[ii1*p.nbi1 + ii0] == expert_idx) {
@ -183,6 +228,7 @@ void main() {
} }
barrier(); barrier();
#endif
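// How the ballot-based path above compacts matching rows (illustrative): each step, every
// subgroup invocation tests one (ii0, ii1) candidate; subgroupBallot collects which lanes
// matched expert_idx, subgroupBallotExclusiveBitCount gives each matching lane its slot
// within the step, and subgroupBallotBitCount advances _ne1. E.g. with gl_SubgroupSize = 32,
// if only lanes 3 and 17 match, they write row_ids[_ne1 + 0] and row_ids[_ne1 + 1] and _ne1
// grows by 2. The ids[16] buffer just batches the data_ids loads, refilled every 16 steps
// (iter &= 15).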
// Workgroup has no work // Workgroup has no work
if (ic * BN >= _ne1) return; if (ic * BN >= _ne1) return;
@ -162,17 +162,32 @@ void main() {
_ne1 = 0; _ne1 = 0;
uint num_elements = p.nei1 * p.nei0; uint num_elements = p.nei1 * p.nei0;
for (uint i = gl_SubgroupInvocationID; subgroupAny(i < num_elements); i += gl_SubgroupSize) { uint ids[16];
uint iter = 0;
for (uint j = 0; j < num_elements; j += gl_SubgroupSize) {
// prefetch up to 16 elements
if (iter == 0) {
[[unroll]] for (uint k = 0; k < 16; ++k) {
uint i = j + gl_SubgroupInvocationID + k*gl_SubgroupSize;
bool in_range = i < num_elements; bool in_range = i < num_elements;
uint ii0 = i % p.nei0;
uint ii1 = i / p.nei0; uint ii1 = i / p.nei0;
uint id = in_range ? data_ids[ii1*p.nbi1 + ii0] : 0; uint ii0 = i % p.nei0;
ids[k] = in_range ? data_ids[ii1*p.nbi1 + ii0] : 0;
}
}
uint i = j + gl_SubgroupInvocationID;
bool in_range = i < num_elements;
uint ii1 = i / p.nei0;
uint ii0 = i % p.nei0;
uint id = ids[iter++];
uvec4 ballot = subgroupBallot(in_range && id == expert_idx); uvec4 ballot = subgroupBallot(in_range && id == expert_idx);
uint idx = subgroupBallotExclusiveBitCount(ballot); uint idx = subgroupBallotExclusiveBitCount(ballot);
if (in_range && id == expert_idx) { if (in_range && id == expert_idx) {
row_ids[_ne1 + idx] = u16vec4(ii0 % p.ne11, ii1, ii0, 0); row_ids[_ne1 + idx] = u16vec4(ii0 % p.ne11, ii1, ii0, 0);
} }
_ne1 += subgroupBallotBitCount(ballot); _ne1 += subgroupBallotBitCount(ballot);
iter &= 15;
} }
_ne1_sh = _ne1; _ne1_sh = _ne1;
} }
@ -414,6 +429,19 @@ void main() {
fetch_scales(ir * BM, pos_a, stride_a, block_k + BK, tid, false); fetch_scales(ir * BM, pos_a, stride_a, block_k + BK, tid, false);
} }
if ((ir + 1) * BM <= p.M && block_k + BK <= end_k) {
coopmat<MAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
coopmat<MAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b;
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA);
#ifdef MUL_MAT_ID
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose, decodeFuncB);
#else
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose);
#endif
sum = coopMatMulAdd(mat_a, mat_b, sum);
} else {
coopmat<MAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a; coopmat<MAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
coopmat<MAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b; coopmat<MAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b;
@ -426,6 +454,7 @@ void main() {
sum = coopMatMulAdd(mat_a, mat_b, sum); sum = coopMatMulAdd(mat_a, mat_b, sum);
} }
}
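// The branch above appears to special-case tiles that are fully in range along M and K
// ((ir + 1) * BM <= p.M && block_k + BK <= end_k) so the A tile can be loaded without the
// clamped tensor layout; partial edge tiles keep using the clamped load in the else branch.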
// Convert from ACC_TYPE to D_TYPE // Convert from ACC_TYPE to D_TYPE
coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator> mat_d; coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator> mat_d;
@ -0,0 +1,46 @@
#version 450
#include "types.comp"
#include "generic_unary_head.comp"
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
uint wrap_idx(int i, uint ne) {
if (i < 0) {
return i + ne;
} else if (i >= ne) {
return i - ne;
}
return i;
}
void main() {
const uint idx = get_idx();
if (idx >= p.ne) {
return;
}
const uint i3 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
const uint i3_offset = i3 * p.ne12*p.ne11*p.ne10;
const uint i2 = fastdiv(idx - i3_offset, p.ne1_01mp, p.ne1_01L);
const uint i2_offset = i2*p.ne11*p.ne10;
const uint i1 = fastdiv(idx - i3_offset - i2_offset, p.ne1_0mp, p.ne1_0L);
const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10;
const uint p1 = floatBitsToUint(p.param1);
const uint p2 = floatBitsToUint(p.param2);
const int s0 = int(p1 >> 16) - 0x8000;
const int s1 = int(p1 & 0xFFFF) - 0x8000;
const int s2 = int(p2 >> 16) - 0x8000;
const int s3 = int(p2 & 0xFFFF) - 0x8000;
const uint i00 = wrap_idx(int(i0) - s0, p.ne10);
const uint i01 = wrap_idx(int(i1) - s1, p.ne11);
const uint i02 = wrap_idx(int(i2) - s2, p.ne12);
const uint i03 = wrap_idx(int(i3) - s3, p.ne13);
const uint a_idx = i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i00*p.nb00;
const uint d_idx = i3 *p.nb13 + i2 *p.nb12 + i1 *p.nb11 + i0 *p.nb10;
data_d[get_doffset() + d_idx] = D_TYPE(data_a[get_aoffset() + a_idx]);
}
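// Semantics sketch (illustrative): along dim 0 the shader computes
//   data_d[i0] = data_a[wrap_idx(i0 - s0, ne10)]
// e.g. with ne10 = 8 and s0 = 3, output index 1 reads source index (1 - 3) + 8 = 6 and
// output index 5 reads source index 2, i.e. the row is rotated forward by 3 with wraparound.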
@ -3,6 +3,7 @@
layout (push_constant) uniform parameter layout (push_constant) uniform parameter
{ {
uint ne; uint a_offset; uint d_offset; uint ne; uint a_offset; uint d_offset;
uint ne00; uint ne01;
uint nb00; uint nb01; uint nb02; uint nb03; uint nb00; uint nb01; uint nb02; uint nb03;
uint ne10; uint ne11; uint ne12; uint ne13; uint ne10; uint ne11; uint ne12; uint ne13;
float sf0; float sf1; float sf2; float sf3; float sf0; float sf1; float sf2; float sf3;
@ -15,6 +16,61 @@ layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
// from ggml.h: enum ggml_scale_mode, enum ggml_scale_flag
#define NEAREST 0
#define BILINEAR 1
#define ALIGN_CORNERS (1 << 8)
layout (constant_id = 0) const uint scale_mode = 0;
float fetch_nearest(uint i10, uint i11, uint i12, uint i13) {
const uint i00 = uint(i10 / p.sf0);
const uint i01 = uint(i11 / p.sf1);
const uint i02 = uint(i12 / p.sf2);
const uint i03 = uint(i13 / p.sf3);
return data_a[p.a_offset + i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00];
}
float fetch_bilinear(ivec2 c0, ivec2 c1, vec2 d, uint i12, uint i13) {
const uint i02 = uint(i12 / p.sf2);
const uint i03 = uint(i13 / p.sf3);
const uint base = p.a_offset + i03 * p.nb03 + i02 * p.nb02;
const float v00 = data_a[base + c0.y * p.nb01 + c0.x * p.nb00];
const float v01 = data_a[base + c0.y * p.nb01 + c1.x * p.nb00];
const float v10 = data_a[base + c1.y * p.nb01 + c0.x * p.nb00];
const float v11 = data_a[base + c1.y * p.nb01 + c1.x * p.nb00];
return
v00 * (1.0-d.x) * (1.0-d.y) +
v01 * d.x * (1.0-d.y) +
v10 * (1.0-d.x) * d.y +
v11 * d.x * d.y;
}
float interpolate_bilinear(uint i10, uint i11, uint i12, uint i13) {
const ivec2 ne0 = ivec2(p.ne00, p.ne01);
const vec2 c = (vec2(i10, i11) + 0.5) / vec2(p.sf0, p.sf1) - 0.5;
const vec2 c0f = floor(c);
const vec2 d = c - c0f;
const ivec2 c0 = max(ivec2(c0f), 0);
const ivec2 c1 = min(ivec2(c0f + 1), ne0 - 1);
return fetch_bilinear(c0, c1, d, i12, i13);
}
float interpolate_bilinear_align_corners(uint i10, uint i11, uint i12, uint i13) {
const vec2 c = vec2(i10, i11) / vec2(p.sf0, p.sf1);
const vec2 c0f = floor(c);
const vec2 d = c - c0f;
const ivec2 c0 = ivec2(c0f);
const ivec2 c1 = c0 + 1;
return fetch_bilinear(c0, c1, d, i12, i13);
}
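// Illustrative example for the half-pixel path above: with sf0 = 2 (e.g. a 4 -> 8 upscale),
// output column i10 = 3 gives c.x = 3.5 / 2 - 0.5 = 1.25, so c0.x = 1, c1.x = 2, d.x = 0.25,
// a 75/25 blend of source columns 1 and 2; the max/min clamp keeps c0/c1 inside [0, ne00 - 1]
// at the borders. The align-corners variant instead uses c = i / sf with sf = (dst - 1) / (src - 1),
// so the first and last output samples hit the first and last source samples exactly.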
void main() { void main() {
const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
@ -27,10 +83,18 @@ void main() {
const uint i12 = (idx / (p.ne10 * p.ne11)) % p.ne12; const uint i12 = (idx / (p.ne10 * p.ne11)) % p.ne12;
const uint i13 = (idx / (p.ne10 * p.ne11 * p.ne12)) % p.ne13; const uint i13 = (idx / (p.ne10 * p.ne11 * p.ne12)) % p.ne13;
const uint i00 = uint(i10 / p.sf0); float result;
const uint i01 = uint(i11 / p.sf1); switch (scale_mode) {
const uint i02 = uint(i12 / p.sf2); case NEAREST:
const uint i03 = uint(i13 / p.sf3); result = fetch_nearest(i10, i11, i12, i13);
break;
case BILINEAR:
result = interpolate_bilinear(i10, i11, i12, i13);
break;
case BILINEAR | ALIGN_CORNERS:
result = interpolate_bilinear_align_corners(i10, i11, i12, i13);
break;
}
data_d[p.d_offset + idx] = D_TYPE(data_a[p.a_offset + i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]); data_d[p.d_offset + idx] = D_TYPE(result);
} }
@ -532,6 +532,11 @@ void process_shaders() {
string_to_spv("cpy_" + t + "_f32", "copy_from_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); string_to_spv("cpy_" + t + "_f32", "copy_from_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
} }
for (std::string t : {"f32", "f16", "bf16", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) {
string_to_spv("set_rows_" + t, "copy_to_quant.comp", {{"SET_ROWS", "1"}, {"DATA_A_" + to_uppercase(t), "1"}, {"B_TYPE", "uvec2"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
string_to_spv("set_rows_" + t + "_rte", "copy_to_quant.comp", {{"SET_ROWS", "1"}, {"DATA_A_" + to_uppercase(t), "1"}, {"B_TYPE", "uvec2"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"RTE16", "1"}});
}
auto get_type_str = [](bool f16) { auto get_type_str = [](bool f16) {
return f16 ? "float16_t" : "float"; return f16 ? "float16_t" : "float";
}; };
@ -662,6 +667,8 @@ void process_shaders() {
string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}})); string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}}));
string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}})); string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}}));
string_to_spv("roll_f32", "roll.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
for (auto &c : compiles) { for (auto &c : compiles) {
c.wait(); c.wait();
} }
@ -187,6 +187,9 @@ class Keys:
class Classifier: class Classifier:
OUTPUT_LABELS = "{arch}.classifier.output_labels" OUTPUT_LABELS = "{arch}.classifier.output_labels"
class ShortConv:
L_CACHE = "{arch}.shortconv.l_cache"
class Tokenizer: class Tokenizer:
MODEL = "tokenizer.ggml.model" MODEL = "tokenizer.ggml.model"
PRE = "tokenizer.ggml.pre" PRE = "tokenizer.ggml.pre"
@ -352,6 +355,7 @@ class MODEL_ARCH(IntEnum):
EXAONE = auto() EXAONE = auto()
GRANITE = auto() GRANITE = auto()
GRANITE_MOE = auto() GRANITE_MOE = auto()
GRANITE_HYBRID = auto()
CHAMELEON = auto() CHAMELEON = auto()
WAVTOKENIZER_DEC = auto() WAVTOKENIZER_DEC = auto()
PLM = auto() PLM = auto()
@ -361,6 +365,7 @@ class MODEL_ARCH(IntEnum):
ERNIE4_5 = auto() ERNIE4_5 = auto()
HUNYUAN_MOE = auto() HUNYUAN_MOE = auto()
SMOLLM3 = auto() SMOLLM3 = auto()
LFM2 = auto()
class VISION_PROJECTOR_TYPE(IntEnum): class VISION_PROJECTOR_TYPE(IntEnum):
@ -532,6 +537,9 @@ class MODEL_TENSOR(IntEnum):
POSNET_ATTN_K = auto() POSNET_ATTN_K = auto()
POSNET_ATTN_V = auto() POSNET_ATTN_V = auto()
POSNET_ATTN_OUT = auto() POSNET_ATTN_OUT = auto()
SHORTCONV_CONV = auto()
SHORTCONV_INPROJ = auto()
SHORTCONV_OUTPROJ = auto()
# vision # vision
V_MMPROJ = auto() V_MMPROJ = auto()
V_MMPROJ_FC = auto() V_MMPROJ_FC = auto()
@ -661,6 +669,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.EXAONE: "exaone", MODEL_ARCH.EXAONE: "exaone",
MODEL_ARCH.GRANITE: "granite", MODEL_ARCH.GRANITE: "granite",
MODEL_ARCH.GRANITE_MOE: "granitemoe", MODEL_ARCH.GRANITE_MOE: "granitemoe",
MODEL_ARCH.GRANITE_HYBRID: "granitehybrid",
MODEL_ARCH.CHAMELEON: "chameleon", MODEL_ARCH.CHAMELEON: "chameleon",
MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec", MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
MODEL_ARCH.PLM: "plm", MODEL_ARCH.PLM: "plm",
@ -671,6 +680,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.FALCON_H1: "falcon-h1", MODEL_ARCH.FALCON_H1: "falcon-h1",
MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe", MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe",
MODEL_ARCH.SMOLLM3: "smollm3", MODEL_ARCH.SMOLLM3: "smollm3",
MODEL_ARCH.LFM2: "lfm2",
} }
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@ -842,6 +852,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k", MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k",
MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v", MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v",
MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output", MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output",
MODEL_TENSOR.SHORTCONV_CONV: "blk.{bid}.shortconv.conv",
MODEL_TENSOR.SHORTCONV_INPROJ: "blk.{bid}.shortconv.in_proj",
MODEL_TENSOR.SHORTCONV_OUTPROJ: "blk.{bid}.shortconv.out_proj",
# vision # vision
MODEL_TENSOR.V_MMPROJ: "mm.{bid}", MODEL_TENSOR.V_MMPROJ: "mm.{bid}",
MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc", MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc",
@ -2143,6 +2156,36 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_UP_SHEXP, MODEL_TENSOR.FFN_UP_SHEXP,
MODEL_TENSOR.FFN_DOWN_SHEXP, MODEL_TENSOR.FFN_DOWN_SHEXP,
], ],
MODEL_ARCH.GRANITE_HYBRID: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.SSM_IN,
MODEL_TENSOR.SSM_CONV1D,
MODEL_TENSOR.SSM_DT,
MODEL_TENSOR.SSM_A,
MODEL_TENSOR.SSM_D,
MODEL_TENSOR.SSM_NORM,
MODEL_TENSOR.SSM_OUT,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
# MoE
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_GATE_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,
MODEL_TENSOR.FFN_DOWN_SHEXP,
# Dense
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.CHAMELEON: [ MODEL_ARCH.CHAMELEON: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT_NORM,
@ -2324,6 +2367,24 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
], ],
MODEL_ARCH.LFM2: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.TOKEN_EMBD_NORM,
MODEL_TENSOR.SHORTCONV_CONV,
MODEL_TENSOR.SHORTCONV_INPROJ,
MODEL_TENSOR.SHORTCONV_OUTPROJ,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.ATTN_NORM, # operator_norm
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
],
# TODO # TODO
} }
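# Minimal sketch (not part of the change): how the new LFM2 shortconv entries above expand
# into GGUF tensor names via TENSOR_NAMES; everything referenced here is defined in this module.
#
#     from gguf.constants import MODEL_ARCH, MODEL_TENSOR, MODEL_TENSORS, TENSOR_NAMES
#
#     for t in MODEL_TENSORS[MODEL_ARCH.LFM2]:
#         if t in (MODEL_TENSOR.SHORTCONV_CONV, MODEL_TENSOR.SHORTCONV_INPROJ, MODEL_TENSOR.SHORTCONV_OUTPROJ):
#             print(TENSOR_NAMES[t].format(bid=0))
#     # -> blk.0.shortconv.conv, blk.0.shortconv.in_proj, blk.0.shortconv.out_proj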
@ -648,6 +648,9 @@ class GGUFWriter:
def add_convnext_block_count(self, length: int) -> None: def add_convnext_block_count(self, length: int) -> None:
self.add_uint32(Keys.ConvNext.BLOCK_COUNT.format(arch=self.arch), length) self.add_uint32(Keys.ConvNext.BLOCK_COUNT.format(arch=self.arch), length)
def add_shortconv_l_cache(self, length: int) -> None:
self.add_uint32(Keys.ShortConv.L_CACHE.format(arch=self.arch), length)
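# Usage sketch (illustrative; the file name and arch value are hypothetical):
#     writer = GGUFWriter("model.gguf", arch="lfm2")
#     writer.add_shortconv_l_cache(3)   # emits the uint32 key "lfm2.shortconv.l_cache"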
def add_block_count(self, length: int) -> None: def add_block_count(self, length: int) -> None:
self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length) self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length)
@ -13,7 +13,7 @@ class TensorNameMap:
"transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone
"transformer.word_embeddings", # falcon "transformer.word_embeddings", # falcon
"word_embeddings", # bloom "word_embeddings", # bloom
"model.embed_tokens", # llama-hf nemotron olmoe olmo2 rwkv6qwen2 glm4-0414 "model.embed_tokens", # llama-hf nemotron olmoe olmo2 rwkv6qwen2 glm4-0414 granite-hybrid
"tok_embeddings", # llama-pth "tok_embeddings", # llama-pth
"embeddings.word_embeddings", # bert nomic-bert "embeddings.word_embeddings", # bert nomic-bert
"language_model.embedding.word_embeddings", # persimmon "language_model.embedding.word_embeddings", # persimmon
@ -50,6 +50,7 @@ class TensorNameMap:
"model.pre_ln", # rwkv7 "model.pre_ln", # rwkv7
"model.layers.0.pre_norm", # rwkv7 "model.layers.0.pre_norm", # rwkv7
"backbone.norm", # wavtokenizer "backbone.norm", # wavtokenizer
"model.embedding_norm", # lfm2
), ),
# Position embeddings # Position embeddings
@ -118,7 +119,7 @@ class TensorNameMap:
"transformer.h.{bid}.input_layernorm", # falcon7b "transformer.h.{bid}.input_layernorm", # falcon7b
"h.{bid}.input_layernorm", # bloom "h.{bid}.input_layernorm", # bloom
"transformer.h.{bid}.ln_mlp", # falcon40b "transformer.h.{bid}.ln_mlp", # falcon40b
"model.layers.{bid}.input_layernorm", # llama-hf nemotron olmoe phimoe "model.layers.{bid}.input_layernorm", # llama-hf nemotron olmoe phimoe granite-hybrid
"layers.{bid}.attention_norm", # llama-pth "layers.{bid}.attention_norm", # llama-pth
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon "language_model.encoder.layers.{bid}.input_layernorm", # persimmon
"model.layers.{bid}.ln1", # yi "model.layers.{bid}.ln1", # yi
@ -136,6 +137,7 @@ class TensorNameMap:
"model.layers.{bid}.ln1", # rwkv7 "model.layers.{bid}.ln1", # rwkv7
"model.layers.{bid}.input_layernorm", # llama4 "model.layers.{bid}.input_layernorm", # llama4
"transformer_encoder.{bid}.attention_norm", # neobert "transformer_encoder.{bid}.attention_norm", # neobert
"model.layers.{bid}.operator_norm", # lfm2
), ),
# Attention norm 2 # Attention norm 2
@ -220,6 +222,7 @@ class TensorNameMap:
"transformer.h.{bid}.self_attention.dense", # falcon "transformer.h.{bid}.self_attention.dense", # falcon
"h.{bid}.self_attention.dense", # bloom "h.{bid}.self_attention.dense", # bloom
"model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2 phimoe "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2 phimoe
"model.layers.{bid}.self_attn.out_proj", # lfm2
"model.layers.{bid}.self_attn.linear_attn", # deci "model.layers.{bid}.self_attn.linear_attn", # deci
"layers.{bid}.attention.wo", # llama-pth "layers.{bid}.attention.wo", # llama-pth
"encoder.layer.{bid}.attention.output.dense", # bert "encoder.layer.{bid}.attention.output.dense", # bert
@ -279,7 +282,7 @@ class TensorNameMap:
"transformer.decoder_layer.{bid}.rms_norm_2", # Grok "transformer.decoder_layer.{bid}.rms_norm_2", # Grok
"encoder.layers.{bid}.post_attention_layernorm", # chatglm "encoder.layers.{bid}.post_attention_layernorm", # chatglm
"transformer.layers.{bid}.ffn_norm", # openelm "transformer.layers.{bid}.ffn_norm", # openelm
"model.layers.{bid}.pre_ff_layernorm", # jamba "model.layers.{bid}.pre_ff_layernorm", # jamba granite-hybrid
"model.layers.{bid}.pre_moe_layernorm", # mini-jamba "model.layers.{bid}.pre_moe_layernorm", # mini-jamba
"model.layers.{bid}.post_attention_layernorm", # llama4 "model.layers.{bid}.post_attention_layernorm", # llama4
"transformer_encoder.{bid}.ffn_norm", # neobert "transformer_encoder.{bid}.ffn_norm", # neobert
@ -349,7 +352,7 @@ class TensorNameMap:
"model.layers.{bid}.residual_mlp.w3", # arctic "model.layers.{bid}.residual_mlp.w3", # arctic
"encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm "encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm
"transformer.h.{bid}.mlp.c_fc_1", # exaone "transformer.h.{bid}.mlp.c_fc_1", # exaone
"model.layers.{bid}.feed_forward.up_proj", # llama4 jamba "model.layers.{bid}.feed_forward.up_proj", # llama4 jamba granite-hybrid
"transformer_encoder.{bid}.ffn.w12", # neobert "transformer_encoder.{bid}.ffn.w12", # neobert
), ),
@ -389,7 +392,7 @@ class TensorNameMap:
"transformer.h.{bid}.mlp.linear_1", # refact "transformer.h.{bid}.mlp.linear_1", # refact
"model.layers.{bid}.residual_mlp.w1", # arctic "model.layers.{bid}.residual_mlp.w1", # arctic
"transformer.h.{bid}.mlp.c_fc_0", # exaone "transformer.h.{bid}.mlp.c_fc_0", # exaone
"model.layers.{bid}.feed_forward.gate_proj", # llama4 jamba "model.layers.{bid}.feed_forward.gate_proj", # llama4 jamba granite-hybrid
), ),
MODEL_TENSOR.FFN_GATE_EXP: ( MODEL_TENSOR.FFN_GATE_EXP: (
@ -435,7 +438,7 @@ class TensorNameMap:
"encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2 "encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2
"encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm "encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm
"model.layers.h.{bid}.mlp.c_proj", # exaone "model.layers.h.{bid}.mlp.c_proj", # exaone
"model.layers.{bid}.feed_forward.down_proj", # llama4 jamba "model.layers.{bid}.feed_forward.down_proj", # llama4 jamba granite-hybrid
"transformer_encoder.{bid}.ffn.w3", # neobert "transformer_encoder.{bid}.ffn.w3", # neobert
), ),
@ -558,13 +561,13 @@ class TensorNameMap:
MODEL_TENSOR.SSM_IN: ( MODEL_TENSOR.SSM_IN: (
"model.layers.{bid}.in_proj", # mamba-hf "model.layers.{bid}.in_proj", # mamba-hf
"backbone.layers.{bid}.mixer.in_proj", # mamba "backbone.layers.{bid}.mixer.in_proj", # mamba
"model.layers.{bid}.mamba.in_proj", # jamba falcon-h1 "model.layers.{bid}.mamba.in_proj", # jamba falcon-h1 granite-hybrid
), ),
MODEL_TENSOR.SSM_CONV1D: ( MODEL_TENSOR.SSM_CONV1D: (
"model.layers.{bid}.conv1d", # mamba-hf "model.layers.{bid}.conv1d", # mamba-hf
"backbone.layers.{bid}.mixer.conv1d", # mamba "backbone.layers.{bid}.mixer.conv1d", # mamba
"model.layers.{bid}.mamba.conv1d", # jamba falcon-h1 "model.layers.{bid}.mamba.conv1d", # jamba falcon-h1 granite-hybrid
), ),
MODEL_TENSOR.SSM_X: ( MODEL_TENSOR.SSM_X: (
@ -576,7 +579,7 @@ class TensorNameMap:
MODEL_TENSOR.SSM_DT: ( MODEL_TENSOR.SSM_DT: (
"model.layers.{bid}.dt_proj", # mamba-hf "model.layers.{bid}.dt_proj", # mamba-hf
"backbone.layers.{bid}.mixer.dt_proj", # mamba "backbone.layers.{bid}.mixer.dt_proj", # mamba
"model.layers.{bid}.mamba.dt_proj", # jamba falcon-h1 "model.layers.{bid}.mamba.dt_proj", # jamba falcon-h1 granite-hybrid
), ),
MODEL_TENSOR.SSM_DT_NORM: ( MODEL_TENSOR.SSM_DT_NORM: (
@ -586,7 +589,7 @@ class TensorNameMap:
MODEL_TENSOR.SSM_A: ( MODEL_TENSOR.SSM_A: (
"model.layers.{bid}.A_log", # mamba-hf "model.layers.{bid}.A_log", # mamba-hf
"backbone.layers.{bid}.mixer.A_log", # mamba "backbone.layers.{bid}.mixer.A_log", # mamba
"model.layers.{bid}.mamba.A_log", # jamba falcon-h1 "model.layers.{bid}.mamba.A_log", # jamba falcon-h1 granite-hybrid
), ),
MODEL_TENSOR.SSM_B_NORM: ( MODEL_TENSOR.SSM_B_NORM: (
@ -602,18 +605,18 @@ class TensorNameMap:
MODEL_TENSOR.SSM_D: ( MODEL_TENSOR.SSM_D: (
"model.layers.{bid}.D", # mamba-hf "model.layers.{bid}.D", # mamba-hf
"backbone.layers.{bid}.mixer.D", # mamba "backbone.layers.{bid}.mixer.D", # mamba
"model.layers.{bid}.mamba.D", # jamba falcon-h1 "model.layers.{bid}.mamba.D", # jamba falcon-h1 granite-hybrid
), ),
MODEL_TENSOR.SSM_NORM: ( MODEL_TENSOR.SSM_NORM: (
"model.layers.{bid}.mamba.norm", # falcon-h1 "model.layers.{bid}.mamba.norm", # falcon-h1 granite-hybrid
"backbone.layers.{bid}.mixer.norm", # mamba2 "backbone.layers.{bid}.mixer.norm", # mamba2
), ),
MODEL_TENSOR.SSM_OUT: ( MODEL_TENSOR.SSM_OUT: (
"model.layers.{bid}.out_proj", # mamba-hf "model.layers.{bid}.out_proj", # mamba-hf
"backbone.layers.{bid}.mixer.out_proj", # mamba "backbone.layers.{bid}.mixer.out_proj", # mamba
"model.layers.{bid}.mamba.out_proj", # jamba falcon-h1 "model.layers.{bid}.mamba.out_proj", # jamba falcon-h1 granite-hybrid
), ),
MODEL_TENSOR.TIME_MIX_W0: ( MODEL_TENSOR.TIME_MIX_W0: (
@ -1015,6 +1018,18 @@ class TensorNameMap:
"backbone.posnet.{bid}.proj_out", # wavtokenizer "backbone.posnet.{bid}.proj_out", # wavtokenizer
), ),
MODEL_TENSOR.SHORTCONV_CONV: (
"model.layers.{bid}.conv.conv",
),
MODEL_TENSOR.SHORTCONV_INPROJ: (
"model.layers.{bid}.conv.in_proj",
),
MODEL_TENSOR.SHORTCONV_OUTPROJ: (
"model.layers.{bid}.conv.out_proj",
),
############################################################################# #############################################################################
## Vision encoder ## Vision encoder
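
With the three SHORTCONV entries added above, the generic converter can rename LFM2's HF tensors without model-specific code. A hedged sketch of the lookup, using the gguf-py helpers already present in this tree (the block count of 16 is illustrative):

# assumes the gguf-py package from this repository is importable
import gguf

tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LFM2, 16)
name = tmap.get_name("model.layers.0.conv.conv.weight", try_suffixes=(".weight", ".bias"))
print(name)  # expected: "blk.0.shortconv.conv.weight"
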

View file

@ -82,47 +82,6 @@ extern "C" {
LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
}; };
// pre-tokenization types
enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
};
enum llama_rope_type { enum llama_rope_type {
LLAMA_ROPE_TYPE_NONE = -1, LLAMA_ROPE_TYPE_NONE = -1,
LLAMA_ROPE_TYPE_NORM = 0, LLAMA_ROPE_TYPE_NORM = 0,

196
scripts/create_ops_docs.py Executable file
View file

@ -0,0 +1,196 @@
#!/usr/bin/env python3
"""
This script parses docs/ops/*.csv and generates docs/ops.md, a table documenting supported operations across the various ggml backends.
"""
import csv
import logging
import sys
from pathlib import Path
from collections import defaultdict
class DocsGenerator:
def __init__(self, ggml_root: str, output_filename: str = "ops.md"):
self.ggml_root = Path(ggml_root)
self.ops_dir = self.ggml_root / "docs" / "ops"
self.output_filename = output_filename
self.backend_support: dict[str, dict[str, list[bool]]] = defaultdict(
lambda: defaultdict(list)
)
self.all_operations: set[str] = set()
self.all_backends: set[str] = set()
self.logger = logging.getLogger(__name__)
def parse_support_files(self) -> None:
if not self.ops_dir.exists():
self.logger.warning(f"ops directory not found: {self.ops_dir}")
return
self.logger.info(f"Parsing support files from {self.ops_dir}...")
for support_file in self.ops_dir.glob("*.csv"):
self.logger.info(f" Reading: {support_file.name}")
self._parse_support_file(support_file)
def _parse_support_file(self, file_path: Path) -> None:
try:
with open(file_path, "r", newline='') as f:
reader = csv.DictReader(f)
for row in reader:
# Skip rows that don't have support mode
if row.get('test_mode') != 'support':
continue
backend_name = row.get('backend_name', '').strip()
operation = row.get('op_name', '').strip()
supported_str = row.get('error_message', '').strip() # "yes" or "no"
backend_reg_name = row.get('backend_reg_name', '').strip()
# Skip invalid or error operations
if not operation or not backend_name or operation in [
"CONTEXT_ERROR",
"BUILD_ERROR",
]:
continue
is_supported = supported_str.lower() == "yes"
# Use backend_reg_name for grouping, fallback to backend_name
backend_key = backend_reg_name if backend_reg_name else backend_name
self.all_backends.add(backend_key)
self.backend_support[backend_key][operation].append(is_supported)
self.all_operations.add(operation)
except Exception as e:
self.logger.error(f" Error parsing {file_path}: {e}")
def get_backend_support_status(self, backend: str, operation: str) -> str:
support_list = self.backend_support[backend].get(operation, [])
if not support_list:
return "unsupported"
all_supported = all(support_list)
any_supported = any(support_list)
if all_supported:
return "supported"
elif any_supported:
return "partially supported"
else:
return "unsupported"
def get_support_status(self, operation: str) -> str:
if operation not in self.all_operations:
return "unsupported"
support_count = 0
total_backends = len(self.all_backends)
for backend in self.all_backends:
if self.backend_support[backend].get(operation, False):
support_count += 1
if support_count == 0:
return "unsupported"
elif support_count == total_backends:
return "supported"
else:
return "partially supported"
def get_support_symbol(self, status: str) -> str:
symbols = {"supported": "", "partially supported": "🟡", "unsupported": ""}
return symbols.get(status, "")
def generate_markdown(self) -> str:
lines = []
lines.append("# GGML Operations")
lines.append("")
lines.append("List of GGML operations and backend support status.")
lines.append("")
lines.append("Legend:")
lines.append("- ✅ Fully supported by this backend")
lines.append("- 🟡 Partially supported by this backend")
lines.append("- ❌ Not supported by this backend")
lines.append("")
backends = sorted(self.all_backends)
header = "| Operation |"
for backend in backends:
header += f" {backend} |"
separator = "|-----------|"
for _ in backends:
separator += "------|"
lines.append(header)
lines.append(separator)
sorted_operations = sorted(self.all_operations)
for operation in sorted_operations:
row = f"| {operation:>32} |"
for backend in backends:
status = self.get_backend_support_status(backend, operation)
if status == "supported":
symbol = ""
elif status == "partially supported":
symbol = "🟡"
else:
symbol = ""
row += f" {symbol} |"
lines.append(row)
lines.append("")
return "\n".join(lines)
def run(self) -> None:
self.logger.info("Parsing GGML operation support files...")
self.parse_support_files()
if not self.all_operations:
self.logger.error(
"No operations found. Make sure to run test-backend-ops support --output csv > docs/ops/file.csv first."
)
return
self.logger.info(
f"Found {len(self.all_operations)} operations across {len(self.all_backends)} backends"
)
self.logger.info("Generating markdown...")
markdown_content = self.generate_markdown()
docs_dir = self.ggml_root / "docs"
docs_dir.mkdir(exist_ok=True)
ops_file = docs_dir / self.output_filename
with open(ops_file, "w") as f:
f.write(markdown_content)
self.logger.info(f"Generated: {ops_file}")
self.logger.info(f"Operations: {len(self.all_operations)}")
self.logger.info(f"Backends: {len(self.all_backends)}")
def main():
logging.basicConfig(level=logging.INFO)
if len(sys.argv) > 1:
output_filename = sys.argv[1]
else:
output_filename = "ops.md"
generator = DocsGenerator(".", output_filename)
generator.run()
if __name__ == "__main__":
main()
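
As a usage note, the script takes an optional output filename (defaulting to ops.md) and expects the per-backend CSVs to exist already. A minimal sketch of driving it programmatically, assuming scripts/ is on PYTHONPATH and the working directory is the repository root:

import logging
from create_ops_docs import DocsGenerator

logging.basicConfig(level=logging.INFO)
DocsGenerator(".", "ops.md").run()  # parses docs/ops/*.csv and writes docs/ops.md
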

View file

@ -73,6 +73,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_ARWKV7, "arwkv7" }, { LLM_ARCH_ARWKV7, "arwkv7" },
{ LLM_ARCH_GRANITE, "granite" }, { LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" }, { LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_GRANITE_HYBRID, "granitehybrid" },
{ LLM_ARCH_CHAMELEON, "chameleon" }, { LLM_ARCH_CHAMELEON, "chameleon" },
{ LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" }, { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
{ LLM_ARCH_PLM, "plm" }, { LLM_ARCH_PLM, "plm" },
@ -82,6 +83,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_ERNIE4_5, "ernie4_5" }, { LLM_ARCH_ERNIE4_5, "ernie4_5" },
{ LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" }, { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
{ LLM_ARCH_SMOLLM3, "smollm3" }, { LLM_ARCH_SMOLLM3, "smollm3" },
{ LLM_ARCH_LFM2, "lfm2" },
{ LLM_ARCH_UNKNOWN, "(unknown)" }, { LLM_ARCH_UNKNOWN, "(unknown)" },
}; };
@ -154,7 +156,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
{ LLM_KV_ATTENTION_LAYER_INDICES, "%s.attention.layer_indices" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" }, { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@ -188,6 +189,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" }, { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
{ LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
{ LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
{ LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
@ -1641,6 +1644,43 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
}, },
}, },
{
LLM_ARCH_GRANITE_HYBRID,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
// mamba(2) ssm layers
{ LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
{ LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
{ LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
{ LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
{ LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
{ LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" },
{ LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
// attention layers
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
// dense FFN
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
// moe FFN
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
// shared expert
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
{ {
LLM_ARCH_CHAMELEON, LLM_ARCH_CHAMELEON,
{ {
@ -1793,6 +1833,27 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
}, },
}, },
{
LLM_ARCH_LFM2,
{
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_SHORTCONV_CONV, "blk.%d.shortconv.conv" },
{ LLM_TENSOR_SHORTCONV_INPROJ, "blk.%d.shortconv.in_proj" },
{ LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" },
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
}
},
{ {
LLM_ARCH_UNKNOWN, LLM_ARCH_UNKNOWN,
{ {
@ -1960,6 +2021,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
{LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
}; };
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {} LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
@ -2027,10 +2091,11 @@ bool llm_arch_is_recurrent(const llm_arch & arch) {
} }
bool llm_arch_is_hybrid(const llm_arch & arch) { bool llm_arch_is_hybrid(const llm_arch & arch) {
// List all mamba-attention hybrid models here
switch (arch) { switch (arch) {
case LLM_ARCH_JAMBA: case LLM_ARCH_JAMBA:
case LLM_ARCH_FALCON_H1: case LLM_ARCH_FALCON_H1:
case LLM_ARCH_GRANITE_HYBRID:
case LLM_ARCH_LFM2:
return true; return true;
default: default:
return false; return false;

View file

@ -77,6 +77,7 @@ enum llm_arch {
LLM_ARCH_ARWKV7, LLM_ARCH_ARWKV7,
LLM_ARCH_GRANITE, LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE, LLM_ARCH_GRANITE_MOE,
LLM_ARCH_GRANITE_HYBRID,
LLM_ARCH_CHAMELEON, LLM_ARCH_CHAMELEON,
LLM_ARCH_WAVTOKENIZER_DEC, LLM_ARCH_WAVTOKENIZER_DEC,
LLM_ARCH_PLM, LLM_ARCH_PLM,
@ -86,6 +87,7 @@ enum llm_arch {
LLM_ARCH_ERNIE4_5, LLM_ARCH_ERNIE4_5,
LLM_ARCH_HUNYUAN_MOE, LLM_ARCH_HUNYUAN_MOE,
LLM_ARCH_SMOLLM3, LLM_ARCH_SMOLLM3,
LLM_ARCH_LFM2,
LLM_ARCH_UNKNOWN, LLM_ARCH_UNKNOWN,
}; };
@ -158,7 +160,6 @@ enum llm_kv {
LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_KEY_LENGTH_MLA, LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA, LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
LLM_KV_ATTENTION_LAYER_INDICES,
LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS, LLM_KV_ROPE_DIMENSION_SECTIONS,
@ -227,6 +228,8 @@ enum llm_kv {
LLM_KV_CLASSIFIER_OUTPUT_LABELS, LLM_KV_CLASSIFIER_OUTPUT_LABELS,
LLM_KV_SHORTCONV_L_CACHE,
// deprecated: // deprecated:
LLM_KV_TOKENIZER_PREFIX_ID, LLM_KV_TOKENIZER_PREFIX_ID,
LLM_KV_TOKENIZER_SUFFIX_ID, LLM_KV_TOKENIZER_SUFFIX_ID,
@ -396,6 +399,9 @@ enum llm_tensor {
LLM_TENSOR_POS_NET_ATTN_K, LLM_TENSOR_POS_NET_ATTN_K,
LLM_TENSOR_POS_NET_ATTN_V, LLM_TENSOR_POS_NET_ATTN_V,
LLM_TENSOR_POS_NET_ATTN_OUT, LLM_TENSOR_POS_NET_ATTN_OUT,
LLM_TENSOR_SHORTCONV_CONV,
LLM_TENSOR_SHORTCONV_INPROJ,
LLM_TENSOR_SHORTCONV_OUTPROJ,
}; };
enum llm_tensor_layer { enum llm_tensor_layer {

View file

@ -71,6 +71,11 @@ uint32_t llama_hparams::n_embd_r() const {
return token_shift_count * n_embd; return token_shift_count * n_embd;
} }
if (n_shortconv_l_cache != 0) {
// for LFM2 models
return n_embd * (n_shortconv_l_cache - 1);
}
// TODO: maybe support other convolution strides than 1 // TODO: maybe support other convolution strides than 1
// NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
// Corresponds to Mamba's conv_states size // Corresponds to Mamba's conv_states size
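
As a worked example of the sizing above, only the last l_cache - 1 columns of the rolling convolution state are kept per sequence. A hedged sketch with illustrative numbers (n_embd = 2048 matches the 1.2B case later in this diff; l_cache = 3 is an assumed kernel length):

# hedged arithmetic sketch; the values are illustrative
n_embd, l_cache = 2048, 3
n_embd_r = n_embd * (l_cache - 1)  # per-sequence shortconv state elements
print(n_embd_r)  # 4096
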

View file

@ -55,6 +55,8 @@ struct llama_hparams {
struct llama_hparams_posnet posnet; struct llama_hparams_posnet posnet;
struct llama_hparams_convnext convnext; struct llama_hparams_convnext convnext;
uint32_t n_shortconv_l_cache = 0;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr; std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr; std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr; std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

View file

@ -45,17 +45,21 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_190M: return "190M"; case LLM_TYPE_190M: return "190M";
case LLM_TYPE_220M: return "220M"; case LLM_TYPE_220M: return "220M";
case LLM_TYPE_250M: return "250M"; case LLM_TYPE_250M: return "250M";
case LLM_TYPE_256M: return "256M";
case LLM_TYPE_270M: return "270M"; case LLM_TYPE_270M: return "270M";
case LLM_TYPE_335M: return "335M"; case LLM_TYPE_335M: return "335M";
case LLM_TYPE_350M: return "350M";
case LLM_TYPE_410M: return "410M"; case LLM_TYPE_410M: return "410M";
case LLM_TYPE_450M: return "450M"; case LLM_TYPE_450M: return "450M";
case LLM_TYPE_475M: return "475M"; case LLM_TYPE_475M: return "475M";
case LLM_TYPE_700M: return "700M";
case LLM_TYPE_770M: return "770M"; case LLM_TYPE_770M: return "770M";
case LLM_TYPE_780M: return "780M"; case LLM_TYPE_780M: return "780M";
case LLM_TYPE_0_3B: return "0.3B"; case LLM_TYPE_0_3B: return "0.3B";
case LLM_TYPE_0_5B: return "0.5B"; case LLM_TYPE_0_5B: return "0.5B";
case LLM_TYPE_0_6B: return "0.6B"; case LLM_TYPE_0_6B: return "0.6B";
case LLM_TYPE_1B: return "1B"; case LLM_TYPE_1B: return "1B";
case LLM_TYPE_1_2B: return "1.2B";
case LLM_TYPE_1_3B: return "1.3B"; case LLM_TYPE_1_3B: return "1.3B";
case LLM_TYPE_1_4B: return "1.4B"; case LLM_TYPE_1_4B: return "1.4B";
case LLM_TYPE_1_5B: return "1.5B"; case LLM_TYPE_1_5B: return "1.5B";
@ -586,6 +590,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
case 22: type = LLM_TYPE_1B; break; case 22: type = LLM_TYPE_1B; break;
case 26: type = LLM_TYPE_3B; break; case 26: type = LLM_TYPE_3B; break;
case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
case 30: type = LLM_TYPE_256M; break; // smoldocling 256M
// granite uses a vocab with len 49152 // granite uses a vocab with len 49152
case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break; case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
case 36: type = LLM_TYPE_8B; break; // granite case 36: type = LLM_TYPE_8B; break; // granite
@ -1509,6 +1514,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale); ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale); ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
// Granite uses rope_finetuned as a switch for rope, so default to true
bool rope_finetuned = true;
ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
hparams.rope_finetuned = rope_finetuned;
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 32: type = LLM_TYPE_3B; break; case 32: type = LLM_TYPE_3B; break;
case 40: type = LLM_TYPE_3B; break; case 40: type = LLM_TYPE_3B; break;
@ -1516,6 +1526,40 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN; default: type = LLM_TYPE_UNKNOWN;
} }
// For Granite MoE Shared
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
} break;
case LLM_ARCH_GRANITE_HYBRID:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, /* required */ false);
ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, /* required */ false);
ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, /* required */ false);
ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale, /* required */ false);
ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
// Granite uses rope_finetuned as a switch for rope, so default to true
bool rope_finetuned = true;
ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
hparams.rope_finetuned = rope_finetuned;
// A layer is recurrent IFF the n_head_kv value is set to 0
for (uint32_t i = 0; i < hparams.n_layer; ++i) {
hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
}
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
switch (hparams.n_layer) {
// TODO: Add llm type label (not sure this is useful)
default: type = LLM_TYPE_UNKNOWN;
}
// For Granite MoE Shared // For Granite MoE Shared
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false); ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, /* required */ false);
} break; } break;
@ -1627,6 +1671,20 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN; default: type = LLM_TYPE_UNKNOWN;
} }
} break; } break;
case LLM_ARCH_LFM2:
{
ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
for (uint32_t il = 0; il < hparams.n_layer; ++il) {
hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
}
switch (hparams.n_embd) {
case 1024: type = LLM_TYPE_350M; break;
case 1536: type = LLM_TYPE_700M; break;
case 2048: type = LLM_TYPE_1_2B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
default: throw std::runtime_error("unsupported model architecture"); default: throw std::runtime_error("unsupported model architecture");
} }
@ -3458,6 +3516,99 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
} }
} }
} break; } break;
case LLM_ARCH_GRANITE_HYBRID:
{
// mamba2 Mixer SSM params
// NOTE: int64_t for tensor dimensions
const int64_t d_conv = hparams.ssm_d_conv;
const int64_t d_inner = hparams.ssm_d_inner;
const int64_t d_state = hparams.ssm_d_state;
const int64_t n_ssm_head = hparams.ssm_dt_rank;
const int64_t n_group = hparams.ssm_n_group;
const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_ssm_head;
// only an expansion factor of 2 is supported for now
GGML_ASSERT(2 * n_embd == d_inner);
// embeddings
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
// output
{
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
// if output is NULL, init from the input tok embed, duplicated to allow offloading
if (output == NULL) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}
}
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
// norm
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
if (hparams.is_recurrent(i)) {
// ssm layers
layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0);
layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0);
layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, TENSOR_NOT_REQUIRED);
layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_ssm_head}, 0);
// no "weight" suffix for these
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_ssm_head}, 0);
layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_ssm_head}, 0);
layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0);
// out_proj
layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
} else {
// attention layers (with optional bias)
const int64_t n_head_i = hparams.n_head(i);
const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i);
const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i);
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0);
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_k_gqa_i}, TENSOR_NOT_REQUIRED);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_v_gqa_i}, TENSOR_NOT_REQUIRED);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
}
// feed forward (w/ optional biases)
if (n_expert > 0) {
// MoE FFN
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
// For Granite MoE Shared
if (hparams.n_ff_shexp > 0) {
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0);
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0);
}
} else {
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
}
}
} break;
case LLM_ARCH_XVERSE: case LLM_ARCH_XVERSE:
{ {
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@ -4868,6 +5019,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
} }
} break; } break;
case LLM_ARCH_LFM2:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
// ffn is same for transformer and conv layers
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
// for operator_norm
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
if (!hparams.is_recurrent(i)) {
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa);
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, hparams.n_embd_k_gqa(i)}, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, hparams.n_embd_v_gqa(i)}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
} else {
layer.shortconv.conv = create_tensor(tn(LLM_TENSOR_SHORTCONV_CONV, "weight", i), {hparams.n_shortconv_l_cache, n_embd}, 0);
layer.shortconv.in_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_INPROJ, "weight", i), {n_embd, 3 * n_embd}, 0);
layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0);
}
}
} break;
default: default:
throw std::runtime_error("unknown architecture"); throw std::runtime_error("unknown architecture");
} }
@ -5121,7 +5305,8 @@ void llama_model::print_info() const {
if (arch == LLM_ARCH_MAMBA || if (arch == LLM_ARCH_MAMBA ||
arch == LLM_ARCH_MAMBA2 || arch == LLM_ARCH_MAMBA2 ||
arch == LLM_ARCH_JAMBA || arch == LLM_ARCH_JAMBA ||
arch == LLM_ARCH_FALCON_H1) { arch == LLM_ARCH_FALCON_H1 ||
arch == LLM_ARCH_GRANITE_HYBRID) {
LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv); LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner); LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state); LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
@ -5176,7 +5361,8 @@ void llama_model::print_info() const {
if (arch == LLM_ARCH_MINICPM || if (arch == LLM_ARCH_MINICPM ||
arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE ||
arch == LLM_ARCH_GRANITE_MOE) { arch == LLM_ARCH_GRANITE_MOE ||
arch == LLM_ARCH_GRANITE_HYBRID) {
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale); LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale); LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale); LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@ -13895,13 +14081,11 @@ struct llm_build_arwkv7 : public llm_build_rwkv7_base {
} }
}; };
struct llm_build_granite : public llm_graph_context { struct llm_build_granite : public llm_graph_context {
llm_build_granite( llm_build_granite(
const llama_model & model, const llama_model & model,
const llm_graph_params & params, const llm_graph_params & params,
ggml_cgraph * gf, ggml_cgraph * gf)
const bool use_rope = true)
: llm_graph_context(params) { : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_head = hparams.n_embd_head_v;
@ -13916,14 +14100,12 @@ struct llm_build_granite : public llm_graph_context {
// inp_pos - built only if rope enabled // inp_pos - built only if rope enabled
ggml_tensor * inp_pos = nullptr; ggml_tensor * inp_pos = nullptr;
if (use_rope) { if (hparams.rope_finetuned) {
inp_pos = build_inp_pos(); inp_pos = build_inp_pos();
} }
auto * inp_attn = build_attn_inp_kv_unified(); auto * inp_attn = build_attn_inp_kv_unified();
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
ggml_tensor * inp_out_ids = build_inp_out_ids(); ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
@ -13936,7 +14118,51 @@ struct llm_build_granite : public llm_graph_context {
cb(cur, "attn_norm", il); cb(cur, "attn_norm", il);
// self-attention // self-attention
{ cur = build_attention_layer(
gf, cur, inp_pos, inp_attn,
model, n_embd_head, il);
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
// ffn
cur = build_layer_ffn(cur, inpSA, model, il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
// For Granite architectures - scale logits
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}
ggml_tensor * build_attention_layer(
ggml_cgraph * gf,
ggml_tensor * cur,
ggml_tensor * inp_pos,
llm_graph_input_attn_kv_unified * inp_attn,
const llama_model & model,
const int64_t n_embd_head,
const int il) {
// compute Q and K and (optionally) RoPE them // compute Q and K and (optionally) RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il); cb(Qcur, "Qcur", il);
@ -13959,10 +14185,11 @@ struct llm_build_granite : public llm_graph_context {
cb(Vcur, "Vcur", il); cb(Vcur, "Vcur", il);
} }
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
const bool use_rope = hparams.rope_finetuned;
if (use_rope) { if (use_rope) {
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
Qcur = ggml_rope_ext( Qcur = ggml_rope_ext(
@ -13982,19 +14209,24 @@ struct llm_build_granite : public llm_graph_context {
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il); cb(Vcur, "Vcur", il);
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il); cb(cur, "attn_out", il);
return cur;
} }
if (il == n_layer - 1 && inp_out_ids) { ggml_tensor * build_layer_ffn(
cur = ggml_get_rows(ctx0, cur, inp_out_ids); ggml_tensor * cur,
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); ggml_tensor * inpSA,
} const llama_model & model,
const int il) {
// For Granite architectures - scale residual // For Granite architectures - scale residual
if (hparams.f_residual_scale) {
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
}
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il); cb(ffn_inp, "ffn_inp", il);
@ -14052,13 +14284,72 @@ struct llm_build_granite : public llm_graph_context {
} }
// For Granite architectures - scale residual // For Granite architectures - scale residual
if (hparams.f_residual_scale) {
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
}
cur = ggml_add(ctx0, cur, ffn_inp); cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
cur = build_cvec(cur, il); cur = build_cvec(cur, il);
cb(cur, "l_out", il); cb(cur, "l_out", il);
return cur;
}
};
struct llm_build_granite_hybrid : public llm_graph_context_mamba {
llm_build_granite_hybrid(
const llama_model & model,
const llm_graph_params & params,
ggml_cgraph * gf) :
llm_graph_context_mamba(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
auto * inp = build_inp_mem_hybrid();
ggml_tensor * inp_out_ids = build_inp_out_ids();
// Positional embeddings populated if rope enabled
ggml_tensor * inp_pos = nullptr;
if (hparams.rope_finetuned) {
inp_pos = build_inp_pos();
}
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
// norm
cur = build_norm(inpL,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
if (hparams.is_recurrent(il)) {
// ssm layer //
cur = build_mamba2_layer(inp->get_recr(), gf, cur, model, ubatch, il);
} else {
// attention layer //
cur = build_attention_layer(
gf, cur, inp_pos, inp->get_attn(), model,
n_embd_head, il);
}
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
// ffn
cur = build_layer_ffn(cur, inpSA, model, il);
// input for next layer // input for next layer
inpL = cur; inpL = cur;
} }
@ -14076,12 +14367,156 @@ struct llm_build_granite : public llm_graph_context {
cur = build_lora_mm(model.output, cur); cur = build_lora_mm(model.output, cur);
// For Granite architectures - scale logits // For Granite architectures - scale logits
if (hparams.f_logit_scale) {
cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale); cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
}
cb(cur, "result_output", -1); cb(cur, "result_output", -1);
res->t_logits = cur; res->t_logits = cur;
ggml_build_forward_expand(gf, cur); ggml_build_forward_expand(gf, cur);
} }
ggml_tensor * build_attention_layer(
ggml_cgraph * gf,
ggml_tensor * cur,
ggml_tensor * inp_pos,
llm_graph_input_attn_kv_unified * inp_attn,
const llama_model & model,
const int64_t n_embd_head,
const int il) {
// compute Q and K and (optionally) RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
cb(Qcur, "Qcur", il);
}
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
cb(Kcur, "Kcur", il);
}
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
cb(Vcur, "Vcur", il);
}
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, hparams.n_head(il), n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, hparams.n_head_kv(il), n_tokens);
const bool use_rope = hparams.rope_finetuned;
if (use_rope) {
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
}
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il);
return cur;
}
ggml_tensor * build_layer_ffn(
ggml_tensor * cur,
ggml_tensor * inpSA,
const llama_model & model,
const int il) {
// For Granite architectures - scale residual
if (hparams.f_residual_scale) {
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
}
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// feed-forward network (non-MoE)
if (model.layers[il].ffn_gate_inp == nullptr) {
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
// MoE branch
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);
ggml_tensor * moe_out = build_moe_ffn(cur,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
false, 0.0,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
il);
cb(moe_out, "ffn_moe_out", il);
// For Granite MoE Shared
if (hparams.n_ff_shexp > 0) {
ggml_tensor * ffn_shexp = build_ffn(cur,
model.layers[il].ffn_up_shexp, NULL, NULL,
model.layers[il].ffn_gate_shexp, NULL, NULL,
model.layers[il].ffn_down_shexp, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(ffn_shexp, "ffn_shexp", il);
cur = ggml_add(ctx0, moe_out, ffn_shexp);
cb(cur, "ffn_out", il);
} else {
cur = moe_out;
}
}
// For Granite architectures - scale residual
if (hparams.f_residual_scale) {
cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
}
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "ffn_out", il);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
return cur;
}
}; };
// ref: https://github.com/facebookresearch/chameleon // ref: https://github.com/facebookresearch/chameleon
@ -15574,6 +16009,163 @@ struct llm_build_smollm3 : public llm_graph_context {
} }
}; };
struct llm_build_lfm2 : public llm_graph_context {
const llama_model & model;
llm_build_lfm2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) {
ggml_tensor * cur = build_inp_embd(model.tok_embd);
cb(cur, "model.embed_tokens", -1);
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_hybrid = build_inp_mem_hybrid();
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
auto * prev_cur = cur;
cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "model.layers.{}.operator_norm", il);
cur = hparams.is_recurrent(il) ?
build_shortconv_block(gf, cur, inp_hybrid->get_recr(), il) :
build_attn_block(gf, cur, inp_pos, inp_hybrid->get_attn(), il) ;
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
prev_cur = ggml_get_rows(ctx0, prev_cur, inp_out_ids);
}
cur = ggml_add(ctx0, prev_cur, cur);
cur = ggml_add(ctx0, cur, build_feed_forward(cur, il));
}
cur = build_norm(cur, model.tok_norm, NULL, LLM_NORM_RMS, -1);
cb(cur, "model.embedding_norm", -1);
res->t_embd = cur;
// lm_head is tied with embeddings
cur = build_lora_mm(model.tok_embd, cur);
cb(cur, "lm_head", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}
ggml_tensor * build_feed_forward(ggml_tensor * cur,
int il) const {
cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "model.layers.{}.ffn_norm", il);
GGML_ASSERT(!model.layers[il].ffn_up_b);
GGML_ASSERT(!model.layers[il].ffn_gate_b);
GGML_ASSERT(!model.layers[il].ffn_down_b);
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "model.layers.{}.feed_forward.w2", il);
return cur;
}
ggml_tensor * build_attn_block(ggml_cgraph * gf,
ggml_tensor * cur,
ggml_tensor * inp_pos,
llm_graph_input_attn_kv_unified * inp_attn,
int il) const {
GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
auto const n_embd_head = hparams.n_embd_head_v;
auto const n_head_kv = hparams.n_head_kv(il);
auto * q = build_lora_mm(model.layers[il].wq, cur);
cb(q, "model.layers.{}.self_attn.q_proj", il);
auto * k = build_lora_mm(model.layers[il].wk, cur);
cb(k, "model.layers.{}.self_attn.k_proj", il);
auto * v = build_lora_mm(model.layers[il].wv, cur);
cb(v, "model.layers.{}.self_attn.v_proj", il);
q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens);
k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens);
v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens);
// qk norm
q = build_norm(q, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
cb(q, "model.layers.{}.self_attn.q_layernorm", il);
k = build_norm(k, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
cb(k, "model.layers.{}.self_attn.k_layernorm", il);
// RoPE
q = ggml_rope_ext(
ctx0, q, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
k = ggml_rope_ext(
ctx0, k, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL,
q, k, v, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
cb(cur, "model.layers.{}.self_attn.out_proj", il);
return cur;
}
ggml_tensor * build_shortconv_block(ggml_cgraph * gf,
ggml_tensor * cur,
llm_graph_input_rs * inp_recr,
int il) {
const auto * mctx_cur = static_cast<const llama_memory_hybrid_context *>(mctx)->get_recr();
auto * bcx = build_lora_mm(model.layers[il].shortconv.in_proj, cur);
cb(bcx, "model.layers.{}.conv.in_proj", il);
constexpr auto n_chunks = 3;
GGML_ASSERT(bcx->ne[0] % n_chunks == 0);
auto const chunk_size = bcx->ne[0] / n_chunks;
auto * b = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 0 * chunk_size * ggml_element_size(bcx));
auto * c = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 1 * chunk_size * ggml_element_size(bcx));
auto * x = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 2 * chunk_size * ggml_element_size(bcx));
auto * bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x));
        // read the conv state directly; going through the build_rs result makes generation slower
ggml_tensor * conv_state = mctx_cur->get_r_l(il);
const int64_t n_seqs = ubatch.n_seqs;
ggml_tensor * conv = build_rs(inp_recr, gf, conv_state, hparams.n_embd_r(), n_seqs);
conv = ggml_reshape_3d(ctx0, conv_state, hparams.n_shortconv_l_cache - 1, hparams.n_embd, n_seqs);
bx = ggml_concat(ctx0, conv, bx, 0);
GGML_ASSERT(bx->ne[0] > conv->ne[0]);
auto * new_conv = ggml_view_2d(ctx0, bx, conv->ne[0], bx->ne[1], bx->nb[1], (bx->ne[0] - conv->ne[0]) * ggml_element_size(bx));
GGML_ASSERT(ggml_are_same_shape(conv, new_conv));
// write conv state
ggml_build_forward_expand(gf, ggml_cpy(ctx0, new_conv, conv_state));
auto * conv_kernel = model.layers[il].shortconv.conv;
GGML_ASSERT(hparams.n_shortconv_l_cache > 0);
// construct ssm_conv op
ggml_tensor * conv_out = ggml_ssm_conv(ctx0, bx, conv_kernel);
cb(conv_out, "model.layers.{}.conv.conv", il);
auto * y = ggml_mul(ctx0, c, conv_out);
y = build_lora_mm(model.layers[il].shortconv.out_proj, y);
cb(y, "model.layers.{}.conv.out_proj", il);
return y;
}
};
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const { llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
llama_memory_i * res; llama_memory_i * res;
@ -15932,6 +16524,10 @@ llm_graph_result_ptr llama_model::build_graph(
{ {
llm = std::make_unique<llm_build_granite>(*this, params, gf); llm = std::make_unique<llm_build_granite>(*this, params, gf);
} break; } break;
case LLM_ARCH_GRANITE_HYBRID:
{
llm = std::make_unique<llm_build_granite_hybrid>(*this, params, gf);
} break;
case LLM_ARCH_CHAMELEON: case LLM_ARCH_CHAMELEON:
{ {
llm = std::make_unique<llm_build_chameleon>(*this, params, gf); llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
@ -15972,6 +16568,10 @@ llm_graph_result_ptr llama_model::build_graph(
{ {
llm = std::make_unique<llm_build_falcon_h1>(*this, params, gf); llm = std::make_unique<llm_build_falcon_h1>(*this, params, gf);
} break; } break;
case LLM_ARCH_LFM2:
{
llm = std::make_unique<llm_build_lfm2>(*this, params, gf);
} break;
default: default:
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
} }
@ -16121,6 +16721,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GLM4: case LLM_ARCH_GLM4:
case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE: case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_GRANITE_HYBRID:
case LLM_ARCH_CHAMELEON: case LLM_ARCH_CHAMELEON:
case LLM_ARCH_BAILINGMOE: case LLM_ARCH_BAILINGMOE:
case LLM_ARCH_NEO_BERT: case LLM_ARCH_NEO_BERT:
@ -16164,6 +16765,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_MINICPM3: case LLM_ARCH_MINICPM3:
case LLM_ARCH_DOTS1: case LLM_ARCH_DOTS1:
case LLM_ARCH_HUNYUAN_MOE: case LLM_ARCH_HUNYUAN_MOE:
case LLM_ARCH_LFM2:
return LLAMA_ROPE_TYPE_NEOX; return LLAMA_ROPE_TYPE_NEOX;
case LLM_ARCH_QWEN2VL: case LLM_ARCH_QWEN2VL:

View file

@@ -32,17 +32,21 @@ enum llm_type {
LLM_TYPE_190M,
LLM_TYPE_220M,
LLM_TYPE_250M,
LLM_TYPE_256M,
LLM_TYPE_270M,
LLM_TYPE_335M,
LLM_TYPE_350M,
LLM_TYPE_410M,
LLM_TYPE_450M,
LLM_TYPE_475M,
LLM_TYPE_700M,
LLM_TYPE_770M,
LLM_TYPE_780M,
LLM_TYPE_0_3B,
LLM_TYPE_0_5B,
LLM_TYPE_0_6B,
LLM_TYPE_1B,
LLM_TYPE_1_2B,
LLM_TYPE_1_3B,
LLM_TYPE_1_4B,
LLM_TYPE_1_5B,
@@ -154,6 +158,12 @@ struct llama_layer_convnext {
struct ggml_tensor * gamma = nullptr;
};
struct llama_layer_shortconv {
struct ggml_tensor * in_proj = nullptr;
struct ggml_tensor * conv = nullptr;
struct ggml_tensor * out_proj = nullptr;
};
struct llama_layer {
// normalization
struct ggml_tensor * attn_norm = nullptr;
@@ -340,6 +350,8 @@ struct llama_layer {
struct llama_layer_posnet posnet;
struct llama_layer_convnext convnext;
struct llama_layer_shortconv shortconv;
};
struct llama_model {

View file

@@ -847,6 +847,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// do not quantize Mamba's small yet 2D weights
// NOTE: can't use LLM_TN here because the layer number is not known
quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
quantize &= name.find("shortconv.conv.weight") == std::string::npos;
// do not quantize RWKV's small yet 2D weights
quantize &= name.find("time_mix_first.weight") == std::string::npos;

View file

@@ -1760,7 +1760,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "llama-bpe"||
tokenizer_pre == "falcon3" ||
tokenizer_pre == "falcon-h1" ||
tokenizer_pre == "pixtral") {
tokenizer_pre == "pixtral" ||
tokenizer_pre == "midm-2.0" ||
tokenizer_pre == "lfm2") {
pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
ignore_merges = true;
add_bos = true;
@@ -2085,6 +2087,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<EOT>"
|| t.first == "_<EOT>"
|| t.first == "<end▁of▁sentence>" // DeepSeek
|| t.first == "<end_of_utterance>" // smoldocling
) {
special_eot_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2244,6 +2247,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<EOT>"
|| t.first == "_<EOT>"
|| t.first == "<|end_of_text|>"
|| t.first == "<end_of_utterance>" // smoldocling
) {
special_eog_ids.insert(t.second);
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {

View file

@@ -7,6 +7,47 @@
#include <memory>
#include <unordered_map>
// pre-tokenization types
enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
};
struct LLM_KV;
struct llama_model_loader;

View file

@@ -2581,12 +2581,14 @@ struct server_context {
continue;
}
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
const float * embd = nullptr;
if (embd == NULL) {
if (llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE) {
embd = llama_get_embeddings_ith(ctx, i);
} else {
embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
}
if (embd == NULL) {
if (embd == nullptr) {
SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);
res->embedding.push_back(std::vector<float>(n_embd, 0.0f));
@@ -2594,12 +2596,12 @@
}
// normalize only when there is pooling
// TODO: configurable
if (llama_pooling_type(slot.ctx) != LLAMA_POOLING_TYPE_NONE) {
common_embd_normalize(embd, embd_res.data(), n_embd, 2);
res->embedding.push_back(embd_res);
break;
} else {
res->embedding.push_back({ embd, embd + n_embd });
res->embedding.emplace_back(embd, embd + n_embd);
}
}