Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/release.yml
#	README.md
#	ggml/src/ggml-cann/aclnn_ops.cpp
#	ggml/src/ggml-cann/ggml-cann.cpp
#	tools/mtmd/CMakeLists.txt
#	tools/mtmd/clip.cpp
#	tools/mtmd/clip.h
Commit 55cc9acec5 by Concedo, 2025-05-24 12:10:36 +08:00
35 changed files with 95818 additions and 463 deletions


@ -48,3 +48,7 @@ end_of_line = unset
charset = unset
trim_trailing_whitespace = unset
insert_final_newline = unset
[tools/mtmd/miniaudio.h]
trim_trailing_whitespace = unset
insert_final_newline = unset

.github/workflows/winget.yml (new file, 42 lines)

@ -0,0 +1,42 @@
name: Update Winget Package
on:
workflow_dispatch: # allows manual triggering
schedule:
- cron: '28 5 * * *' # Update every day at 5:28 UTC
jobs:
update:
name: Update Winget Package
runs-on: ubuntu-latest
steps:
- name: Install cargo binstall
uses: cargo-bins/cargo-binstall@268643a6b5ea099f5718ee5cd3ff7dc89a5eb49b
- name: Install komac
run: |
cargo binstall komac@2.11.2 -y
- name: Find latest release
id: find_latest_release
uses: actions/github-script@v6
with:
script: |
const { data: releases } = await github.rest.repos.listReleases({
owner: context.repo.owner,
repo: context.repo.repo,
});
console.log("Latest release:", releases[0].tag_name);
return releases[0].tag_name;
- name: Update manifest
env:
VERSION: ${{ steps.find_latest_release.outputs.result }}
run: |
echo "Updating manifest..."
komac update --version ${{ env.VERSION }} \
--urls "https://github.com/ggml-org/llama.cpp/releases/download/${{ env.VERSION }}/llama-${{ env.VERSION }}-bin-win-vulkan-x64.zip" \
--token ${{ secrets.WINGET_GITHUB_TOKEN }} \
--submit \
ggml.llamacpp


@ -40,7 +40,7 @@
using json = nlohmann::ordered_json;
std::initializer_list<enum llama_example> mmproj_examples = {
LLAMA_EXAMPLE_LLAVA,
LLAMA_EXAMPLE_MTMD,
LLAMA_EXAMPLE_SERVER,
};
@ -2234,12 +2234,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
add_opt(common_arg(
{"--image"}, "FILE",
"path to an image file. use with multimodal models. Specify multiple times for batching",
{"--image", "--audio"}, "FILE",
"path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
[](common_params & params, const std::string & value) {
params.image.emplace_back(value);
}
).set_examples({LLAMA_EXAMPLE_LLAVA}));
).set_examples({LLAMA_EXAMPLE_MTMD}));
if (llama_supports_rpc()) {
add_opt(common_arg(
{"--rpc"}, "SERVERS",
@ -2869,7 +2869,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.chat_template = value;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
add_opt(common_arg(
{"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
string_format(


@ -72,7 +72,7 @@ enum llama_example {
LLAMA_EXAMPLE_SERVER,
LLAMA_EXAMPLE_CVECTOR_GENERATOR,
LLAMA_EXAMPLE_EXPORT_LORA,
LLAMA_EXAMPLE_LLAVA,
LLAMA_EXAMPLE_MTMD,
LLAMA_EXAMPLE_LOOKUP,
LLAMA_EXAMPLE_PARALLEL,
LLAMA_EXAMPLE_TTS,


@ -45,7 +45,7 @@ class SentencePieceTokenTypes(IntEnum):
class ModelType(IntEnum):
TEXT = 1
VISION = 2
MMPROJ = 2
AnyModel = TypeVar("AnyModel", bound="type[ModelBase]")
@ -54,7 +54,7 @@ AnyModel = TypeVar("AnyModel", bound="type[ModelBase]")
class ModelBase:
_model_classes: dict[ModelType, dict[str, type[ModelBase]]] = {
ModelType.TEXT: {},
ModelType.VISION: {},
ModelType.MMPROJ: {},
}
dir_model: Path
@ -88,7 +88,7 @@ class ModelBase:
small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None):
if type(self) is ModelBase or \
type(self) is TextModel or \
type(self) is VisionModel:
type(self) is MmprojModel:
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
self.dir_model = dir_model
@ -309,6 +309,7 @@ class ModelBase:
gguf.MODEL_TENSOR.POSNET_NORM1,
gguf.MODEL_TENSOR.POSNET_NORM2,
gguf.MODEL_TENSOR.V_ENC_EMBD_POS,
gguf.MODEL_TENSOR.A_ENC_EMBD_POS,
)
)
or not new_name.endswith(".weight")
@ -438,7 +439,7 @@ class ModelBase:
assert names
def func(modelcls: AnyModel) -> AnyModel:
model_type = ModelType.VISION if modelcls.model_arch == gguf.MODEL_ARCH.CLIP_VISION else ModelType.TEXT
model_type = ModelType.MMPROJ if modelcls.model_arch == gguf.MODEL_ARCH.MMPROJ else ModelType.TEXT
for name in names:
cls._model_classes[model_type][name] = modelcls
return modelcls
@ -1114,60 +1115,87 @@ class TextModel(ModelBase):
self.gguf_writer.add_pooling_type(pooling_type)
class VisionModel(ModelBase):
model_type = ModelType.VISION
model_arch = gguf.MODEL_ARCH.CLIP_VISION
class MmprojModel(ModelBase):
model_type = ModelType.MMPROJ
model_arch = gguf.MODEL_ARCH.MMPROJ
preprocessor_config: dict[str, Any]
global_config: dict[str, Any]
has_vision_encoder: bool = True # by default
has_audio_encoder: bool = False
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if self.model_arch != gguf.MODEL_ARCH.CLIP_VISION:
raise TypeError("VisionModel must be subclassed with model_arch = gguf.MODEL_ARCH.CLIP_VISION")
if self.model_arch != gguf.MODEL_ARCH.MMPROJ:
raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ")
if self.has_vision_encoder and self.has_audio_encoder:
raise NotImplementedError("both vision + audio not supported yet")
# get n_embd of the text model
if "text_config" not in self.hparams:
self.hparams["text_config"] = {}
if "audio_config" not in self.hparams:
self.hparams["audio_config"] = {}
text_config = {**self.hparams, **self.hparams["text_config"]}
self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0))
assert self.n_embd_text > 0, "n_embd not found in hparams"
if "vision_config" not in self.hparams:
raise ValueError("vision_config not found in hparams")
# move vision config to the top level, while preserving the original hparams in global_config
self.global_config = self.hparams
self.hparams = self.hparams["vision_config"]
if "vision_config" in self.hparams:
self.hparams = self.hparams["vision_config"]
elif "audio_config" in self.hparams:
self.hparams = self.hparams["audio_config"]
else:
raise ValueError("vision_config / audio_config not found in hparams")
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"])
self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, self.block_count)
self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
# load preprocessor config
with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
self.preprocessor_config = json.load(f)
def set_type(self):
self.gguf_writer.add_type(gguf.GGUFType.CLIP_VISION)
self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
def set_gguf_parameters(self):
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
self.gguf_writer.add_vision_has_vision_encoder(True)
# vision config
self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"]))
self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
self.gguf_writer.add_vision_block_count(self.block_count)
self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))
if self.has_vision_encoder:
self.gguf_writer.add_clip_has_vision_encoder(True)
self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
# preprocessor config
self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
# vision config
self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"]))
self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
self.gguf_writer.add_vision_block_count(self.block_count)
self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))
# preprocessor config
self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
elif self.has_audio_encoder:
self.gguf_writer.add_clip_has_audio_encoder(True)
self.gguf_writer.add_audio_projection_dim(self.n_embd_text)
# audio config
self.gguf_writer.add_audio_embedding_length(self.find_hparam(["hidden_size"]))
self.gguf_writer.add_audio_feed_forward_length(self.find_hparam(["intermediate_size"]))
self.gguf_writer.add_audio_block_count(self.block_count)
self.gguf_writer.add_audio_head_count(self.find_hparam(["num_attention_heads"]))
else:
raise ValueError("MmprojModel must have either vision or audio encoder")
def write_vocab(self):
raise ValueError("VisionModel does not support vocab writing")
raise ValueError("MmprojModel does not support vocab writing")
@ModelBase.register("GPTNeoXForCausalLM")
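
A detail of the `MmprojModel` constructor above that is easy to miss in diff form: the vision or audio sub-config replaces `self.hparams`, while the untouched full config stays reachable as `self.global_config`. A minimal standalone sketch of that hoisting, using an illustrative config dict rather than a real model:

```python
# Sketch of the hparams hoisting done in MmprojModel.__init__ (illustrative config).
config = {
    "hidden_size": 2048,                                   # text model embedding size
    "vision_config": {"hidden_size": 1024, "num_hidden_layers": 24},
}

global_config = config                                     # keep the full config around
text_config = {**config, **config.get("text_config", {})}
n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0))
assert n_embd_text > 0, "n_embd not found in hparams"

# hoist the encoder sub-config to the top level
if "vision_config" in config:
    hparams = config["vision_config"]
elif "audio_config" in config:
    hparams = config["audio_config"]
else:
    raise ValueError("vision_config / audio_config not found in hparams")

print(n_embd_text, hparams["num_hidden_layers"])           # -> 2048 24
```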
@ -1951,7 +1979,7 @@ class LlamaModel(TextModel):
"LlavaForConditionalGeneration", # pixtral
"Mistral3ForConditionalGeneration", # mistral small 3.1
)
class LlavaVisionModel(VisionModel):
class LlavaVisionModel(MmprojModel):
img_break_tok_id = -1
def __init__(self, *args, **kwargs):
@ -1977,7 +2005,7 @@ class LlavaVisionModel(VisionModel):
super().set_gguf_parameters()
hparams = self.hparams
if hparams["model_type"] == "pixtral":
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.PIXTRAL)
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL)
self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
# hidden_act
@ -2016,7 +2044,7 @@ class LlavaVisionModel(VisionModel):
@ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration")
class SmolVLMModel(VisionModel):
class SmolVLMModel(MmprojModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if self.hparams["model_type"] == "smolvlm_vision":
@ -2028,7 +2056,7 @@ class SmolVLMModel(VisionModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.IDEFICS3)
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.IDEFICS3)
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
self.gguf_writer.add_vision_use_gelu(True)
@ -2094,10 +2122,10 @@ class Llama4Model(LlamaModel):
@ModelBase.register("Llama4ForConditionalGeneration")
class Llama4VisionModel(VisionModel):
class Llama4VisionModel(MmprojModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.LLAMA4)
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LLAMA4)
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams["norm_eps"])
self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / self.hparams["pixel_shuffle_ratio"]))
assert self.hparams["hidden_act"] == "gelu"
@ -2670,7 +2698,7 @@ class Qwen2VLModel(TextModel):
@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
class Qwen2VLVisionModel(VisionModel):
class Qwen2VLVisionModel(MmprojModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.hparams["image_size"] = self.hparams.get("image_size", 560)
@ -2685,9 +2713,9 @@ class Qwen2VLVisionModel(VisionModel):
super().set_gguf_parameters()
hparams = self.hparams
if self.global_config['model_type'] == 'qwen2_vl':
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN2VL)
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2VL)
elif self.global_config['model_type'] == 'qwen2_5_vl':
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN25VL)
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
self.gguf_writer.add_vision_use_silu(True)
# find n_wa_pattern (window attention pattern)
fullatt_block_indexes = hparams.get("fullatt_block_indexes")
@ -2746,11 +2774,11 @@ class Qwen2VLVisionModel(VisionModel):
@ModelBase.register("InternVisionModel")
class InternVisionModel(VisionModel):
class InternVisionModel(MmprojModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.INTERNVL)
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.INTERNVL)
self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
# hidden_act
if hparams["hidden_act"] == "silu":
@ -4008,11 +4036,11 @@ class Gemma3Model(TextModel):
@ModelBase.register("Gemma3ForConditionalGeneration")
class Gemma3VisionModel(VisionModel):
class Gemma3VisionModel(MmprojModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.GEMMA3)
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3)
# default values below are taken from HF tranformers code
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
self.gguf_writer.add_vision_use_gelu(True)
@ -5959,6 +5987,52 @@ class ChameleonModel(TextModel):
return data_torch
@ModelBase.register("UltravoxModel")
class UltravoxModel(TextModel):
model_arch = gguf.MODEL_ARCH.LLAMA # dummy
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
raise NotImplementedError("Ultravox does not have text decoder. Please use --mmproj argument")
@ModelBase.register("UltravoxModel")
class UltravoxAudioModel(MmprojModel):
has_vision_encoder = False # no vision encoder
has_audio_encoder = True
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.hparams["hidden_size"] = self.hparams["d_model"]
self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.ULTRAVOX)
self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"])
self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
def tensor_force_quant(self, name, new_name, bid, n_dims):
del bid, new_name, n_dims # unused
if ".conv" in name and ".weight" in name:
return gguf.GGMLQuantizationType.F16
return False
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
# prevent clash naming with vision tensors
if name.startswith("multi_modal_projector"):
name = "audio." + name
if "conv1.bias" in name or "conv2.bias" in name:
# transpose conv1 and conv2 bias
data_torch = data_torch.unsqueeze(-1)
return [(self.map_tensor_name(name), data_torch)]
###### CONVERSION LOGIC ######
@ -6134,13 +6208,15 @@ def split_str_to_n_bytes(split_str: str) -> int:
def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str:
# TODO @ngxson : this won't work correctly if the model has both audio & vision encoders
# maybe we should fallback to text model's arch in that case, since not many models have both
text_config = hparams.get("text_config", {})
vision_config = hparams.get("vision_config", {})
arch = hparams["architectures"][0]
# if "architectures" is found in the sub-config, use that instead
if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
arch = text_config["architectures"][0]
elif model_type == ModelType.VISION and vision_config.get("architectures") is not None:
elif model_type == ModelType.MMPROJ and vision_config.get("architectures") is not None:
arch = vision_config["architectures"][0]
return arch
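
The lookup above only consults the sub-config that matches the requested model type. A small sketch of the same selection logic, with a hypothetical config whose architecture names are placeholders:

```python
# Sketch of get_model_architecture(): prefer the sub-config's "architectures"
# entry when present, otherwise fall back to the top-level one.
from typing import Any

def get_arch(hparams: dict[str, Any], want_mmproj: bool) -> str:
    text_config = hparams.get("text_config", {})
    vision_config = hparams.get("vision_config", {})
    arch = hparams["architectures"][0]
    if not want_mmproj and text_config.get("architectures") is not None:
        arch = text_config["architectures"][0]
    elif want_mmproj and vision_config.get("architectures") is not None:
        arch = vision_config["architectures"][0]
    return arch

# hypothetical config: top-level wrapper arch plus a text sub-config override
cfg = {"architectures": ["LlavaForConditionalGeneration"],
       "text_config": {"architectures": ["MistralForCausalLM"]}}
print(get_arch(cfg, want_mmproj=False))  # MistralForCausalLM
print(get_arch(cfg, want_mmproj=True))   # LlavaForConditionalGeneration
```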
@ -6203,7 +6279,7 @@ def main() -> None:
with torch.inference_mode():
output_type = ftype_map[args.outtype]
model_type = ModelType.VISION if args.mmproj else ModelType.TEXT
model_type = ModelType.MMPROJ if args.mmproj else ModelType.TEXT
hparams = ModelBase.load_hparams(dir_model)
model_architecture = get_model_architecture(hparams, model_type)
logger.info(f"Model architecture: {model_architecture}")


@ -4,7 +4,9 @@ llama.cpp supports multimodal input via `libmtmd`. Currently, there are 2 tools
- [llama-mtmd-cli](../tools/mtmd/README.md)
- [llama-server](../tools/server/README.md) via OpenAI-compatible `/chat/completions` API
To enable it, can use use one of the 2 methods below:
Currently, we support **image** and **audio** input. Audio is highly experimental and may have reduced quality.
To enable it, you can use one of the 2 methods below:
- Use `-hf` option with a supported model (see a list of pre-quantized model below)
- To load a model using `-hf` while disabling multimodal, use `--no-mmproj`
@ -37,6 +39,8 @@ Replaces the `(tool_name)` with the name of binary you want to use. For example,
NOTE: some models may require large context window, for example: `-c 8192`
**Vision models**:
```sh
# Gemma 3
(tool_name) -hf ggml-org/gemma-3-4b-it-GGUF
@ -78,3 +82,11 @@ NOTE: some models may require large context window, for example: `-c 8192`
# Llama 4 Scout
(tool_name) -hf ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF
```
**Audio models**:
```sh
# Ultravox 0.5
(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF
(tool_name) -hf ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF
```
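
For `llama-server`, the inputs go through the OpenAI-compatible `/chat/completions` endpoint mentioned at the top of this page. A minimal sketch of an image request, assuming a server started with one of the vision models above and listening on the default `http://localhost:8080`; the file name and prompt are placeholders, and the audio payload format is not shown here:

```python
# Minimal sketch: send an image to llama-server's OpenAI-compatible API.
# Assumes a multimodal model is loaded (e.g. via -hf as above); file name is illustrative.
import base64, json, urllib.request

with open("example.jpg", "rb") as f:
    img_b64 = base64.b64encode(f.read()).decode()

payload = {
    "messages": [{
        "role": "user",
        "content": [
            {"type": "text", "text": "What is in this image?"},
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}},
        ],
    }]
}

req = urllib.request.Request(
    "http://localhost:8080/v1/chat/completions",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
print(json.loads(urllib.request.urlopen(req).read())["choices"][0]["message"]["content"])
```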


@ -534,15 +534,15 @@ extern "C" {
GGML_UNARY_OP_STEP,
GGML_UNARY_OP_TANH,
GGML_UNARY_OP_ELU,
GGML_UNARY_OP_RELU,
GGML_UNARY_OP_SIGMOID,
GGML_UNARY_OP_GELU,
GGML_UNARY_OP_GELU_ERF,
GGML_UNARY_OP_GELU_QUICK,
GGML_UNARY_OP_SILU,
GGML_UNARY_OP_HARDSWISH,
GGML_UNARY_OP_HARDSIGMOID,
GGML_UNARY_OP_EXP,
GGML_UNARY_OP_RELU,
GGML_UNARY_OP_GELU_ERF,
GGML_UNARY_OP_COUNT,
};


@ -2820,23 +2820,29 @@ static vk_device ggml_vk_get_device(size_t idx) {
pipeline_robustness = true;
} else if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
device->subgroup_size_control = true;
#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
} else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 &&
!getenv("GGML_VK_DISABLE_COOPMAT")) {
device->coopmat_support = true;
device->coopmat_m = 0;
device->coopmat_n = 0;
device->coopmat_k = 0;
#endif
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
} else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 &&
!getenv("GGML_VK_DISABLE_COOPMAT2")) {
coopmat2_support = true;
#endif
#if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT)
} else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0 &&
!getenv("GGML_VK_DISABLE_INTEGER_DOT_PRODUCT")) {
device->integer_dot_product = true;
#endif
#if defined(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT)
} else if (strcmp("VK_KHR_shader_bfloat16", properties.extensionName) == 0 &&
!getenv("GGML_VK_DISABLE_BFLOAT16")) {
bfloat16_support = true;
#endif
}
}
@ -4694,6 +4700,19 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const
}
}
if (src->type == to) {
// Copy two or four bytes at a time, depending on block size.
// For quantized types, we scale by block size/type size. But
// this path is also used for bf16->bf16 for example, where the
// type size must be exactly 2 or 4.
GGML_ASSERT(ggml_is_quantized(to) || ggml_type_size(src->type) == 2 || ggml_type_size(src->type) == 4);
if ((ggml_type_size(src->type) % 4) == 0) {
return ctx->device->pipeline_contig_cpy_f32_f32;
} else {
return ctx->device->pipeline_contig_cpy_f16_f16;
}
}
std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl;
GGML_ABORT("fatal error");
}
@ -6755,7 +6774,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
case GGML_OP_UNARY:
case GGML_OP_CONV_2D_DW:
{
const uint32_t ne = ggml_nelements(dst);
uint32_t ne = ggml_nelements(dst);
if (op == GGML_OP_CPY && ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
// Convert from number of logical elements to 2- or 4-byte units.
ne /= ggml_blck_size(src0->type);
if ((ggml_type_size(src0->type) % 4) == 0) {
ne *= ggml_type_size(src0->type) / 4;
} else {
ne *= ggml_type_size(src0->type) / 2;
}
}
if (ne > 262144) {
elements = { 512, 512, CEIL_DIV(ne, 262144) };
} else if (ne > 512) {
@ -7305,8 +7333,19 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
uint32_t ne = (uint32_t)ggml_nelements(src0);
if (ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
// Convert from number of logical elements to 2- or 4-byte units.
ne /= ggml_blck_size(src0->type);
if ((ggml_type_size(src0->type) % 4) == 0) {
ne *= ggml_type_size(src0->type) / 4;
} else {
ne *= ggml_type_size(src0->type) / 2;
}
}
ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
(uint32_t)ggml_nelements(src0),
ne,
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
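
The element-count conversion above (the same arithmetic appears in `ggml_vk_op_f32` and `ggml_vk_cpy`) is easy to check by hand: for a same-type quantized copy the shader moves raw bytes in 2- or 4-byte units, so the logical element count is divided by the block size and scaled by the byte size of one block. A small sketch of that arithmetic; the block and type sizes below are the usual ggml values, listed only for illustration:

```python
# Sketch of the 2-/4-byte unit conversion used for same-type quantized copies.
def cpy_work_items(n_elements: int, blck_size: int, type_size: int) -> int:
    units = n_elements // blck_size           # number of blocks
    if type_size % 4 == 0:
        return units * (type_size // 4)       # copied with the 4-byte (f32) shader
    return units * (type_size // 2)           # copied with the 2-byte (f16) shader

# e.g. Q4_0: 32 elements per block, 18 bytes per block -> 2-byte units
print(cpy_work_items(4096, blck_size=32, type_size=18))   # 128 blocks * 9 = 1152
# e.g. Q4_1: 32 elements per block, 20 bytes per block -> 4-byte units
print(cpy_work_items(4096, blck_size=32, type_size=20))   # 128 blocks * 5 = 640
```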
@ -9288,8 +9327,7 @@ static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_buffer(ggml_
try {
ptr = ggml_vk_host_malloc(vk_instance.devices[0], size);
} catch (vk::SystemError& e) {
std::cerr << "ggml_vulkan: Failed to allocate pinned memory." << std::endl;
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
GGML_LOG_WARN("ggml_vulkan: Failed to allocate pinned memory (%s)\n", e.what());
// fallback to cpu buffer
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
}
@ -9891,6 +9929,15 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
return true;
}
// We can handle copying from a type to the same type if it's
// contiguous (memcpy). We use f16 or f32 shaders to do the copy,
// so the type/block size must be a multiple of 4.
if (src0_type == src1_type &&
ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op) &&
(ggml_type_size(src0_type) % 2) == 0) {
return true;
}
return false;
} break;
case GGML_OP_REPEAT:


@ -219,10 +219,13 @@ class Keys:
TYPE = "adapter.type"
LORA_ALPHA = "adapter.lora.alpha"
class ClipVision:
class Clip:
PROJECTOR_TYPE = "clip.projector_type"
HAS_VISION_ENCODER = "clip.has_vision_encoder"
HAS_AUDIO_ENCODER = "clip.has_audio_encoder"
HAS_LLAVA_PROJECTOR = "clip.has_llava_projector"
class ClipVision:
IMAGE_SIZE = "clip.vision.image_size"
PATCH_SIZE = "clip.vision.patch_size"
EMBEDDING_LENGTH = "clip.vision.embedding_length"
@ -243,19 +246,33 @@ class Keys:
class Projector:
SCALE_FACTOR = "clip.vision.projector.scale_factor"
class ClipAudio:
NUM_MEL_BINS = "clip.audio.num_mel_bins"
EMBEDDING_LENGTH = "clip.audio.embedding_length"
FEED_FORWARD_LENGTH = "clip.audio.feed_forward_length"
PROJECTION_DIM = "clip.audio.projection_dim"
BLOCK_COUNT = "clip.audio.block_count"
class Attention:
HEAD_COUNT = "clip.audio.attention.head_count"
LAYERNORM_EPS = "clip.audio.attention.layer_norm_epsilon"
class Projector:
STACK_FACTOR = "clip.audio.projector.stack_factor"
#
# recommended mapping of model tensor names for storage in gguf
#
class GGUFType:
MODEL = "model"
ADAPTER = "adapter"
CLIP_VISION = "clip-vision"
MODEL = "model"
ADAPTER = "adapter"
MMPROJ = "mmproj" # dummy, unused for now
class MODEL_ARCH(IntEnum):
CLIP_VISION = auto() # dummy arch for clip.cpp
MMPROJ = auto() # dummy arch for clip.cpp
LLAMA = auto()
LLAMA4 = auto()
DECI = auto()
@ -514,10 +531,27 @@ class MODEL_TENSOR(IntEnum):
V_RESMPL_QUERY = auto() # minicpmv
V_TOK_EMBD_IMG_BREAK = auto() # pixtral
V_MM_PATCH_MERGER = auto() # mistral small 3.1
# audio (mtmd)
A_ENC_EMBD_POS = auto()
A_ENC_CONV1D = auto()
A_PRE_NORM = auto()
A_POST_NORM = auto()
A_ENC_ATTN_Q = auto()
A_ENC_ATTN_K = auto()
A_ENC_ATTN_V = auto()
A_ENC_INPUT_NORM = auto()
A_ENC_OUTPUT = auto()
A_ENC_OUTPUT_NORM = auto()
A_ENC_FFN_UP = auto()
A_ENC_FFN_GATE = auto()
A_ENC_FFN_DOWN = auto()
A_MMPROJ = auto()
A_MM_NORM_PRE = auto()
A_MM_NORM_MID = auto()
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.CLIP_VISION: "clip", # dummy arch for clip.cpp
MODEL_ARCH.MMPROJ: "clip", # dummy arch for clip.cpp
MODEL_ARCH.LLAMA: "llama",
MODEL_ARCH.LLAMA4: "llama4",
MODEL_ARCH.DECI: "deci",
@ -776,10 +810,27 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query",
MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: "v.token_embd.img_break", # pixtral
MODEL_TENSOR.V_MM_PATCH_MERGER: "mm.patch_merger", # mistral small 3.1
# audio (mtmd)
MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
MODEL_TENSOR.A_PRE_NORM: "a.pre_ln",
MODEL_TENSOR.A_POST_NORM: "a.post_ln",
MODEL_TENSOR.A_ENC_ATTN_Q: "a.blk.{bid}.attn_q",
MODEL_TENSOR.A_ENC_ATTN_K: "a.blk.{bid}.attn_k",
MODEL_TENSOR.A_ENC_ATTN_V: "a.blk.{bid}.attn_v",
MODEL_TENSOR.A_ENC_INPUT_NORM: "a.blk.{bid}.ln1",
MODEL_TENSOR.A_ENC_OUTPUT: "a.blk.{bid}.attn_out",
MODEL_TENSOR.A_ENC_OUTPUT_NORM: "a.blk.{bid}.ln2",
MODEL_TENSOR.A_ENC_FFN_UP: "a.blk.{bid}.ffn_up",
MODEL_TENSOR.A_ENC_FFN_GATE: "a.blk.{bid}.ffn_gate",
MODEL_TENSOR.A_ENC_FFN_DOWN: "a.blk.{bid}.ffn_down",
MODEL_TENSOR.A_MMPROJ: "mm.a.mlp.{bid}",
MODEL_TENSOR.A_MM_NORM_PRE: "mm.a.norm_pre",
MODEL_TENSOR.A_MM_NORM_MID: "mm.a.norm_mid",
}
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_ARCH.CLIP_VISION: [
MODEL_ARCH.MMPROJ: [
MODEL_TENSOR.V_MMPROJ,
MODEL_TENSOR.V_MMPROJ_FC,
MODEL_TENSOR.V_MMPROJ_MLP,
@ -819,6 +870,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.V_RESMPL_QUERY,
MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK,
MODEL_TENSOR.V_MM_PATCH_MERGER,
# audio
MODEL_TENSOR.A_ENC_EMBD_POS,
MODEL_TENSOR.A_ENC_CONV1D,
MODEL_TENSOR.A_PRE_NORM,
MODEL_TENSOR.A_POST_NORM,
MODEL_TENSOR.A_ENC_ATTN_Q,
MODEL_TENSOR.A_ENC_ATTN_K,
MODEL_TENSOR.A_ENC_ATTN_V,
MODEL_TENSOR.A_ENC_INPUT_NORM,
MODEL_TENSOR.A_ENC_OUTPUT,
MODEL_TENSOR.A_ENC_OUTPUT_NORM,
MODEL_TENSOR.A_ENC_FFN_UP,
MODEL_TENSOR.A_ENC_FFN_GATE,
MODEL_TENSOR.A_ENC_FFN_DOWN,
MODEL_TENSOR.A_MMPROJ,
MODEL_TENSOR.A_MM_NORM_PRE,
MODEL_TENSOR.A_MM_NORM_MID,
],
MODEL_ARCH.LLAMA: [
MODEL_TENSOR.TOKEN_EMBD,
@ -2186,6 +2254,7 @@ class VisionProjectorType:
LLAMA4 = "llama4"
QWEN2VL = "qwen2vl_merger"
QWEN25VL = "qwen2.5vl_merger"
ULTRAVOX = "ultravox"
INTERNVL = "internvl"


@ -936,12 +936,18 @@ class GGUFWriter:
# for vision models
def add_clip_has_vision_encoder(self, value: bool) -> None:
self.add_bool(Keys.Clip.HAS_VISION_ENCODER, value)
def add_clip_has_audio_encoder(self, value: bool) -> None:
self.add_bool(Keys.Clip.HAS_AUDIO_ENCODER, value)
def add_clip_projector_type(self, value: str) -> None:
self.add_string(Keys.Clip.PROJECTOR_TYPE, value)
def add_vision_projection_dim(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.PROJECTION_DIM, value)
def add_vision_has_vision_encoder(self, value: bool) -> None:
self.add_bool(Keys.ClipVision.HAS_VISION_ENCODER, value)
def add_vision_patch_size(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.PATCH_SIZE, value)
@ -957,9 +963,6 @@ class GGUFWriter:
def add_vision_head_count(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.Attention.HEAD_COUNT, value)
def add_vision_projector_type(self, value: str) -> None:
self.add_string(Keys.ClipVision.PROJECTOR_TYPE, value)
def add_vision_attention_layernorm_eps(self, value: float) -> None:
self.add_float32(Keys.ClipVision.Attention.LAYERNORM_EPS, value)
@ -987,6 +990,32 @@ class GGUFWriter:
def add_vision_n_wa_pattern(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.N_WA_PATTERN, value)
# audio models
def add_audio_projection_dim(self, value: int) -> None:
self.add_uint32(Keys.ClipAudio.PROJECTION_DIM, value)
def add_audio_embedding_length(self, value: int) -> None:
self.add_uint32(Keys.ClipAudio.EMBEDDING_LENGTH, value)
def add_audio_feed_forward_length(self, value: int) -> None:
self.add_uint32(Keys.ClipAudio.FEED_FORWARD_LENGTH, value)
def add_audio_block_count(self, value: int) -> None:
self.add_uint32(Keys.ClipAudio.BLOCK_COUNT, value)
def add_audio_head_count(self, value: int) -> None:
self.add_uint32(Keys.ClipAudio.Attention.HEAD_COUNT, value)
def add_audio_attention_layernorm_eps(self, value: float) -> None:
self.add_float32(Keys.ClipAudio.Attention.LAYERNORM_EPS, value)
def add_audio_num_mel_bins(self, value: int) -> None:
self.add_uint32(Keys.ClipAudio.NUM_MEL_BINS, value)
def add_audio_stack_factor(self, value: int) -> None:
self.add_uint32(Keys.ClipAudio.Projector.STACK_FACTOR, value)
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
pack_prefix = ''
if not skip_pack_prefix:
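
The new audio setters mirror the existing vision ones and write under the `clip.audio.*` key namespace. A minimal usage sketch, not a full conversion; the hyperparameter values are placeholders loosely based on the Whisper encoder used by Ultravox:

```python
# Minimal sketch: write the new clip.audio.* keys with GGUFWriter (placeholder values).
import gguf

writer = gguf.GGUFWriter("mmproj-audio-sketch.gguf", arch="clip")
writer.add_type(gguf.GGUFType.MMPROJ)
writer.add_clip_has_audio_encoder(True)
writer.add_clip_projector_type(gguf.VisionProjectorType.ULTRAVOX)
writer.add_audio_embedding_length(1280)
writer.add_audio_feed_forward_length(5120)
writer.add_audio_block_count(32)
writer.add_audio_head_count(20)
writer.add_audio_num_mel_bins(128)
writer.add_audio_stack_factor(8)

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()
```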


@ -1110,6 +1110,68 @@ class TensorNameMap:
MODEL_TENSOR.V_MM_PATCH_MERGER: (
"multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1
),
# audio (mtmd)
MODEL_TENSOR.A_ENC_EMBD_POS: (
"audio_tower.embed_positions", # ultravox
),
MODEL_TENSOR.A_ENC_CONV1D: (
"audio_tower.conv{bid}", # ultravox
),
MODEL_TENSOR.A_PRE_NORM: (),
MODEL_TENSOR.A_POST_NORM: (
"audio_tower.layer_norm", # ultravox
),
MODEL_TENSOR.A_ENC_ATTN_Q: (
"audio_tower.layers.{bid}.self_attn.q_proj", # ultravox
),
MODEL_TENSOR.A_ENC_ATTN_K: (
"audio_tower.layers.{bid}.self_attn.k_proj", # ultravox
),
MODEL_TENSOR.A_ENC_ATTN_V: (
"audio_tower.layers.{bid}.self_attn.v_proj", # ultravox
),
MODEL_TENSOR.A_ENC_INPUT_NORM: (
"audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox
),
MODEL_TENSOR.A_ENC_OUTPUT: (
"audio_tower.layers.{bid}.self_attn.out_proj", # ultravox
),
MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
"audio_tower.layers.{bid}.final_layer_norm", # ultravox
),
MODEL_TENSOR.A_ENC_FFN_UP: (
"audio_tower.layers.{bid}.fc1", # ultravox
),
MODEL_TENSOR.A_ENC_FFN_GATE: (),
MODEL_TENSOR.A_ENC_FFN_DOWN: (
"audio_tower.layers.{bid}.fc2", # ultravox
),
MODEL_TENSOR.A_MMPROJ: (
"audio.multi_modal_projector.linear_{bid}", # ultravox
),
MODEL_TENSOR.A_MM_NORM_PRE: (
"audio.multi_modal_projector.ln_pre", # ultravox
),
MODEL_TENSOR.A_MM_NORM_MID: (
"audio.multi_modal_projector.ln_mid", # ultravox
),
}
# architecture-specific block mappings


@ -551,6 +551,7 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)
llama_model * ctsmodel = llama_model_load_from_file(modelfile_cts.c_str(), tts_model_params);
tts_ctx_params.embeddings = true; //this requires embeddings instead
tts_ctx_params.n_ubatch = tts_ctx_params.n_batch;
cts_ctx = llama_init_from_model(ctsmodel, tts_ctx_params);
if (cts_ctx == nullptr) {
@ -948,7 +949,7 @@ tts_generation_outputs ttstype_generate(const tts_generation_inputs inputs)
kcpp_embd_batch codebatch = kcpp_embd_batch(codes,0,false,true);
printf("\nRunning Vocoder (%d AudioTokens)", codes.size());
if (llama_decode(cts_ctx, codebatch.batch) != 0) {
if (llama_encode(cts_ctx, codebatch.batch) != 0) {
printf("\nError: TTS vocoder generation failed!\n");
output.data = "";
output.status = 0;


@ -2,6 +2,22 @@
#include "ggml.h"
void llama_hparams::set_swa_pattern(uint32_t n_pattern) {
for (uint32_t il = 0; il < n_layer; ++il) {
swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
}
}
bool llama_hparams::is_swa_any() const {
for (uint32_t il = 0; il < n_layer; ++il) {
if (swa_layers[il]) {
return true;
}
}
return false;
}
uint32_t llama_hparams::n_head(uint32_t il) const {
if (il < n_layer) {
return n_head_arr[il];
@ -72,7 +88,7 @@ uint32_t llama_hparams::n_embd_v_s() const {
bool llama_hparams::is_swa(uint32_t il) const {
if (il < n_layer) {
return n_swa_pattern == 0 || (il % n_swa_pattern < (n_swa_pattern - 1));
return swa_layers[il];
}
GGML_ABORT("fatal error");


@ -102,20 +102,12 @@ struct llama_hparams {
// Sliding Window Attention (SWA)
llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
uint32_t n_swa = 0; // the size of the sliding window (0 - no SWA)
uint32_t n_swa_pattern = 1; // this value n means that every nth layer is dense (i.e. non-SWA)
// by default n == 1, all layers are dense
// note that if n_swa_pattern == 0, all layers are SWA
// example: n_swa_pattern = 3
// il == 0: swa
// il == 1: swa
// il == 2: dense
// il == 3: swa
// il == 4: swa
// il == 5: dense
// il == 6: swa
// etc ...
// the size of the sliding window (0 - no SWA)
uint32_t n_swa = 0;
// if swa_layers[il] == true, then layer il is SWA
// if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
// by default, all layers are dense
std::array<bool, LLAMA_MAX_LAYERS> swa_layers;
// for State Space Models
uint32_t ssm_d_conv = 0;
@ -153,6 +145,23 @@ struct llama_hparams {
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
// this value n_pattern means that every nth layer is dense (i.e. non-SWA)
// note that if n_pattern == 0, all layers are SWA
// if n_pattern == 1, all layers are dense
// example: n_pattern = 3
// il == 0: swa
// il == 1: swa
// il == 2: dense
// il == 3: swa
// il == 4: swa
// il == 5: dense
// il == 6: swa
// etc ...
void set_swa_pattern(uint32_t n_pattern);
// return true if one of the layers is SWA
bool is_swa_any() const;
uint32_t n_head(uint32_t il = 0) const;
uint32_t n_head_kv(uint32_t il = 0) const;
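
The per-layer `swa_layers` array replaces the old `n_swa_pattern` modulo check but keeps the same semantics; `set_swa_pattern(n)` just materializes the pattern once at load time. A small sketch reproducing it for the documented example values (the layer counts are arbitrary):

```python
# Sketch of set_swa_pattern(): True marks an SWA layer, False a dense layer.
def swa_layers(n_layer: int, n_pattern: int) -> list[bool]:
    return [n_pattern == 0 or (il % n_pattern < (n_pattern - 1)) for il in range(n_layer)]

print(swa_layers(7, 3))  # [True, True, False, True, True, False, True]  (every 3rd layer dense)
print(swa_layers(4, 1))  # [False, False, False, False]                  (all dense)
print(swa_layers(4, 0))  # [True, True, True, True]                      (all SWA)
```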


@ -468,11 +468,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
GGML_ASSERT(hparams.n_expert_used == 0);
}
// zero-out the array hparams
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
@ -579,7 +582,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
hparams.n_swa = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
hparams.n_swa_pattern = 4; // pattern: 3 chunked - 1 full
hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
switch (hparams.n_expert) {
case 16: type = LLM_TYPE_17B_16E; break;
@ -868,7 +871,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
hparams.n_swa = 0;
hparams.n_swa_pattern = 1;
hparams.set_swa_pattern(1);
}
} break;
case LLM_ARCH_PHIMOE:
@ -940,7 +943,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
{
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.n_swa = 4096; // default value of gemma 2
hparams.n_swa_pattern = 2;
hparams.set_swa_pattern(2);
hparams.attn_soft_cap = true;
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
@ -958,7 +961,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
case LLM_ARCH_GEMMA3:
{
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.n_swa_pattern = 6;
hparams.set_swa_pattern(6);
hparams.rope_freq_base_train_swa = 10000.0f;
hparams.rope_freq_scale_train_swa = 1.0f;
@ -1043,7 +1046,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
case LLM_ARCH_COHERE2:
{
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.n_swa_pattern = 4;
hparams.set_swa_pattern(4);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
@ -4417,7 +4420,7 @@ void llama_model::print_info() const {
LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
LLAMA_LOG_INFO("%s: n_swa_pattern = %u\n", __func__, hparams.n_swa_pattern);
LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
@ -13289,6 +13292,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
case LLM_ARCH_JINA_BERT_V2:
case LLM_ARCH_NOMIC_BERT:
case LLM_ARCH_NOMIC_BERT_MOE:
case LLM_ARCH_WAVTOKENIZER_DEC:
{
res = nullptr;
} break;
@ -13315,7 +13319,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
GGML_ASSERT(hparams.n_swa_pattern != 1);
GGML_ASSERT(hparams.is_swa_any());
res = new llama_kv_cache_unified_iswa(
*this,
@ -13329,7 +13333,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
cparams.n_batch,
padding);
} else {
GGML_ASSERT(hparams.n_swa_pattern == 1);
GGML_ASSERT(!hparams.is_swa_any());
res = new llama_kv_cache_unified(
*this,


@ -16,22 +16,26 @@
#define KEY_FTYPE "general.file_type"
#define KEY_NAME "general.name"
#define KEY_DESCRIPTION "general.description"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_PROJ_TYPE "clip.projector_type"
#define KEY_HAS_AUDIO_ENC "clip.has_audio_encoder"
#define KEY_HAS_VISION_ENC "clip.has_vision_encoder"
#define KEY_USE_GELU "clip.use_gelu"
#define KEY_USE_SILU "clip.use_silu"
#define KEY_N_EMBD "clip.vision.embedding_length"
#define KEY_N_FF "clip.vision.feed_forward_length"
#define KEY_N_BLOCK "clip.vision.block_count"
#define KEY_N_HEAD "clip.vision.attention.head_count"
#define KEY_LAYER_NORM_EPS "clip.vision.attention.layer_norm_epsilon"
#define KEY_PROJ_DIM "clip.vision.projection_dim"
#define KEY_N_EMBD "clip.%s.embedding_length"
#define KEY_N_FF "clip.%s.feed_forward_length"
#define KEY_N_BLOCK "clip.%s.block_count"
#define KEY_PROJ_DIM "clip.%s.projection_dim"
#define KEY_N_HEAD "clip.%s.attention.head_count"
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
// vision-specific
#define KEY_IMAGE_SIZE "clip.vision.image_size"
#define KEY_PATCH_SIZE "clip.vision.patch_size"
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
#define KEY_IMAGE_STD "clip.vision.image_std"
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor"
#define KEY_PROJ_TYPE "clip.projector_type"
#define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size"
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
@ -39,13 +43,18 @@
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
// audio-specific
#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
#define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor"
//
// tensor name constants
//
#define TN_POS_EMBD "v.position_embd.weight"
#define TN_POS_EMBD "%s.position_embd.weight"
#define TN_CLASS_EMBD "v.class_embd"
#define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat
#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
@ -95,6 +104,12 @@
#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
// ultravox
#define TN_CONV1D "a.conv1d.%d.%s"
#define TN_MM_AUDIO_MLP "mm.a.mlp.%d.%s"
#define TN_MM_NORM_PRE "mm.a.norm_pre.%s"
#define TN_MM_NORM_MID "mm.a.norm_mid.%s"
// align x to upper multiple of n
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
@ -110,6 +125,7 @@ enum projector_type {
PROJECTOR_TYPE_IDEFICS3,
PROJECTOR_TYPE_PIXTRAL,
PROJECTOR_TYPE_QWEN25VL,
PROJECTOR_TYPE_ULTRAVOX,
PROJECTOR_TYPE_INTERNVL,
PROJECTOR_TYPE_LLAMA4,
PROJECTOR_TYPE_UNKNOWN,
@ -126,6 +142,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
{ PROJECTOR_TYPE_ULTRAVOX, "ultravox"},
{ PROJECTOR_TYPE_INTERNVL, "internvl"},
{ PROJECTOR_TYPE_LLAMA4, "llama4"},
};
@ -147,8 +164,10 @@ struct clip_image_u8 {
std::vector<uint8_t> buf;
};
// RGB float32 image (NHWC)
// Memory layout: RGBRGBRGB...
// For images, buf.size() == nx*ny*3
// Memory layout: RGBRGBRGB...
// For audio, only one channel is used, buf.size() == nx*ny
// nx will be n_frames and ny will be n_mel
struct clip_image_f32 {
int nx;
int ny;
@ -242,6 +261,7 @@ struct clip_image_u8_batch {
struct clip_image_f32_batch {
std::vector<clip_image_f32_ptr> entries;
bool is_audio = false;
// for llava-uhd style models, we need to know the grid size
// note: entries.size() == grid_x * grid_y + 1 (one overview image)
@ -249,7 +269,12 @@ struct clip_image_f32_batch {
int grid_y = 0;
clip_image_f32_batch clone() const {
clip_image_f32_batch new_batch;
clip_image_f32_batch new_batch{
/* entries */ {},
/* is_audio */ is_audio,
/* grid_x */ grid_x,
/* grid_y */ grid_y,
};
new_batch.entries.reserve(entries.size());
for (const auto & entry : entries) {
new_batch.entries.emplace_back(new clip_image_f32(*entry));


@ -50,6 +50,7 @@ struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callbac
enum ffn_op_type {
FFN_GELU,
FFN_GELU_ERF,
FFN_SILU,
FFN_GELU_QUICK,
};
@ -180,6 +181,9 @@ enum patch_merge_type {
};
struct clip_hparams {
bool has_vision = false;
bool has_audio = false;
int32_t image_size;
int32_t patch_size;
int32_t n_embd;
@ -206,6 +210,10 @@ struct clip_hparams {
int32_t attn_window_size = 0;
int32_t n_wa_pattern = 0;
int32_t spatial_merge_size = 0;
// audio
int32_t n_mel_bins = 0; // whisper preprocessor
int32_t proj_stack_factor = 0; // ultravox
};
struct clip_layer {
@ -347,6 +355,14 @@ struct clip_vision_model {
// pixtral
ggml_tensor * token_embd_img_break = nullptr;
ggml_tensor * mm_patch_merger_w = nullptr;
// ultravox / whisper encoder
ggml_tensor * conv1d_1_w = nullptr;
ggml_tensor * conv1d_1_b = nullptr;
ggml_tensor * conv1d_2_w = nullptr;
ggml_tensor * conv1d_2_b = nullptr;
ggml_tensor * mm_norm_pre_w = nullptr;
ggml_tensor * mm_norm_mid_w = nullptr;
};
bool enable_gpu_clip = true;
@ -1430,6 +1446,104 @@ struct clip_graph {
return gf;
}
// whisper encoder with custom projector
ggml_cgraph * build_whisper_enc() {
const int n_frames = img.nx;
const int n_pos = n_frames / 2;
GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
ggml_tensor * inp = build_inp_raw(1);
// conv1d block
{
// convolution + gelu
ggml_tensor * cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, inp, 1, 1);
cur = ggml_add(ctx0, cur, model.conv1d_1_b);
cur = ggml_gelu_erf(ctx0, cur);
cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1);
cur = ggml_add(ctx0, cur, model.conv1d_2_b);
cur = ggml_gelu_erf(ctx0, cur);
// transpose
inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
cb(inp, "after_conv1d", -1);
}
// sanity check (only check one layer, but it should be the same for all)
GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b);
GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b);
GGML_ASSERT(model.layers[0].q_b);
GGML_ASSERT(model.layers[0].v_b);
GGML_ASSERT(!model.layers[0].k_b); // no bias for k
GGML_ASSERT(model.post_ln_w && model.post_ln_b);
ggml_tensor * pos_embd_selected = ggml_view_2d(
ctx0, model.position_embeddings,
model.position_embeddings->ne[0], n_pos,
model.position_embeddings->nb[1], 0
);
ggml_tensor * cur = build_vit(
inp, n_pos,
NORM_TYPE_NORMAL,
hparams.ffn_op,
pos_embd_selected,
nullptr);
cb(cur, "after_transformer", -1);
// StackAudioFrames
// https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
{
int64_t stride = n_embd * hparams.proj_stack_factor;
int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
int64_t pad = padded_len - ggml_nelements(cur);
if (pad > 0) {
cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
}
cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
ggml_row_size(cur->type, stride), 0);
}
cb(cur, "after_stacked", -1);
// UltravoxProjector
{
// pre-norm
cur = ggml_rms_norm(ctx0, cur, 1e-6);
cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
// ffn in
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
// swiglu
{
int64_t split_point = cur->ne[0] / 2;
ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
// see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
x1 = ggml_silu(ctx0, x1);
cur = ggml_mul(ctx0, x0, x1);
}
// mid-norm
cur = ggml_rms_norm(ctx0, cur, 1e-6);
cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);
// ffn out
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
}
cb(cur, "projected", -1);
ggml_build_forward_expand(gf, cur);
return gf;
}
private:
//
// utility functions
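
Two parts of `build_whisper_enc()` are easier to follow outside the ggml graph API: the StackAudioFrames step, which pads the flattened encoder output so it can be regrouped into rows of `n_embd * stack_factor`, and the projector's SwiGLU, where the second half of the split goes through SiLU. A NumPy sketch of both; the shapes and stack factor are illustrative:

```python
# Sketch of StackAudioFrames + the Ultravox projector's SwiGLU split (illustrative shapes).
import numpy as np

def stack_audio_frames(x: np.ndarray, stack_factor: int) -> np.ndarray:
    n_tokens, n_embd = x.shape
    stride = n_embd * stack_factor
    flat = x.reshape(-1)
    padded_len = -(-flat.size // stride) * stride          # round up to a multiple of stride
    flat = np.pad(flat, (0, padded_len - flat.size))
    return flat.reshape(-1, stride)                        # ceil(n_tokens / stack_factor) rows

def swiglu(x: np.ndarray) -> np.ndarray:
    x0, x1 = np.split(x, 2, axis=-1)
    return x0 * (x1 / (1.0 + np.exp(-x1)))                 # SiLU applied to the *second* half

frames = stack_audio_frames(np.random.rand(50, 1280).astype(np.float32), stack_factor=8)
print(frames.shape)                                        # (7, 10240): 50 tokens padded to 56, grouped 8 at a time
print(swiglu(np.random.rand(7, 4096).astype(np.float32)).shape)  # (7, 2048)
```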
@ -1584,8 +1698,8 @@ private:
return inp;
}
ggml_tensor * build_inp_raw() {
ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, 3);
ggml_tensor * build_inp_raw(int channels = 3) {
ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels);
ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw);
return inp_raw;
@ -1663,6 +1777,11 @@ private:
cur = ggml_gelu(ctx0, cur);
cb(cur, "ffn_gelu", il);
} break;
case FFN_GELU_ERF:
{
cur = ggml_gelu_erf(ctx0, cur);
cb(cur, "ggml_gelu_erf", il);
} break;
case FFN_GELU_QUICK:
{
cur = ggml_gelu_quick(ctx0, cur);
@ -1854,6 +1973,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
res = graph.build_llama4();
} break;
case PROJECTOR_TYPE_ULTRAVOX:
{
res = graph.build_whisper_enc();
} break;
default:
{
res = graph.build_llava();
@ -1952,18 +2075,30 @@ struct clip_model_loader {
// other hparams
{
get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false); // legacy
get_bool(KEY_HAS_AUDIO_ENC, hparams.has_audio, false);
get_bool(KEY_HAS_VISION_ENC, hparams.has_vision, false);
get_u32(KEY_N_EMBD, hparams.n_embd);
get_u32(KEY_N_HEAD, hparams.n_head);
get_u32(KEY_N_FF, hparams.n_ff);
get_u32(KEY_N_BLOCK, hparams.n_layer);
get_u32(KEY_PROJ_DIM, hparams.projection_dim);
get_f32(KEY_LAYER_NORM_EPS, hparams.eps);
get_u32(KEY_IMAGE_SIZE, hparams.image_size);
get_u32(KEY_PATCH_SIZE, hparams.patch_size);
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false);
const char * prefix = hparams.has_vision ? "vision" : "audio";
get_u32(string_format(KEY_N_EMBD, prefix), hparams.n_embd);
get_u32(string_format(KEY_N_HEAD, prefix), hparams.n_head);
get_u32(string_format(KEY_N_FF, prefix), hparams.n_ff);
get_u32(string_format(KEY_N_BLOCK, prefix), hparams.n_layer);
get_u32(string_format(KEY_PROJ_DIM, prefix), hparams.projection_dim);
get_f32(string_format(KEY_LAYER_NORM_EPS, prefix), hparams.eps);
if (hparams.has_vision) {
get_u32(KEY_IMAGE_SIZE, hparams.image_size);
get_u32(KEY_PATCH_SIZE, hparams.patch_size);
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false);
get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false); // legacy
} else if (hparams.has_audio) {
get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins);
} else {
throw std::runtime_error(string_format("%s: neither vision nor audio encoder is present\n", __func__));
}
// default warmup value
hparams.warmup_image_size = hparams.image_size;
@ -2007,7 +2142,7 @@ struct clip_model_loader {
}
}
{
if (hparams.has_vision) {
int idx_mean = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_MEAN);
int idx_std = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_STD);
GGML_ASSERT(idx_mean >= 0 && "image_mean not found");
@ -2098,30 +2233,43 @@ struct clip_model_loader {
isize, isize*3, // 336, 1008
};
} break;
case PROJECTOR_TYPE_ULTRAVOX:
{
get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor);
if (hparams.n_mel_bins != 128) {
throw std::runtime_error(string_format("%s: only 128 mel bins are supported for ultravox\n", __func__));
}
hparams.ffn_op = FFN_GELU_ERF;
log_ffn_op = "gelu_erf"; // temporary solution for logging
} break;
default:
break;
}
LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
LOG_INF("%s: has_vision_encoder: %d\n", __func__, hparams.has_vision);
LOG_INF("%s: has_audio_encoder: %d\n", __func__, hparams.has_audio);
LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd);
LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head);
LOG_INF("%s: n_ff: %d\n", __func__, hparams.n_ff);
LOG_INF("%s: n_layer: %d\n", __func__, hparams.n_layer);
LOG_INF("%s: projection_dim: %d\n", __func__, hparams.projection_dim);
LOG_INF("%s: image_size: %d\n", __func__, hparams.image_size);
LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size);
LOG_INF("\n");
LOG_INF("%s: has_llava_proj: %d\n", __func__, ctx_clip.has_llava_projector);
LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version);
LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
LOG_INF("%s: ffn_op: %s\n", __func__, log_ffn_op.c_str());
LOG_INF("%s: projection_dim: %d\n", __func__, hparams.projection_dim);
LOG_INF("\n");
if (hparams.has_vision) {
LOG_INF("%s: image_size: %d\n", __func__, hparams.image_size);
LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size);
LOG_INF("%s: has_llava_proj: %d\n", __func__, ctx_clip.has_llava_projector);
LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version);
LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
} else if (hparams.has_audio) {
LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins);
LOG_INF("%s: proj_stack_factor: %d\n", __func__, hparams.proj_stack_factor);
}
LOG_INF("\n");
LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0);
if (ctx_clip.proj_type == PROJECTOR_TYPE_LLAMA4) {
LOG_WRN("%s: llama 4 vision is known to have degraded quality: https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
}
}
}
@ -2130,6 +2278,9 @@ struct clip_model_loader {
std::map<std::string, size_t> tensor_offset;
std::vector<ggml_tensor *> tensors_to_load;
// TODO @ngxson : support both audio and video in the future
const char * prefix = hparams.has_audio ? "a" : "v";
// get offsets
for (int64_t i = 0; i < gguf_get_n_tensors(ctx_gguf.get()); ++i) {
const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
@ -2167,47 +2318,47 @@ struct clip_model_loader {
vision_model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
vision_model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, "v", "weight"), false);
vision_model.pre_ln_b = get_tensor(string_format(TN_LN_PRE, "v", "bias"), false);
vision_model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, prefix, "weight"), false);
vision_model.pre_ln_b = get_tensor(string_format(TN_LN_PRE, prefix, "bias"), false);
vision_model.post_ln_w = get_tensor(string_format(TN_LN_POST, "v", "weight"), false);
vision_model.post_ln_b = get_tensor(string_format(TN_LN_POST, "v", "bias"), false);
vision_model.post_ln_w = get_tensor(string_format(TN_LN_POST, prefix, "weight"), false);
vision_model.post_ln_b = get_tensor(string_format(TN_LN_POST, prefix, "bias"), false);
vision_model.patch_bias = get_tensor(TN_PATCH_BIAS, false);
vision_model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false);
vision_model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);
vision_model.position_embeddings = get_tensor(TN_POS_EMBD, false);
vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
// layers
vision_model.layers.resize(hparams.n_layer);
for (int il = 0; il < hparams.n_layer; ++il) {
auto & layer = vision_model.layers[il];
layer.k_w = get_tensor(string_format(TN_ATTN_K, "v", il, "weight"));
layer.q_w = get_tensor(string_format(TN_ATTN_Q, "v", il, "weight"));
layer.v_w = get_tensor(string_format(TN_ATTN_V, "v", il, "weight"));
layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "weight"));
layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, "v", il, "weight"), false);
layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, "v", il, "weight"), false);
layer.ln_1_w = get_tensor(string_format(TN_LN_1, "v", il, "weight"), false);
layer.ln_2_w = get_tensor(string_format(TN_LN_2, "v", il, "weight"), false);
layer.ls_1_w = get_tensor(string_format(TN_LS_1, "v", il, "weight"), false); // no bias
layer.ls_2_w = get_tensor(string_format(TN_LS_2, "v", il, "weight"), false); // no bias
layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"));
layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"));
layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"));
layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "weight"));
layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, prefix, il, "weight"), false);
layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false);
layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"), false);
layer.ln_2_w = get_tensor(string_format(TN_LN_2, prefix, il, "weight"), false);
layer.ls_1_w = get_tensor(string_format(TN_LS_1, prefix, il, "weight"), false); // no bias
layer.ls_2_w = get_tensor(string_format(TN_LS_2, prefix, il, "weight"), false); // no bias
layer.k_b = get_tensor(string_format(TN_ATTN_K, "v", il, "bias"), false);
layer.q_b = get_tensor(string_format(TN_ATTN_Q, "v", il, "bias"), false);
layer.v_b = get_tensor(string_format(TN_ATTN_V, "v", il, "bias"), false);
layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "bias"), false);
layer.ln_1_b = get_tensor(string_format(TN_LN_1, "v", il, "bias"), false);
layer.ln_2_b = get_tensor(string_format(TN_LN_2, "v", il, "bias"), false);
layer.k_b = get_tensor(string_format(TN_ATTN_K, prefix, il, "bias"), false);
layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false);
layer.v_b = get_tensor(string_format(TN_ATTN_V, prefix, il, "bias"), false);
layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "bias"), false);
layer.ln_1_b = get_tensor(string_format(TN_LN_1, prefix, il, "bias"), false);
layer.ln_2_b = get_tensor(string_format(TN_LN_2, prefix, il, "bias"), false);
// ffn
layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, "v", il, "weight"));
layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, "v", il, "bias"), false);
layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, "v", il, "weight"), false);
layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, "v", il, "bias"), false);
layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, "v", il, "weight"));
layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, "v", il, "bias"), false);
layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, prefix, il, "weight"));
layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, prefix, il, "bias"), false);
layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, prefix, il, "weight"), false);
layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, prefix, il, "bias"), false);
layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "weight"));
layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "bias"), false);
// some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here
// note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check!
@ -2349,6 +2500,17 @@ struct clip_model_loader {
vision_model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
vision_model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
} break;
case PROJECTOR_TYPE_ULTRAVOX:
{
vision_model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
vision_model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
vision_model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
vision_model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
vision_model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
vision_model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
vision_model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
vision_model.mm_norm_mid_w = get_tensor(string_format(TN_MM_NORM_MID, "weight"));
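// note: for now the Ultravox audio tensors reuse the `vision_model` struct: the two conv1d
// layers form the whisper-style front-end, while mm_1_w / mm_2_w and the pre/mid norms
// loaded above make up the audio projector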
} break;
case PROJECTOR_TYPE_INTERNVL:
{
vision_model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
@ -2411,13 +2573,19 @@ struct clip_model_loader {
}
void alloc_compute_meta() {
const auto & hparams = ctx_clip.vision_model.hparams;
ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
// create a fake batch
clip_image_f32_batch batch;
clip_image_f32_ptr img(clip_image_f32_init());
img->nx = ctx_clip.vision_model.hparams.warmup_image_size;
img->ny = ctx_clip.vision_model.hparams.warmup_image_size;
if (hparams.has_vision) {
img->nx = hparams.warmup_image_size;
img->ny = hparams.warmup_image_size;
} else {
img->nx = 1024; // TODO @ngxson : use a better default
img->ny = hparams.n_mel_bins;
}
img->buf.resize(img->nx * img->ny * 3);
batch.entries.push_back(std::move(img));
@ -3422,6 +3590,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
n_patches = n_patches_y*n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
} else if (ctx->proj_type == PROJECTOR_TYPE_LLAMA4) {
n_patches /= (scale_factor * scale_factor);
} else if (ctx->proj_type == PROJECTOR_TYPE_ULTRAVOX) {
const int proj_stack_factor = ctx->vision_model.hparams.proj_stack_factor;
const int n_len = CLIP_ALIGN(img->nx, proj_stack_factor);
n_patches = n_len / proj_stack_factor / 2;
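// illustrative arithmetic, assuming proj_stack_factor == 8 and one 30 s mel chunk (img->nx == 3000):
//   n_len = CLIP_ALIGN(3000, 8) = 3000, n_patches = 3000 / 8 / 2 = 187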
}
return n_patches;
@ -3579,7 +3751,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
};
// set input pixel values
{
if (!imgs.is_audio) {
size_t nelem = 0;
for (const auto & img : imgs.entries) {
nelem += img->nx * img->ny * 3;
@ -3616,6 +3788,16 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
}
set_input_f32("inp_raw", inp_raw);
} else {
// audio input
GGML_ASSERT(imgs.entries.size() == 1);
const auto & mel_inp = imgs.entries[0];
const int n_step = mel_inp->nx;
const int n_mel = mel_inp->ny;
std::vector<float> inp_raw(n_step * n_mel);
std::memcpy(inp_raw.data(), mel_inp->buf.data(), n_step * n_mel * sizeof(float));
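// note: buf is expected to hold the mel spectrogram laid out as [n_mel][n_frames] row-major
// (see whisper_preprocessor::whisper_mel); it is forwarded unchanged as the "inp_raw" input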
set_input_f32("inp_raw", inp_raw);
}
// set input per projector
@ -3812,6 +3994,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_IDEFICS3:
case PROJECTOR_TYPE_INTERNVL:
case PROJECTOR_TYPE_ULTRAVOX:
{
// do nothing
} break;
@ -4076,6 +4259,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return ctx->vision_model.mm_input_proj_w->ne[0];
case PROJECTOR_TYPE_IDEFICS3:
return ctx->vision_model.projection->ne[1];
case PROJECTOR_TYPE_ULTRAVOX:
return ctx->vision_model.mm_2_w->ne[1];
case PROJECTOR_TYPE_INTERNVL:
return ctx->vision_model.mm_3_w->ne[1];
case PROJECTOR_TYPE_LLAMA4:
@ -4111,6 +4296,15 @@ bool clip_is_gemma3(const struct clip_ctx * ctx) {
bool clip_is_pixtral(const struct clip_ctx * ctx) {
return ctx->proj_type == PROJECTOR_TYPE_PIXTRAL;
}
bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
return ctx->vision_model.hparams.has_vision;
}
bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
return ctx->vision_model.hparams.has_audio;
}
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
clip_image_f32 clip_img;
clip_img.buf.resize(h * w * 3);
@ -4131,3 +4325,14 @@ bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img,
projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
return ctx->proj_type;
}
void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel) {
clip_image_f32 * audio = new clip_image_f32;
audio->nx = n_frames;
audio->ny = n_mel;
audio->buf.resize(n_frames * n_mel);
std::memcpy(audio->buf.data(), mel, n_frames * n_mel * sizeof(float));
batch->entries.push_back(clip_image_f32_ptr(audio));
batch->is_audio = true;
}
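// illustrative caller-side sketch, assuming `ctx` was loaded with an audio-capable mmproj and
// `mel` points to 128 * 3000 floats produced by the whisper preprocessor:
//
//   clip_image_f32_batch batch;
//   clip_image_f32_batch_add_mel(&batch, /*n_mel=*/128, /*n_frames=*/3000, mel);
//   const int n_tokens = clip_n_output_tokens(ctx, batch.entries[0].get());
//   std::vector<float> embd(n_tokens * clip_n_mmproj_embd(ctx));
//   clip_image_batch_encode(ctx, /*n_threads=*/4, &batch, embd.data());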

View file

@ -96,4 +96,10 @@ void set_clip_uses_gpu(bool usegpu);
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) ;
// used by audio input
void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel);
bool clip_has_vision_encoder(const struct clip_ctx * ctx);
bool clip_has_audio_encoder(const struct clip_ctx * ctx);
bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) ;

93468
tools/mtmd/miniaudio.h Normal file

File diff suppressed because it is too large

855
tools/mtmd/mtmd-audio.cpp Normal file
View file

@ -0,0 +1,855 @@
// fix problem with std::min and std::max
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#endif
#include "mtmd-audio.h"
//#define MTMD_AUDIO_DEBUG
#define MINIAUDIO_IMPLEMENTATION
#ifndef MTMD_AUDIO_DEBUG
# define MA_NO_ENCODING
#endif
#define MA_NO_DEVICE_IO
#define MA_NO_RESOURCE_MANAGER
#define MA_NO_NODE_GRAPH
#define MA_NO_ENGINE
#define MA_NO_GENERATION
#define MA_API static
#include "miniaudio.h"
#define _USE_MATH_DEFINES // for M_PI
#include <cmath>
#include <cstdint>
#include <cstring>
#include <thread>
#include <vector>
#include <fstream>
#include <algorithm>
// most of the code here is copied from whisper.cpp
// align x to upper multiple of n
#define _ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
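// e.g. _ALIGN(3000, 8) == 3000, _ALIGN(3001, 8) == 3008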
namespace whisper_preprocessor {
#define SIN_COS_N_COUNT WHISPER_N_FFT
namespace {
struct whisper_global_cache {
// In FFT, we frequently use sine and cosine operations with the same values.
// We can use precalculated values to speed up the process.
float sin_vals[SIN_COS_N_COUNT];
float cos_vals[SIN_COS_N_COUNT];
// Hann window (Use cosf to eliminate difference)
// ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html
// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147
float hann_window[WHISPER_N_FFT];
whisper_global_cache() {
fill_sin_cos_table();
fill_hann_window(sizeof(hann_window)/sizeof(hann_window[0]), true, hann_window);
}
void fill_sin_cos_table() {
for (int i = 0; i < SIN_COS_N_COUNT; i++) {
double theta = (2 * M_PI * i) / SIN_COS_N_COUNT;
sin_vals[i] = sinf(theta);
cos_vals[i] = cosf(theta);
}
}
void fill_hann_window(int length, bool periodic, float * output) {
int offset = -1;
if (periodic) {
offset = 0;
}
for (int i = 0; i < length; i++) {
output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
}
}
} global_cache;
}
// naive Discrete Fourier Transform
// input is real-valued
// output is complex-valued
static void dft(const float* in, int N, float* out) {
const int sin_cos_step = SIN_COS_N_COUNT / N;
for (int k = 0; k < N; k++) {
float re = 0;
float im = 0;
for (int n = 0; n < N; n++) {
int idx = (k * n * sin_cos_step) % (SIN_COS_N_COUNT); // t = 2*M_PI*k*n/N
re += in[n]*global_cache.cos_vals[idx]; // cos(t)
im -= in[n]*global_cache.sin_vals[idx]; // sin(t)
}
out[k*2 + 0] = re;
out[k*2 + 1] = im;
}
}
// Cooley-Tukey FFT
// poor man's implementation - use something better
// input is real-valued
// output is complex-valued
static void fft(float* in, int N, float* out) {
if (N == 1) {
out[0] = in[0];
out[1] = 0;
return;
}
const int half_N = N / 2;
if (N - half_N*2 == 1) {
dft(in, N, out);
return;
}
float* even = in + N;
for (int i = 0; i < half_N; ++i) {
even[i]= in[2*i];
}
float* even_fft = out + 2 * N;
fft(even, half_N, even_fft);
float* odd = even;
for (int i = 0; i < half_N; ++i) {
odd[i] = in[2*i + 1];
}
float* odd_fft = even_fft + N;
fft(odd, half_N, odd_fft);
const int sin_cos_step = SIN_COS_N_COUNT / N;
for (int k = 0; k < half_N; k++) {
int idx = k * sin_cos_step; // t = 2*M_PI*k/N
float re = global_cache.cos_vals[idx]; // cos(t)
float im = -global_cache.sin_vals[idx]; // sin(t)
float re_odd = odd_fft[2*k + 0];
float im_odd = odd_fft[2*k + 1];
out[2*k + 0] = even_fft[2*k + 0] + re*re_odd - im*im_odd;
out[2*k + 1] = even_fft[2*k + 1] + re*im_odd + im*re_odd;
out[2*(k + half_N) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd;
out[2*(k + half_N) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd;
}
}
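// note: the recursion stores the even/odd halves in the tail of `in` and their transforms in the
// tail of `out`, which is why the caller below sizes fft_in as frame_size * 2 and fft_out as
// frame_size * 2 * 2 * 2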
static void log_mel_spectrogram_worker_thread(int ith, const float * hann, const std::vector<float> & samples,
int n_samples, int frame_size, int frame_step, int n_threads,
const whisper_filters & filters, whisper_mel & mel) {
std::vector<float> fft_in(frame_size * 2, 0.0);
std::vector<float> fft_out(frame_size * 2 * 2 * 2);
int n_fft = filters.n_fft;
int i = ith;
// make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist
WHISPER_ASSERT(n_fft == 1 + (frame_size / 2));
// calculate FFT only when fft_in are not all zero
for (; i < std::min(n_samples / frame_step + 1, mel.n_len); i += n_threads) {
const int offset = i * frame_step;
// apply Hann window (~10% faster)
for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) {
fft_in[j] = hann[j] * samples[offset + j];
}
// fill the rest with zeros
if (n_samples - offset < frame_size) {
std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0);
}
// FFT
fft(fft_in.data(), frame_size, fft_out.data());
// Calculate modulus^2 of complex numbers
// Using pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) here causes an inference quality problem? Interesting.
for (int j = 0; j < n_fft; j++) {
fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]);
}
// mel spectrogram
for (int j = 0; j < mel.n_mel; j++) {
double sum = 0.0;
// unroll loop (suggested by GH user @lunixbochs)
int k = 0;
for (k = 0; k < n_fft - 3; k += 4) {
sum +=
fft_out[k + 0] * filters.data[j * n_fft + k + 0] +
fft_out[k + 1] * filters.data[j * n_fft + k + 1] +
fft_out[k + 2] * filters.data[j * n_fft + k + 2] +
fft_out[k + 3] * filters.data[j * n_fft + k + 3];
}
// handle n_fft remainder
for (; k < n_fft; k++) {
sum += fft_out[k] * filters.data[j * n_fft + k];
}
sum = log10(std::max(sum, 1e-10));
mel.data[j * mel.n_len + i] = sum;
}
}
// Otherwise fft_out are all zero
double sum = log10(1e-10);
for (; i < mel.n_len; i += n_threads) {
for (int j = 0; j < mel.n_mel; j++) {
mel.data[j * mel.n_len + i] = sum;
}
}
}
// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L110-L157
static bool log_mel_spectrogram(
const float * samples,
const int n_samples,
const int /*sample_rate*/,
const int frame_size,
const int frame_step,
const int n_mel,
const int n_threads,
const whisper_filters & filters,
const bool debug,
whisper_mel & mel) {
//const int64_t t_start_us = ggml_time_us();
// Hann window
WHISPER_ASSERT(frame_size == WHISPER_N_FFT && "Unsupported frame_size");
const float * hann = global_cache.hann_window;
// Calculate the length of padding
int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30;
int64_t stage_2_pad = frame_size / 2;
// Initialize a vector and copy data from C array to it.
std::vector<float> samples_padded;
samples_padded.resize(n_samples + stage_1_pad + stage_2_pad * 2);
std::copy(samples, samples + n_samples, samples_padded.begin() + stage_2_pad);
// pad 30 seconds of zeros at the end of audio (480,000 samples) + reflective pad 200 samples at the end of audio
std::fill(samples_padded.begin() + n_samples + stage_2_pad, samples_padded.begin() + n_samples + stage_1_pad + 2 * stage_2_pad, 0);
// reflective pad 200 samples at the beginning of audio
std::reverse_copy(samples + 1, samples + 1 + stage_2_pad, samples_padded.begin());
mel.n_mel = n_mel;
// https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/SpectralOps.cpp#L936
// Calculate number of frames + remove the last frame
mel.n_len = (samples_padded.size() - frame_size) / frame_step;
// Calculate semi-padded sample length to ensure compatibility
mel.n_len_org = 1 + (n_samples + stage_2_pad - frame_size) / frame_step;
mel.data.resize(mel.n_mel * mel.n_len);
{
std::vector<std::thread> workers(n_threads - 1);
for (int iw = 0; iw < n_threads - 1; ++iw) {
workers[iw] = std::thread(
log_mel_spectrogram_worker_thread, iw + 1, hann, std::cref(samples_padded),
n_samples + stage_2_pad, frame_size, frame_step, n_threads,
std::cref(filters), std::ref(mel));
}
// main thread
log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples + stage_2_pad, frame_size, frame_step, n_threads, filters, mel);
for (int iw = 0; iw < n_threads - 1; ++iw) {
workers[iw].join();
}
}
// clamping and normalization
double mmax = -1e20;
for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
if (mel.data[i] > mmax) {
mmax = mel.data[i];
}
}
mmax -= 8.0;
for (int i = 0; i < mel.n_mel*mel.n_len; i++) {
if (mel.data[i] < mmax) {
mel.data[i] = mmax;
}
mel.data[i] = (mel.data[i] + 4.0)/4.0;
}
// Dump log_mel_spectrogram
if (debug) {
std::ofstream outFile("log_mel_spectrogram.json");
outFile << "[";
for (uint64_t i = 0; i < mel.data.size() - 1; i++) {
outFile << mel.data[i] << ", ";
}
outFile << mel.data[mel.data.size() - 1] << "]";
outFile.close();
}
return true;
}
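// illustrative sizing (16 kHz input, frame_size = 400, frame_step = 160): a 10 s clip has
// 160000 samples; after padding (200-sample reflective pad at the start, 30 s of zeros plus
// 200 samples at the end) the buffer holds 640400 samples, so n_len = (640400 - 400) / 160 = 4000
// frames, of which preprocess_audio() below keeps only the first full 3000-frame chunk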
bool preprocess_audio(
const float * samples,
size_t n_samples,
const whisper_filters & filters,
std::vector<whisper_mel> & output) {
if (n_samples == 0) {
// empty audio
return false;
}
whisper_mel out_full;
bool ok = log_mel_spectrogram(
samples,
n_samples,
COMMON_SAMPLE_RATE,
WHISPER_N_FFT,
WHISPER_HOP_LENGTH,
filters.n_mel,
4, // n_threads
filters,
false, // debug
out_full);
if (!ok) {
return false;
}
// because the cgraph in clip.cpp only accepts 3000 frames each, we need to split the mel
// we always expect the mel to have 3000 silent frames at the end
// printf("n_len %d\n", out_full.n_len);
const size_t frames_per_chunk = 3000;
GGML_ASSERT((size_t)out_full.n_len > frames_per_chunk);
for (size_t off = 0; off < (size_t)out_full.n_len; off += frames_per_chunk) {
int n_len = std::min(frames_per_chunk, (size_t)out_full.n_len - off);
if ((size_t)n_len < frames_per_chunk) {
break; // the last incomplete chunk always falls inside the padded (silent) region, safe to ignore
}
whisper_mel out_chunk;
out_chunk.n_len = n_len;
out_chunk.n_mel = out_full.n_mel;
out_chunk.n_len_org = out_full.n_mel; // unused
out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len);
for (int i = 0; i < out_full.n_mel; i++) {
auto src = out_full.data.begin() + i*out_full.n_len + off;
out_chunk.data.insert(out_chunk.data.end(), src, src + frames_per_chunk);
}
output.push_back(std::move(out_chunk));
}
return true;
}
} // namespace whisper_preprocessor
namespace audio_helpers {
bool is_audio_file(const char * buf, size_t len) {
if (len < 12) {
return false;
}
// RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
// WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0;
bool is_mp3 = len >= 3 && (
memcmp(buf, "ID3", 3) == 0 ||
// Check for MPEG sync word (simplified check)
((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0)
);
bool is_flac = memcmp(buf, "fLaC", 4) == 0;
return is_wav || is_mp3 || is_flac;
}
// returns true if the buffer is a valid audio file
bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
ma_result result;
const int channels = 1;
ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
ma_decoder decoder;
result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
if (result != MA_SUCCESS) {
return false;
}
ma_uint64 frame_count;
ma_uint64 frames_read;
result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
if (result != MA_SUCCESS) {
ma_decoder_uninit(&decoder);
return false;
}
pcmf32_mono.resize(frame_count);
result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
if (result != MA_SUCCESS) {
ma_decoder_uninit(&decoder);
return false;
}
#ifdef MTMD_AUDIO_DEBUG
// save audio to wav file
ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
ma_encoder encoder;
ma_encoder_init_file("output.wav", &config, &encoder);
ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
ma_encoder_uninit(&encoder);
#endif
ma_decoder_uninit(&decoder);
return true;
}
} // namespace audio_helpers
// precalculated mel filter banks
// values are multiplied by 1000.0 to save space, and will be divided by 1000.0 in the end of the function
//
// generated from python code:
//
// from numpy import load
// data = load('mel_filters.npz')
// lst = data.files
// for item in lst:
// print(item)
// print(data[item].shape)
// n_mel = data[item].shape[0]
// n_fft = data[item].shape[1]
// for i, row in enumerate(data[item]):
// for j, val in enumerate(row):
// val = val * 1000.0
// if val != 0:
// print(f"data[{i*n_fft + j}] = {val:.6f};")
namespace whisper_precalc_filters {
whisper_preprocessor::whisper_filters get_128_bins() {
whisper_preprocessor::whisper_filters filters;
filters.n_mel = 128;
filters.n_fft = 201;
std::vector<float> data(filters.n_mel * filters.n_fft, 0.0f);
data[1] = 12.37398665;
data[202] = 30.39256483;
data[404] = 24.74797331;
data[605] = 18.01857911;
data[807] = 37.12195903;
data[1008] = 5.64459199;
data[1009] = 6.72939420;
data[1210] = 36.03715822;
data[1412] = 19.10337992;
data[1613] = 23.66316877;
data[1815] = 31.47736564;
data[2016] = 11.28918398;
data[2017] = 1.08480197;
data[2218] = 41.68175161;
data[2420] = 13.45878839;
data[2621] = 29.30776216;
data[2823] = 25.83277412;
data[3024] = 16.93377644;
data[3226] = 38.20675984;
data[3427] = 4.55979025;
data[3428] = 7.81419594;
data[3629] = 34.95235741;
data[3831] = 20.18818259;
data[4032] = 22.57836796;
data[4234] = 32.56217018;
data[4435] = 10.20438317;
data[4436] = 2.16960395;
data[4637] = 40.59694707;
data[4839] = 14.54358920;
data[5040] = 28.22295949;
data[5242] = 26.91757679;
data[5443] = 15.84897563;
data[5645] = 39.29156065;
data[5846] = 3.47498828;
data[5847] = 8.89899861;
data[6048] = 33.86755288;
data[6250] = 21.27298526;
data[6451] = 21.49356715;
data[6653] = 33.64697099;
data[6854] = 9.11958050;
data[6855] = 3.25440569;
data[7056] = 39.51214626;
data[7258] = 15.62839188;
data[7459] = 27.13815868;
data[7661] = 28.00237760;
data[7862] = 14.76417296;
data[8064] = 40.37636518;
data[8265] = 2.38068704;
data[8266] = 10.20263787;
data[8467] = 31.61146119;
data[8669] = 24.54700135;
data[8870] = 15.32919332;
data[8871] = 1.66583748;
data[9072] = 36.72905266;
data[9274] = 20.09709924;
data[9475] = 16.93102531;
data[9476] = 2.90265540;
data[9677] = 32.84499049;
data[9879] = 23.52004871;
data[10080] = 11.03894413;
data[10081] = 10.72582975;
data[10282] = 22.71829173;
data[10484] = 32.27872774;
data[10685] = 0.11626833;
data[10686] = 22.85348251;
data[10887] = 8.56344029;
data[10888] = 14.97978810;
data[11089] = 15.51398356;
data[11090] = 8.51490628;
data[11291] = 21.10680379;
data[11292] = 3.32652032;
data[11493] = 25.47064796;
data[11695] = 27.35907957;
data[11896] = 0.65853616;
data[11897] = 23.83812517;
data[12098] = 3.44359246;
data[12099] = 21.22455277;
data[12300] = 5.35842171;
data[12301] = 19.42555793;
data[12502] = 6.49324711;
data[12503] = 18.35542172;
data[12704] = 6.93138083;
data[12705] = 17.93504693;
data[12906] = 6.74968259;
data[12907] = 18.09151843;
data[13108] = 6.01899112;
data[13109] = 18.75767298;
data[13310] = 4.80452832;
data[13311] = 19.87172849;
data[13512] = 3.16627859;
data[13513] = 21.37690969;
data[13514] = 1.25317345;
data[13714] = 1.15934468;
data[13715] = 20.80361731;
data[13716] = 4.04486805;
data[13917] = 17.55363122;
data[13918] = 7.08320038;
data[14119] = 14.07538634;
data[14120] = 10.32655034;
data[14321] = 10.40921453;
data[14322] = 13.73696327;
data[14523] = 6.59187697;
data[14524] = 17.27988198;
data[14525] = 1.46804214;
data[14725] = 2.65681883;
data[14726] = 18.09193194;
data[14727] = 5.85655728;
data[14928] = 13.34277913;
data[14929] = 10.28267574;
data[15130] = 8.56800377;
data[15131] = 14.72230814;
data[15132] = 1.04039861;
data[15332] = 3.79085587;
data[15333] = 17.14678481;
data[15334] = 6.11609267;
data[15535] = 11.75929047;
data[15536] = 11.13393717;
data[15737] = 6.43857848;
data[15738] = 16.07806236;
data[15739] = 4.23917221;
data[15939] = 1.19989377;
data[15940] = 12.75671553;
data[15941] = 9.65298992;
data[16142] = 7.06935255;
data[16143] = 14.94054683;
data[16144] = 4.19024844;
data[16344] = 1.51483389;
data[16345] = 12.00899947;
data[16346] = 9.84823331;
data[16547] = 6.10224018;
data[16548] = 15.33857174;
data[16549] = 5.57676842;
data[16749] = 0.36827257;
data[16750] = 9.89749376;
data[16751] = 11.35340426;
data[16752] = 2.05122307;
data[16952] = 3.89297144;
data[16953] = 12.97352277;
data[16954] = 8.06631614;
data[17155] = 6.74493238;
data[17156] = 13.85874674;
data[17157] = 5.41190524;
data[17357] = 0.74220158;
data[17358] = 8.98779090;
data[17359] = 11.37871388;
data[17360] = 3.32958088;
data[17560] = 2.82313535;
data[17561] = 10.68049297;
data[17562] = 9.43340641;
data[17563] = 1.76325557;
data[17763] = 4.39018616;
data[17764] = 11.87758986;
data[17765] = 7.97005836;
data[17766] = 0.66104700;
data[17966] = 5.49466675;
data[17967] = 12.62953598;
data[17968] = 6.93987962;
data[18169] = 6.18401915;
data[18170] = 12.93473132;
data[18171] = 6.29778765;
data[18371] = 0.02325210;
data[18372] = 6.50206627;
data[18373] = 12.32661773;
data[18374] = 6.00216538;
data[18574] = 0.31548753;
data[18575] = 6.48925547;
data[18576] = 12.04130240;
data[18577] = 6.01462880;
data[18777] = 0.29979556;
data[18778] = 6.18288014;
data[18779] = 12.04272825;
data[18780] = 6.29981188;
data[18781] = 0.55689598;
data[18980] = 0.01120471;
data[18981] = 5.61729167;
data[18982] = 11.22337859;
data[18983] = 6.82516303;
data[18984] = 1.35264499;
data[19184] = 4.82410006;
data[19185] = 10.16623247;
data[19186] = 7.56075513;
data[19187] = 2.34590308;
data[19387] = 3.83235747;
data[19388] = 8.92296247;
data[19389] = 8.47910438;
data[19390] = 3.50978645;
data[19590] = 2.66873185;
data[19591] = 7.51965167;
data[19592] = 9.55500547;
data[19593] = 4.81966138;
data[19594] = 0.08431751;
data[19793] = 1.35767367;
data[19794] = 5.98019501;
data[19795] = 10.60271543;
data[19796] = 6.25298498;
data[19797] = 1.74059917;
data[19997] = 4.32644226;
data[19998] = 8.73131864;
data[19999] = 7.78916525;
data[20000] = 3.48923868;
data[20200] = 2.57835095;
data[20201] = 6.77582854;
data[20202] = 9.40941647;
data[20203] = 5.31194592;
data[20204] = 1.21447595;
data[20403] = 0.75411191;
data[20404] = 4.75395704;
data[20405] = 8.75380263;
data[20406] = 7.19209015;
data[20407] = 3.28754401;
data[20607] = 2.68179690;
data[20608] = 6.49331464;
data[20609] = 9.11457930;
data[20610] = 5.39387390;
data[20611] = 1.67316827;
data[20810] = 0.57394296;
data[20811] = 4.20600036;
data[20812] = 7.83805829;
data[20813] = 7.52023002;
data[20814] = 3.97470826;
data[20815] = 0.42918732;
data[21014] = 1.90464477;
data[21015] = 5.36569161;
data[21016] = 8.82673822;
data[21017] = 6.27609482;
data[21018] = 2.89750961;
data[21218] = 2.89885257;
data[21219] = 6.19694078;
data[21220] = 8.56699049;
data[21221] = 5.34748193;
data[21222] = 2.12797290;
data[21421] = 0.44750227;
data[21422] = 3.59030394;
data[21423] = 6.73310598;
data[21424] = 7.77023612;
data[21425] = 4.70231380;
data[21426] = 1.63439126;
data[21625] = 1.01536023;
data[21626] = 4.01018746;
data[21627] = 7.00501446;
data[21628] = 7.23442994;
data[21629] = 4.31095669;
data[21630] = 1.38748321;
data[21829] = 1.33348850;
data[21830] = 4.18730825;
data[21831] = 7.04112789;
data[21832] = 6.93188375;
data[21833] = 4.14605811;
data[21834] = 1.36023236;
data[22033] = 1.42879714;
data[22034] = 4.14824858;
data[22035] = 6.86769979;
data[22036] = 6.83705276;
data[22037] = 4.18239459;
data[22038] = 1.52773573;
data[22237] = 1.32610439;
data[22238] = 3.91751388;
data[22239] = 6.50892360;
data[22240] = 6.92639686;
data[22241] = 4.39672917;
data[22242] = 1.86706171;
data[22441] = 1.04827771;
data[22442] = 3.51767405;
data[22443] = 5.98707050;
data[22444] = 7.17824046;
data[22445] = 4.76767914;
data[22446] = 2.35711760;
data[22645] = 0.61636406;
data[22646] = 2.96949223;
data[22647] = 5.32262027;
data[22648] = 7.57265091;
data[22649] = 5.27558755;
data[22650] = 2.97852419;
data[22651] = 0.68146095;
data[22849] = 0.04971400;
data[22850] = 2.29204819;
data[22851] = 4.53438237;
data[22852] = 6.77671656;
data[22853] = 5.90240723;
data[22854] = 3.71349836;
data[22855] = 1.52458926;
data[23054] = 1.50285335;
data[23055] = 3.63961048;
data[23056] = 5.77636715;
data[23057] = 6.63159089;
data[23058] = 4.54574358;
data[23059] = 2.45989650;
data[23060] = 0.37404924;
data[23258] = 0.61795861;
data[23259] = 2.65410915;
data[23260] = 4.69025923;
data[23261] = 6.72641024;
data[23262] = 5.46034705;
data[23263] = 3.47270933;
data[23264] = 1.48507138;
data[23463] = 1.59233576;
data[23464] = 3.53261665;
data[23465] = 5.47289755;
data[23466] = 6.44368259;
data[23467] = 4.54962999;
data[23468] = 2.65557761;
data[23469] = 0.76152512;
data[23667] = 0.46749352;
data[23668] = 2.31641904;
data[23669] = 4.16534441;
data[23670] = 6.01426978;
data[23671] = 5.67844696;
data[23672] = 3.87357362;
data[23673] = 2.06870004;
data[23674] = 0.26382666;
data[23872] = 1.05349103;
data[23873] = 2.81536230;
data[23874] = 4.57723346;
data[23875] = 6.33910485;
data[23876] = 5.12815686;
data[23877] = 3.40826320;
data[23878] = 1.68837002;
data[24077] = 1.43350090;
data[24078] = 3.11241671;
data[24079] = 4.79133241;
data[24080] = 6.40943693;
data[24081] = 4.77052201;
data[24082] = 3.13160778;
data[24083] = 1.49269309;
data[24281] = 0.02932359;
data[24282] = 1.62918994;
data[24283] = 3.22905602;
data[24284] = 4.82892245;
data[24285] = 6.14671456;
data[24286] = 4.58496623;
data[24287] = 3.02321767;
data[24288] = 1.46146910;
data[24486] = 0.13601698;
data[24487] = 1.66055572;
data[24488] = 3.18509457;
data[24489] = 4.70963307;
data[24490] = 6.04072399;
data[24491] = 4.55250870;
data[24492] = 3.06429295;
data[24493] = 1.57607743;
data[24494] = 0.08786193;
data[24691] = 0.09328097;
data[24692] = 1.54603878;
data[24693] = 2.99879676;
data[24694] = 4.45155473;
data[24695] = 5.90431225;
data[24696] = 4.65566106;
data[24697] = 3.23751615;
data[24698] = 1.81937125;
data[24699] = 0.40122634;
data[24897] = 1.30262633;
data[24898] = 2.68698297;
data[24899] = 4.07133950;
data[24900] = 5.45569602;
data[24901] = 4.87832492;
data[24902] = 3.52695142;
data[24903] = 2.17557792;
data[24904] = 0.82420459;
data[25102] = 0.94595028;
data[25103] = 2.26512621;
data[25104] = 3.58430226;
data[25105] = 4.90347855;
data[25106] = 5.20569785;
data[25107] = 3.91795207;
data[25108] = 2.63020652;
data[25109] = 1.34246063;
data[25110] = 0.05471494;
data[25307] = 0.49037894;
data[25308] = 1.74744334;
data[25309] = 3.00450763;
data[25310] = 4.26157191;
data[25311] = 5.51863620;
data[25312] = 4.39707236;
data[25313] = 3.16995848;
data[25314] = 1.94284460;
data[25315] = 0.71573065;
data[25513] = 1.14698056;
data[25514] = 2.34485767;
data[25515] = 3.54273478;
data[25516] = 4.74061165;
data[25517] = 4.95198462;
data[25518] = 3.78264743;
data[25519] = 2.61331047;
data[25520] = 1.44397374;
data[25521] = 0.27463681;
data[25718] = 0.47569509;
data[25719] = 1.61717169;
data[25720] = 2.75864848;
data[25721] = 3.90012516;
data[25722] = 5.04160160;
data[25723] = 4.45712078;
data[25724] = 3.34284059;
data[25725] = 2.22856039;
data[25726] = 1.11428020;
for (auto & val : data) {
val /= 1000.0f;
}
filters.data = std::move(data);
return filters;
}
} // namespace whisper_precalc_filters

62
tools/mtmd/mtmd-audio.h Normal file
View file

@ -0,0 +1,62 @@
#pragma once
#include "ggml.h"
#include <cstdint>
#include <vector>
#include <string>
#define WHISPER_ASSERT GGML_ASSERT
#define WHISPER_SAMPLE_RATE 16000
#define WHISPER_N_FFT 400
#define WHISPER_HOP_LENGTH 160
#define WHISPER_CHUNK_SIZE 30
#define COMMON_SAMPLE_RATE 16000
namespace whisper_preprocessor {
struct whisper_mel {
int n_len;
int n_len_org;
int n_mel;
std::vector<float> data;
};
struct whisper_filters {
int32_t n_mel;
int32_t n_fft;
std::vector<float> data;
};
extern bool preprocess_audio(
const float * samples,
size_t n_samples,
const whisper_filters & filters,
std::vector<whisper_mel> & output);
} // namespace whisper_preprocessor
// TODO @ngxson : move this helper to mtmd-helpers.cpp
namespace audio_helpers {
extern bool is_audio_file(const char * buf, size_t len);
extern bool decode_audio_from_buf(
const unsigned char * buf_in,
size_t len,
int target_sampler_rate,
std::vector<float> & pcmf32_mono);
} // namespace audio_helpers
namespace whisper_precalc_filters {
extern whisper_preprocessor::whisper_filters get_128_bins();
} // namespace whisper_precalc_filters
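// illustrative end-to-end sketch; `file_buf` / `file_len` are assumed to hold a WAV/MP3/FLAC
// file already read into memory:
//
//   std::vector<float> pcmf32;
//   if (audio_helpers::decode_audio_from_buf(file_buf, file_len, COMMON_SAMPLE_RATE, pcmf32)) {
//       auto filters = whisper_precalc_filters::get_128_bins();
//       std::vector<whisper_preprocessor::whisper_mel> chunks;
//       whisper_preprocessor::preprocess_audio(pcmf32.data(), pcmf32.size(), filters, chunks);
//       // each chunk is a 3000-frame, 128-bin mel spectrogram ready for the audio encoder
//   }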

View file

@ -37,10 +37,10 @@ static volatile bool g_is_interrupted = false;
static void show_additional_info(int /*argc*/, char ** argv) {
LOG(
"Experimental CLI for multimodal\n\n"
"Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> -p <prompt>\n\n"
"Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> --audio <audio> -p <prompt>\n\n"
" -m and --mmproj are required\n"
" -hf user/repo can replace both -m and --mmproj in most cases\n"
" --image and -p are optional, if NOT provided, the CLI will run in chat mode\n"
" --image, --audio and -p are optional, if NOT provided, the CLI will run in chat mode\n"
" to disable using GPU for mmproj model, add --no-mmproj-offload\n",
argv[0]
);
@ -142,7 +142,7 @@ struct mtmd_cli_context {
);
}
bool load_image(const std::string & fname) {
bool load_media(const std::string & fname) {
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(fname.c_str()));
if (!bmp.ptr) {
return false;
@ -243,7 +243,7 @@ int main(int argc, char ** argv) {
common_params params;
params.sampling.temp = 0.2; // lower temp by default for better quality
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MTMD, show_additional_info)) {
return 1;
}
@ -283,14 +283,14 @@ int main(int argc, char ** argv) {
if (is_single_turn) {
g_is_generating = true;
if (params.prompt.find("<__image__>") == std::string::npos) {
params.prompt += " <__image__>";
if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
params.prompt += mtmd_default_marker();
}
common_chat_msg msg;
msg.role = "user";
msg.content = params.prompt;
for (const auto & image : params.image) {
if (!ctx.load_image(image)) {
if (!ctx.load_media(image)) {
return 1; // error is already printed by libmtmd
}
}
@ -303,7 +303,12 @@ int main(int argc, char ** argv) {
} else {
LOG("\n Running in chat mode, available commands:");
LOG("\n /image <path> load an image");
if (mtmd_support_vision(ctx.ctx_vision.get())) {
LOG("\n /image <path> load an image");
}
if (mtmd_support_audio(ctx.ctx_vision.get())) {
LOG("\n /audio <path> load an audio");
}
LOG("\n /clear clear the chat history");
LOG("\n /quit or /exit exit the program");
LOG("\n");
@ -333,15 +338,17 @@ int main(int argc, char ** argv) {
continue;
}
g_is_generating = true;
if (line == "/image" || line.find("/image ") == 0) {
bool is_image = line == "/image" || line.find("/image ") == 0;
bool is_audio = line == "/audio" || line.find("/audio ") == 0;
if (is_image || is_audio) {
if (line.size() < 8) {
LOG_ERR("ERR: Missing image filename\n");
LOG_ERR("ERR: Missing media filename\n");
continue;
}
std::string image = line.substr(7);
if (ctx.load_image(image)) {
LOG("Image %s loaded\n", image.c_str());
content += "<__image__>";
std::string media_path = line.substr(7);
if (ctx.load_media(media_path)) {
LOG("%s %s loaded\n", media_path.c_str(), is_image ? "image" : "audio");
content += mtmd_default_marker();
}
// else, error is already printed by libmtmd
continue;

View file

@ -12,17 +12,7 @@ size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks) {
size_t n_tokens = 0;
for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
auto chunk = mtmd_input_chunks_get(chunks, i);
auto chunk_type = mtmd_input_chunk_get_type(chunk);
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
size_t n_tokens_text;
mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text);
n_tokens += n_tokens_text;
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk);
n_tokens += mtmd_image_tokens_get_n_tokens(tokens_image);
} else {
GGML_ASSERT(false && "chunk type not supported");
}
n_tokens += mtmd_input_chunk_get_n_tokens(chunk);
}
return n_tokens;
}
@ -31,17 +21,7 @@ llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {
llama_pos n_pos = 0;
for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) {
auto chunk = mtmd_input_chunks_get(chunks, i);
auto chunk_type = mtmd_input_chunk_get_type(chunk);
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
size_t n_tokens_text;
mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text);
n_pos += n_tokens_text;
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk);
n_pos += mtmd_image_tokens_get_n_pos(tokens_image);
} else {
GGML_ASSERT(false && "chunk type not supported");
}
n_pos += mtmd_input_chunk_get_n_pos(chunk);
}
return n_pos;
}
@ -149,13 +129,10 @@ int32_t mtmd_helper_decode_image_chunk(
llama_seq_id seq_id,
int32_t n_batch,
llama_pos * new_n_past) {
if (mtmd_input_chunk_get_type(chunk) != MTMD_INPUT_CHUNK_TYPE_IMAGE) {
LOG_ERR("failed to decode image chunk: input chunk not of image type\n");
return -1;
}
const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
if (!image_tokens) {
LOG_ERR("failed to decode image chunk: image tokens are null\n");
auto chunk_type = mtmd_input_chunk_get_type(chunk);
const char * name = chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "image" : "audio";
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
LOG_ERR("failed to decode chunk: input chunk not of image/audio type\n");
return -1;
}
@ -163,15 +140,23 @@ int32_t mtmd_helper_decode_image_chunk(
int n_mmproj_embd = llama_model_n_embd(model);
int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
int32_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
int32_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk);
int32_t i_batch = 0;
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
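// e.g. n_tokens = 187 with n_batch = 64 gives n_img_batches = 3 (two batches of 64, then one of 59)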
decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
const int nx = mtmd_image_tokens_get_nx(image_tokens);
const int ny = mtmd_image_tokens_get_ny(image_tokens);
if (mtmd_decode_use_mrope(ctx)) {
const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
if (chunk_type != MTMD_INPUT_CHUNK_TYPE_IMAGE) {
LOG_ERR("failed to decode chunk: M-RoPE only accepts image chunk\n");
return -1;
}
if (!image_tokens) {
LOG_ERR("failed to decode chunk: image tokens are null\n");
return -1;
}
const int nx = mtmd_image_tokens_get_nx(image_tokens);
const int ny = mtmd_image_tokens_get_ny(image_tokens);
batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
} else {
batch_embd.set_position_normal(n_past, seq_id);
@ -187,22 +172,22 @@ int32_t mtmd_helper_decode_image_chunk(
int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
LOG_INF("decoding %s batch %d/%d, n_tokens_batch = %d\n", name, i_batch+1, n_img_batches, n_tokens_batch);
int64_t t1 = ggml_time_ms();
int32_t ret = llama_decode(lctx, batch_embd_view);
if (ret != 0) {
LOG_ERR("failed to decode image\n");
LOG_ERR("failed to decode %s\n", name);
llama_set_causal_attn(lctx, true); // restore causal attn
return ret;
}
LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1);
LOG_INF("%s decoded (batch %d/%d) in %" PRId64 " ms\n", name, i_batch+1, n_img_batches, ggml_time_ms() - t1);
i_batch++;
}
n_past += mtmd_image_tokens_get_n_pos(image_tokens);
n_past += mtmd_input_chunk_get_n_pos(chunk);
*new_n_past = n_past;
if (mtmd_decode_use_non_causal(ctx)) {
@ -253,25 +238,25 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
*new_n_past += text_batch.n_tokens;
}
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE || chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
const char * name = chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "image" : "audio";
int64_t t0 = ggml_time_ms();
LOG_INF("encoding image or slice...\n");
LOG_INF("encoding %s slice...\n", name);
ret = mtmd_encode(ctx, image_tokens);
ret = mtmd_encode_chunk(ctx, chunk);
if (ret != 0) {
LOG_ERR("failed to encode image\n");
LOG_ERR("failed to encode %s slice\n", name);
llama_batch_free(text_batch);
return ret;
}
LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
LOG_INF("%s slice encoded in %" PRId64 " ms\n", name, ggml_time_ms() - t0);
float * embd = mtmd_get_output_embd(ctx);
ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past);
if (ret != 0) {
LOG_ERR("failed to decode image\n");
LOG_ERR("failed to decode %s\n", name);
llama_batch_free(text_batch);
return ret;
}

View file

@ -1,6 +1,7 @@
#include "clip.h"
#include "clip-impl.h"
#include "mtmd.h"
#include "mtmd-audio.h"
#include "llama.h"
@ -19,17 +20,49 @@ struct mtmd_bitmap {
uint32_t ny;
std::vector<unsigned char> data;
std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
bool is_audio = false; // true if the bitmap is audio
};
struct mtmd_image_tokens_deleter {
void operator()(mtmd_image_tokens * val); // forward declaration
struct mtmd_image_tokens {
uint32_t nx; // number of tokens in x direction
uint32_t ny; // number of tokens in y direction
bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
uint32_t n_tokens() const { return nx * ny; }
clip_image_f32_batch batch_f32; // preprocessed image patches
std::string id; // optional user-defined ID, useful for KV cache tracking
mtmd_image_tokens clone() {
return mtmd_image_tokens{
nx,
ny,
use_mrope_pos,
batch_f32.clone(),
id
};
}
};
using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens, mtmd_image_tokens_deleter>;
using mtmd_image_tokens_ptr = std::unique_ptr<mtmd_image_tokens>;
struct mtmd_audio_tokens {
uint32_t n_tokens; // number of tokens
clip_image_f32_batch batch_f32; // preprocessed image patches
std::string id; // optional user-defined ID, useful for KV cache tracking
mtmd_audio_tokens clone() {
return mtmd_audio_tokens{
n_tokens,
batch_f32.clone(),
id
};
}
};
using mtmd_audio_tokens_ptr = std::unique_ptr<mtmd_audio_tokens>;
struct mtmd_input_chunk {
mtmd_input_chunk_type type;
std::vector<llama_token> tokens_text;
mtmd_image_tokens_ptr tokens_image;
mtmd_audio_tokens_ptr tokens_audio;
};
struct mtmd_input_chunks {
@ -46,6 +79,10 @@ enum mtmd_slice_tmpl {
// TODO @ngxson : add support for idefics (SmolVLM)
};
const char * mtmd_default_marker() {
return "<__media__>";
}
mtmd_context_params mtmd_context_params_default() {
mtmd_context_params params;
params.use_gpu = true;
@ -53,6 +90,7 @@ mtmd_context_params mtmd_context_params_default() {
params.n_threads = 4;
params.verbosity = GGML_LOG_LEVEL_INFO;
params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
params.media_marker = mtmd_default_marker();
return params;
}
@ -63,7 +101,9 @@ struct mtmd_context {
bool print_timings;
int n_threads;
std::string image_marker;
std::string media_marker;
bool has_vision;
bool has_audio;
// for llava-uhd style models, we need special tokens in-between slices
// minicpmv calls them "slices", llama 4 calls them "tiles"
@ -81,6 +121,9 @@ struct mtmd_context {
bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
// for whisper, we pre-calculate the mel filter bank
whisper_preprocessor::whisper_filters w_filters;
// TODO @ngxson : add timings
mtmd_context(const char * mmproj_fname,
@ -89,8 +132,12 @@ struct mtmd_context {
text_model (text_model),
print_timings(ctx_params.print_timings),
n_threads (ctx_params.n_threads),
image_marker (ctx_params.image_marker)
media_marker (ctx_params.media_marker)
{
if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
}
clip_context_params ctx_clip_params;
ctx_clip_params.use_gpu = ctx_params.use_gpu;
ctx_clip_params.verbosity = ctx_params.verbosity;
@ -99,7 +146,9 @@ struct mtmd_context {
throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
}
use_mrope = clip_is_qwen2vl(ctx_clip);
has_vision = clip_has_vision_encoder(ctx_clip);
has_audio = clip_has_audio_encoder(ctx_clip);
use_mrope = clip_is_qwen2vl(ctx_clip);
projector_type proj = clip_get_projector_type(ctx_clip);
int minicpmv_version = clip_is_minicpmv(ctx_clip);
@ -146,6 +195,21 @@ struct mtmd_context {
tok_row_end_trail = true; // add trailing end-of-row token
ov_img_first = false; // overview image is last
}
if (proj == PROJECTOR_TYPE_ULTRAVOX) {
// TODO @ngxson : check if model n_mel is 128 or 80
w_filters = whisper_precalc_filters::get_128_bins();
}
// warning messages
if (proj == PROJECTOR_TYPE_LLAMA4) {
LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
" https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
}
if (has_audio) {
LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
" https://github.com/ggml-org/llama.cpp/pull/13623\n", __func__);
}
}
~mtmd_context() {
@ -179,29 +243,6 @@ private:
}
};
struct mtmd_image_tokens_data {
clip_image_f32_batch batch_f32; // preprocessed image patches
};
struct mtmd_image_tokens {
uint32_t nx; // number of tokens in x direction
uint32_t ny; // number of tokens in y direction
bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
uint32_t n_tokens() const { return nx * ny; }
clip_image_f32_batch batch_f32; // preprocessed image patches
std::string id; // optional user-defined ID, useful for KV cache tracking
mtmd_image_tokens clone() {
return mtmd_image_tokens{
nx,
ny,
use_mrope_pos,
batch_f32.clone(),
id
};
}
};
mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
const struct llama_model * text_model,
const struct mtmd_context_params ctx_params) {
@ -247,59 +288,63 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
auto vocab = llama_model_get_vocab(ctx->text_model);
std::string prompt_modified(text->text);
std::string marker_modified(ctx->image_marker);
std::string marker_modified(ctx->media_marker);
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
// for compatibility, we convert image marker to media marker
string_replace_all(prompt_modified, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
// a bit hacky here, but works for now
// for some models, we need to add prefix and suffix to the image embeddings
if (clip_is_gemma3(ctx->ctx_clip)) {
// gemma 3
// <start_of_image> ... (image embeddings) ... <end_of_image>
marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
marker_modified = "<start_of_image>" + ctx->media_marker + "<end_of_image>";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
marker_modified = "<fake_token_around_image><global-img>" + ctx->image_marker + "<fake_token_around_image>";
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
marker_modified = "<fake_token_around_image><global-img>" + ctx->media_marker + "<fake_token_around_image>";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_PIXTRAL) {
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
marker_modified = ctx->image_marker + "[IMG_END]";
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
marker_modified = ctx->media_marker + "[IMG_END]";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
marker_modified = "<|vision_start|>" + ctx->image_marker + "<|vision_end|>";
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
marker_modified = "<|vision_start|>" + ctx->media_marker + "<|vision_end|>";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_LLAMA4) {
// (more details in mtmd_context constructor)
marker_modified = "<|image_start|>" + ctx->image_marker + "<|image_end|>";
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
marker_modified = "<|image_start|>" + ctx->media_marker + "<|image_end|>";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
// <img> ... (image embeddings) ... </img>
marker_modified = "<img>" + ctx->image_marker + "</img>";
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
marker_modified = "<img>" + ctx->media_marker + "</img>";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
}
// llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
// for glm-edge, BOI and EOI token's embeddings are not present in the text model
std::vector<std::string> parts = string_split_str(prompt_modified, ctx->image_marker);
std::vector<std::string> parts = string_split_str(prompt_modified, ctx->media_marker);
output->entries.clear();
output->entries.reserve(parts.size());
size_t i_img = 0;
size_t i_bm = 0;
// utility for adding raw tokens
auto add_text_chunk = [&output](std::vector<llama_token> && tokens) {
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_TEXT,
std::move(tokens),
{},
nullptr, // image tokens
nullptr, // audio tokens
};
output->entries.emplace_back(std::move(chunk));
};
@ -317,8 +362,9 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_IMAGE,
{},
{}, // text tokens
std::move(image_tokens),
nullptr, // audio tokens
};
chunks.emplace_back(std::move(chunk));
}
@ -336,24 +382,36 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_TEXT,
std::move(tokens),
{},
nullptr, // image tokens
nullptr, // audio tokens
};
output->entries.emplace_back(std::move(chunk));
if (&parts.back() != &part) {
// add image token to middle of 2 parts
// only add image/audio tokens to middle of 2 parts
// therefore, we skip handling image/audio if this is the last part
if (&parts.back() == &part) {
continue;
}
if (i_img >= n_bitmaps) {
if (!bitmaps[i_bm]->is_audio) {
// handle image
if (i_bm >= n_bitmaps) {
LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
return 1;
}
if (!ctx->has_vision) {
LOG_ERR("%s: error: model does not support vision input\n", __func__);
return 2;
}
// convert mtmd_bitmap to clip_image_u8
clip_image_u8_ptr img_u8(clip_image_u8_init());
img_u8->nx = bitmaps[i_img]->nx;
img_u8->ny = bitmaps[i_img]->ny;
img_u8->buf.resize(bitmaps[i_img]->data.size());
std::memcpy(img_u8->buf.data(), bitmaps[i_img]->data.data(), img_u8->nx * img_u8->ny * 3);
img_u8->nx = bitmaps[i_bm]->nx;
img_u8->ny = bitmaps[i_bm]->ny;
img_u8->buf.resize(bitmaps[i_bm]->data.size());
std::memcpy(img_u8->buf.data(), bitmaps[i_bm]->data.data(), img_u8->nx * img_u8->ny * 3);
// preprocess image
clip_image_f32_batch batch_f32;
@ -370,7 +428,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
) {
// split batch into chunks of single images
auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_img]->id);
auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_bm]->id);
GGML_ASSERT(chunks.size() > 0);
auto ov_chunk = std::move(chunks.front());
@ -446,7 +504,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
image_tokens->ny = 1;
}
image_tokens->batch_f32 = std::move(batch_f32);
image_tokens->id = bitmaps[i_img]->id; // optional
image_tokens->id = bitmaps[i_bm]->id; // optional
LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
@ -454,23 +512,101 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_IMAGE,
{},
{}, // text tokens
std::move(image_tokens),
nullptr, // audio tokens
};
output->entries.emplace_back(std::move(chunk));
}
i_img++; // move to next image
i_bm++; // move to next image
continue;
} else {
// handle audio
if (i_bm >= n_bitmaps) {
LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
return 1;
}
if (!ctx->has_audio) {
LOG_ERR("%s: error: model does not support audio input\n", __func__);
return 2;
}
if (bitmaps[i_bm]->data.size() == 0) {
LOG_ERR("%s: error: empty audio data\n", __func__);
return 2;
}
// preprocess audio
GGML_ASSERT(ctx->w_filters.n_mel); // make sure we have filter preloaded
std::vector<whisper_preprocessor::whisper_mel> mel_spec_chunks;
const float * samples = (const float *)bitmaps[i_bm]->data.data();
size_t n_samples = bitmaps[i_bm]->data.size() / sizeof(float);
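// the bitmap data is expected to be raw PCM F32 mono at COMMON_SAMPLE_RATE, as produced by
// mtmd_bitmap_init_from_audio() / mtmd_helper_bitmap_init_from_buf()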
bool ok = whisper_preprocessor::preprocess_audio(samples, n_samples, ctx->w_filters, mel_spec_chunks);
if (!ok) {
LOG_ERR("Unable to preprocess audio\n");
return 2;
}
// consider each mel_spec as a separate audio chunk
// TODO: maybe support batching, but this may come with memory cost
for (auto & mel_spec : mel_spec_chunks) {
clip_image_f32_ptr mel_f32(clip_image_f32_init());
mel_f32->nx = mel_spec.n_len;
mel_f32->ny = mel_spec.n_mel;
mel_f32->buf = std::move(mel_spec.data);
size_t n_tokens = clip_n_output_tokens(ctx->ctx_clip, mel_f32.get());
clip_image_f32_batch batch_f32;
batch_f32.is_audio = true;
batch_f32.entries.push_back(std::move(mel_f32));
mtmd_audio_tokens_ptr audio_tokens(new mtmd_audio_tokens);
audio_tokens->n_tokens = n_tokens;
audio_tokens->batch_f32 = std::move(batch_f32);
audio_tokens->id = bitmaps[i_bm]->id; // optional
LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens);
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_AUDIO,
{}, // text tokens
nullptr, // image tokens
std::move(audio_tokens),
};
output->entries.emplace_back(std::move(chunk));
}
i_bm++;
continue;
}
}
return 0;
}
static void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens) {
if (image_tokens) {
delete image_tokens;
int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
return 0;
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
return mtmd_encode(ctx, chunk->tokens_image.get());
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
bool ok = clip_image_batch_encode(
ctx->ctx_clip,
ctx->n_threads,
&chunk->tokens_audio->batch_f32,
ctx->image_embd_v.data());
return ok ? 0 : 1;
}
LOG_ERR("mtmd_encode_chunk: unknown chunk type %d\n", (int)chunk->type);
return 1;
}
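// illustrative use: this is roughly what mtmd_helper_eval_chunk_single() in mtmd-helper.cpp
// does for non-text chunks:
//
//   if (mtmd_encode_chunk(ctx, chunk) == 0) {
//       float * embd = mtmd_get_output_embd(ctx);
//       // ... decode `embd` in batches of n_batch via mtmd_helper_decode_image_chunk()
//   }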
int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
@ -516,8 +652,12 @@ bool mtmd_decode_use_mrope(mtmd_context * ctx) {
return ctx->use_mrope;
}
void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
mtmd_image_tokens_free(val);
bool mtmd_support_vision(mtmd_context * ctx) {
return ctx->has_vision;
}
bool mtmd_support_audio(mtmd_context * ctx) {
return ctx->has_audio;
}
// these 2 helpers below use internal clip_image_u8_ptr,
@ -526,6 +666,15 @@ void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
// whichever library they want, and then use mtmd_bitmap_init() to create bitmap
mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len) {
if (audio_helpers::is_audio_file((const char *)buf, len)) {
std::vector<float> pcmf32;
if (!audio_helpers::decode_audio_from_buf(buf, len, COMMON_SAMPLE_RATE, pcmf32)) {
LOG_ERR("Unable to read WAV audio file from buffer\n");
return nullptr;
}
return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
}
clip_image_u8_ptr img_u8(clip_image_u8_init());
bool ok = clip_image_load_from_bytes(buf, len, img_u8.get(), 2048);
if (!ok) {
@ -538,15 +687,26 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t
}
mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname) {
clip_image_u8_ptr img_u8(clip_image_u8_init());
bool ok = clip_image_load_from_file(fname, img_u8.get());
if (!ok) {
LOG_ERR("Unable to load image %s\n", fname);
std::vector<unsigned char> buf;
FILE * f = fopen(fname, "rb");
if (!f) {
LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
return nullptr;
}
uint32_t nx, ny;
unsigned char * data = clip_image_u8_get_data(img_u8.get(), &nx, &ny);
return mtmd_bitmap_init(nx, ny, data);
fseek(f, 0, SEEK_END);
long file_size = ftell(f);
fseek(f, 0, SEEK_SET);
buf.resize(file_size);
size_t n_read = fread(buf.data(), 1, file_size, f);
fclose(f);
if (n_read != (size_t)file_size) {
LOG_ERR("Failed to read entire file %s", fname);
return nullptr;
}
return mtmd_helper_bitmap_init_from_buf(buf.data(), buf.size());
}
//
@ -567,6 +727,18 @@ mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
return bitmap;
}
mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples,
const float * data) {
mtmd_bitmap * bitmap = new mtmd_bitmap;
bitmap->nx = n_samples;
bitmap->ny = 1;
bitmap->is_audio = true;
size_t data_size = n_samples * sizeof(float);
bitmap->data.resize(data_size);
std::memcpy(bitmap->data.data(), data, data_size);
return bitmap;
}
uint32_t mtmd_bitmap_get_nx(const mtmd_bitmap * bitmap) {
return bitmap->nx;
}
@ -579,6 +751,14 @@ const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
return bitmap->data.data();
}
size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) {
return bitmap->data.size();
}
bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
return bitmap->is_audio;
}
const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) {
return bitmap->id.c_str();
}
@ -642,17 +822,56 @@ const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chu
return nullptr;
}
size_t mtmd_input_chunk_get_n_tokens(const mtmd_input_chunk * chunk) {
if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
return chunk->tokens_text.size();
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
return mtmd_image_tokens_get_n_tokens(chunk->tokens_image.get());
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
return chunk->tokens_audio->n_tokens;
} else {
GGML_ABORT("invalid chunk type");
}
}
llama_pos mtmd_input_chunk_get_n_pos(const mtmd_input_chunk * chunk) {
if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
return chunk->tokens_text.size();
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
return mtmd_image_tokens_get_n_pos(chunk->tokens_image.get());
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
return chunk->tokens_audio->n_tokens;
} else {
GGML_ABORT("invalid chunk type");
}
}
const char * mtmd_input_chunk_get_id(const mtmd_input_chunk * chunk) {
if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
return chunk->tokens_image->id.c_str();
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
return chunk->tokens_audio->id.c_str();
}
return nullptr;
}
mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk) {
mtmd_input_chunk * copy = new mtmd_input_chunk{
chunk->type,
chunk->tokens_text,
mtmd_image_tokens_ptr(),
nullptr,
nullptr,
};
if (chunk->tokens_image) {
// copy the image tokens
copy->tokens_image = mtmd_image_tokens_ptr(new mtmd_image_tokens());
*copy->tokens_image = chunk->tokens_image->clone();
}
if (chunk->tokens_audio) {
// copy the audio tokens
copy->tokens_audio = mtmd_audio_tokens_ptr(new mtmd_audio_tokens());
*copy->tokens_audio = chunk->tokens_audio->clone();
}
return copy;
}
@ -700,7 +919,8 @@ mtmd_input_chunks * mtmd_test_create_input_chunks() {
mtmd_input_chunk chunk_text{
MTMD_INPUT_CHUNK_TYPE_TEXT,
std::move(tokens_text),
{},
nullptr, // image tokens
nullptr, // audio tokens
};
chunks->entries.emplace_back(std::move(chunk_text));
@ -712,8 +932,9 @@ mtmd_input_chunks * mtmd_test_create_input_chunks() {
image_tokens->id = "image_1";
mtmd_input_chunk chunk_image{
MTMD_INPUT_CHUNK_TYPE_IMAGE,
{},
{}, // text tokens
std::move(image_tokens),
nullptr, // audio tokens
};
chunks->entries.emplace_back(std::move(chunk_image));

View file

@ -39,6 +39,7 @@
# define MTMD_API
#endif
// deprecated marker, use mtmd_default_marker() instead
#define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"
#ifdef __cplusplus
@ -48,6 +49,7 @@ extern "C" {
enum mtmd_input_chunk_type {
MTMD_INPUT_CHUNK_TYPE_TEXT,
MTMD_INPUT_CHUNK_TYPE_IMAGE,
MTMD_INPUT_CHUNK_TYPE_AUDIO,
};
// opaque types
@ -79,9 +81,12 @@ struct mtmd_context_params {
bool print_timings;
int n_threads;
enum ggml_log_level verbosity;
const char * image_marker;
const char * image_marker; // deprecated, use media_marker instead
const char * media_marker;
};
MTMD_API const char * mtmd_default_marker(void);
MTMD_API struct mtmd_context_params mtmd_context_params_default(void);
// initialize the mtmd context
@ -98,18 +103,28 @@ MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
// whether the current model use M-RoPE for llama_decode
MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
// whether the current model supports vision input
MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
// whether the current model supports audio input
MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
// mtmd_bitmap
//
// length of data must be nx * ny * 3
// the data is in RGBRGBRGB... format
MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx,
uint32_t ny,
const unsigned char * data);
MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap);
MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap);
MTMD_API const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap);
MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap);
// if bitmap is image:
// length of data must be nx * ny * 3
// the data is in RGBRGBRGB... format
// if bitmap is audio:
// length of data must be n_samples * sizeof(float)
// the data is in float format (PCM F32)
MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data);
MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data);
MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap);
MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap);
MTMD_API const unsigned char * mtmd_bitmap_get_data (const mtmd_bitmap * bitmap);
MTMD_API size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap);
MTMD_API bool mtmd_bitmap_is_audio (const mtmd_bitmap * bitmap);
MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap);
// bitmap ID is optional, but useful for KV cache tracking
// these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data()
MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);
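As an illustration of the bitmap API above (an editor sketch, not part of this diff; the 2x2 RGB buffer and the one-second silent clip are made-up inputs):

#include <cstdio>
#include <vector>
#include "mtmd.h"

int main() {
    // image bitmap: data length must be nx * ny * 3, laid out RGBRGBRGB...
    const unsigned char rgb[2 * 2 * 3] = {0};                   // 2x2 black image
    mtmd_bitmap * img = mtmd_bitmap_init(2, 2, rgb);

    // audio bitmap: n_samples PCM F32 samples
    std::vector<float> pcm(16000, 0.0f);                        // ~1 s of silence at 16 kHz
    mtmd_bitmap * aud = mtmd_bitmap_init_from_audio(pcm.size(), pcm.data());

    printf("image: %zu bytes, is_audio=%d\n", mtmd_bitmap_get_n_bytes(img), (int) mtmd_bitmap_is_audio(img));
    printf("audio: %zu bytes, is_audio=%d\n", mtmd_bitmap_get_n_bytes(aud), (int) mtmd_bitmap_is_audio(aud));

    mtmd_bitmap_set_id(img, "image_0");                         // optional, e.g. for KV cache tracking

    mtmd_bitmap_free(img);
    mtmd_bitmap_free(aud);
    return 0;
}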
@ -132,6 +147,11 @@ MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chu
MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type (const mtmd_input_chunk * chunk);
MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text (const mtmd_input_chunk * chunk, size_t * n_tokens_output);
MTMD_API const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk);
MTMD_API size_t mtmd_input_chunk_get_n_tokens (const mtmd_input_chunk * chunk);
// returns nullptr for ID on text chunk
MTMD_API const char * mtmd_input_chunk_get_id (const mtmd_input_chunk * chunk);
// number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
MTMD_API llama_pos mtmd_input_chunk_get_n_pos (const mtmd_input_chunk * chunk);
// in case you want to use custom logic to handle the chunk (i.e. KV cache management)
// you can move the chunk ownership to your own code by copying it
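A minimal sketch of the ownership pattern mentioned above (hypothetical caller code; stash_chunk and the surrounding cache are assumptions, only the mtmd_input_chunk_* calls come from this header):

#include "mtmd.h"

// keep a media chunk alive beyond the lifetime of its mtmd_input_chunks container,
// e.g. for custom KV cache management keyed by the chunk ID
static mtmd_input_chunk * stash_chunk(const mtmd_input_chunk * chunk) {
    mtmd_input_chunk * owned = mtmd_input_chunk_copy(chunk);   // deep copy, caller owns it
    const char * id = mtmd_input_chunk_get_id(owned);          // nullptr for text chunks
    (void) id;                                                 // ... store `owned` in your own map ...
    return owned;                                              // release later with mtmd_input_chunk_free()
}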
@ -144,27 +164,28 @@ MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk);
//
// the instance will be constructed via mtmd_tokenize()
// it will be freed along with mtmd_input_chunk
MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens);
MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); // TODO: deprecate
MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens);
MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens);
MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens);
MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens); // TODO: deprecate
// number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens);
MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); // TODO: deprecate
// tokenize an input text prompt and an image
// the prompt must have the input image marker (default: "<__image__>") in it
// the marker will be replaced with the image tokens
// tokenize an input text prompt and a list of bitmaps (images/audio)
// the prompt must have the input image marker (default: "<__media__>") in it
// the default marker is defined by mtmd_default_marker()
// the marker will be replaced with the image/audio chunk
// for example:
// "here is an image: <__image__>\ndescribe it in detail."
// "here is an image: <__media__>\ndescribe it in detail."
// this will give 3 chunks:
// 1. "here is an image: <start_of_image>"
// 2. (image tokens)
// 2. (image/audio tokens)
// 3. "<end_of_image>\ndescribe it in detail."
// number of bitmaps must be equal to the number of image markers in the prompt
// number of bitmaps must be equal to the number of markers in the prompt
// this function is thread-safe (shared ctx)
// return values:
// 0 on success
// 1 on number of images not matching the number of markers
// 1 on number of bitmaps not matching the number of markers
// 2 on image preprocessing error
MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
mtmd_input_chunks * output,
@ -173,9 +194,14 @@ MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
size_t n_bitmaps);
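A sketch of a typical mtmd_tokenize call (editor illustration; the mtmd_input_text struct and the text/bitmap-array parameters come from the full header rather than this hunk, so treat the exact field names as assumptions):

#include <cstdio>
#include <string>
#include "mtmd.h"

// tokenize one prompt containing a single media marker, with one matching bitmap
static int32_t tokenize_one(mtmd_context * ctx, const mtmd_bitmap * bmp, mtmd_input_chunks * out) {
    std::string prompt = std::string("here is an image: ") + mtmd_default_marker() + "\ndescribe it in detail.";

    mtmd_input_text text;
    text.text          = prompt.c_str();
    text.add_special   = true;    // add BOS etc. according to the model
    text.parse_special = true;

    const mtmd_bitmap * bitmaps[] = { bmp };   // one bitmap per marker in the prompt
    int32_t res = mtmd_tokenize(ctx, out, &text, bitmaps, 1);
    if (res == 1) fprintf(stderr, "bitmap count does not match marker count\n");
    if (res == 2) fprintf(stderr, "media preprocessing failed\n");
    return res;   // 0 on success; `out` now holds the text and image/audio chunks
}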
// returns 0 on success
// TODO: deprecate
MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
const mtmd_image_tokens * image_tokens);
// returns 0 on success
MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
const mtmd_input_chunk * chunk);
// get output embeddings from the last encode pass
MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
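Continuing the sketch, encoding the non-text chunks produced by mtmd_tokenize() might look like this (mtmd_input_chunks_size/get come from the full header; feeding the returned embeddings back into llama_decode is left out):

#include <cstdio>
#include "mtmd.h"

// encode every image/audio chunk and fetch its embeddings
static int32_t encode_media(mtmd_context * ctx, mtmd_input_chunks * chunks) {
    for (size_t i = 0; i < mtmd_input_chunks_size(chunks); ++i) {
        const mtmd_input_chunk * chunk = mtmd_input_chunks_get(chunks, i);
        if (mtmd_input_chunk_get_type(chunk) == MTMD_INPUT_CHUNK_TYPE_TEXT) {
            continue;   // text chunks are decoded by llama_decode() directly
        }
        int32_t res = mtmd_encode_chunk(ctx, chunk);
        if (res != 0) {
            fprintf(stderr, "mtmd_encode_chunk failed on chunk %zu\n", i);
            return res;
        }
        float * embd    = mtmd_get_output_embd(ctx);    // n_tokens * n_embd floats
        size_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk);
        (void) embd; (void) n_tokens;                   // feed into a llama_batch here
    }
    return 0;
}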
@ -189,12 +215,16 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
//
// helper function to construct a mtmd_bitmap from a file
// it calls mtmd_helper_bitmap_init_from_buf() internally
// returns nullptr on failure
// this function is thread-safe
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname);
// helper function to construct a mtmd_bitmap from a buffer containing a file
// the file content must be an image in format supported by stb_image (jpg, png, bmp, gif, etc.)
// supported formats:
// image: formats supported by stb_image: jpg, png, bmp, gif, etc.
// audio: formats supported by miniaudio: wav, mp3, flac
// note: audio files will be auto-detected based on magic bytes
// returns nullptr on failure
// this function is thread-safe
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len);
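A small sketch of using the helper above with an in-memory file (editor illustration; the buffer is assumed to hold a complete image or wav/mp3/flac file):

#include <cstdio>
#include "mtmd.h"

// load a media file from memory; image vs. audio is auto-detected from magic bytes
static mtmd_bitmap * load_media(const unsigned char * buf, size_t len) {
    mtmd_bitmap * bmp = mtmd_helper_bitmap_init_from_buf(buf, len);
    if (!bmp) {
        fprintf(stderr, "unsupported or corrupt media file\n");
        return nullptr;
    }
    if (mtmd_bitmap_is_audio(bmp)) {
        // for audio, nx holds the number of PCM F32 samples and ny is 1
        printf("audio clip with %u samples\n", mtmd_bitmap_get_nx(bmp));
    } else {
        printf("image of %ux%u pixels\n", mtmd_bitmap_get_nx(bmp), mtmd_bitmap_get_ny(bmp));
    }
    return bmp;   // caller frees with mtmd_bitmap_free()
}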
@ -293,6 +323,7 @@ struct bitmap {
uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); }
uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); }
const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); }
size_t n_bytes() { return mtmd_bitmap_get_n_bytes(ptr.get()); }
std::string id() { return mtmd_bitmap_get_id(ptr.get()); }
void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); }
};

Binary file not shown.

View file

@ -1891,6 +1891,7 @@ struct server_context {
float slot_prompt_similarity = 0.0f;
common_chat_templates_ptr chat_templates;
oaicompat_parser_options oai_parser_opt;
~server_context() {
mtmd_free(mctx);
@ -2086,6 +2087,15 @@ struct server_context {
}
metrics.init();
oai_parser_opt = {
/* use_jinja */ params_base.use_jinja,
/* prefill_assistant */ params_base.prefill_assistant,
/* reasoning_format */ params_base.reasoning_format,
/* common_chat_templates */ chat_templates.get(),
/* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
/* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
};
}
server_slot * get_slot_by_id(int id) {
@ -4092,7 +4102,10 @@ int main(int argc, char ** argv) {
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
{ "total_slots", ctx_server.params_base.n_parallel },
{ "model_path", ctx_server.params_base.model.path },
{ "modalities", json{{"vision", ctx_server.mctx != nullptr}} }, // TODO: add more in the future
{ "modalities", json{
{"vision", ctx_server.oai_parser_opt.allow_image},
{"audio", ctx_server.oai_parser_opt.allow_audio},
} },
{ "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
{ "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
{ "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
@ -4183,10 +4196,10 @@ int main(int argc, char ** argv) {
for (auto & file : files) {
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size()));
if (!bmp.ptr) {
throw std::runtime_error("Failed to load image");
throw std::runtime_error("Failed to load image or audio file");
}
// calculate bitmap hash (for KV caching)
std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
std::string hash = fnv_hash(bmp.data(), bmp.n_bytes());
bmp.set_id(hash.c_str());
bitmaps.entries.push_back(std::move(bmp));
}
@ -4418,7 +4431,7 @@ int main(int argc, char ** argv) {
OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
};
const auto handle_chat_completions = [&ctx_server, &params, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
const auto handle_chat_completions = [&ctx_server, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
LOG_DBG("request: %s\n", req.body.c_str());
if (ctx_server.params_base.embedding) {
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
@ -4427,13 +4440,9 @@ int main(int argc, char ** argv) {
auto body = json::parse(req.body);
std::vector<raw_buffer> files;
json data = oaicompat_completion_params_parse(
json data = oaicompat_chat_params_parse(
body,
params.use_jinja,
params.prefill_assistant,
params.reasoning_format,
ctx_server.chat_templates.get(),
ctx_server.mctx,
ctx_server.oai_parser_opt,
files);
handle_completions_impl(
@ -4446,16 +4455,12 @@ int main(int argc, char ** argv) {
};
// same with handle_chat_completions, but without inference part
const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
const auto handle_apply_template = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) {
auto body = json::parse(req.body);
std::vector<raw_buffer> files; // dummy, unused
json data = oaicompat_completion_params_parse(
json data = oaicompat_chat_params_parse(
body,
params.use_jinja,
params.prefill_assistant,
params.reasoning_format,
ctx_server.chat_templates.get(),
ctx_server.mctx,
ctx_server.oai_parser_opt,
files);
res_ok(res, {{ "prompt", std::move(data.at("prompt")) }});
};

View file

@ -30,6 +30,7 @@ def create_server():
("What is this:\n", "malformed", False, None),
("What is this:\n", "https://google.com/404", False, None), # non-existent image
("What is this:\n", "https://ggml.ai", False, None), # non-image data
# TODO @ngxson : test with multiple images, no images and with audio
]
)
def test_vision_chat_completion(prompt, image_url, success, re_content):

View file

@ -536,6 +536,7 @@ static bool server_sent_event(httplib::DataSink & sink, const char * event, cons
// OAI utils
//
// used by /completions endpoint
static json oaicompat_completion_params_parse(const json & body) {
json llama_params;
@ -580,13 +581,19 @@ static json oaicompat_completion_params_parse(const json & body) {
return llama_params;
}
static json oaicompat_completion_params_parse(
struct oaicompat_parser_options {
bool use_jinja;
bool prefill_assistant;
common_reasoning_format reasoning_format;
common_chat_templates * tmpls;
bool allow_image;
bool allow_audio;
};
// used by /chat/completions endpoint
static json oaicompat_chat_params_parse(
const json & body, /* openai api json semantics */
bool use_jinja,
bool prefill_assistant,
common_reasoning_format reasoning_format,
const struct common_chat_templates * tmpls,
bool allow_non_text,
const oaicompat_parser_options & opt,
std::vector<raw_buffer> & out_files)
{
json llama_params;
@ -598,11 +605,11 @@ static json oaicompat_completion_params_parse(
if (stream) {
throw std::runtime_error("Cannot use tools with stream");
}
if (!use_jinja) {
if (!opt.use_jinja) {
throw std::runtime_error("tools param requires --jinja flag");
}
}
if (!use_jinja) {
if (!opt.use_jinja) {
if (body.contains("tool_choice") && !body.at("tool_choice").is_null()) {
throw std::runtime_error("Unsupported param: tool_choice");
}
@ -667,12 +674,12 @@ static json oaicompat_completion_params_parse(
for (auto & p : content) {
std::string type = json_value(p, "type", std::string());
json image_url = json_value(p, "image_url", json::object());
if (type == "image_url") {
if (!allow_non_text) {
throw std::runtime_error("image input is not supported by this server");
if (!opt.allow_image) {
throw std::runtime_error("image input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
}
json image_url = json_value(p, "image_url", json::object());
std::string url = json_value(image_url, "url", std::string());
if (string_starts_with(url, "http")) {
// download remote image
@ -710,8 +717,31 @@ static json oaicompat_completion_params_parse(
// replace this chunk with a marker
p["type"] = "text";
p["text"] = MTMD_DEFAULT_IMAGE_MARKER;
p["text"] = mtmd_default_marker();
p.erase("image_url");
} else if (type == "input_audio") {
if (!opt.allow_audio) {
throw std::runtime_error("audio input is not supported - hint: if this is unexpected, you may need to provide the mmproj");
}
json input_audio = json_value(p, "input_audio", json::object());
std::string data = json_value(input_audio, "data", std::string());
std::string format = json_value(input_audio, "format", std::string());
                        // while we also support flac, we don't allow it here so we match the OAI spec
if (format != "wav" && format != "mp3") {
throw std::runtime_error("input_audio.format must be either 'wav' or 'mp3'");
}
auto decoded_data = base64_decode(data); // expected to be base64 encoded
out_files.push_back(decoded_data);
// replace this chunk with a marker
p["type"] = "text";
p["text"] = mtmd_default_marker();
p.erase("input_audio");
} else if (type != "text") {
throw std::runtime_error("unsupported content[].type");
}
}
}
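For reference, this is roughly the shape of a /chat/completions body the parser above accepts, built with nlohmann::json as the server already does (an editor sketch; the data URL and base64 strings are placeholders, and data-URL image handling lives in a part of the parser not shown in this hunk):

#include <nlohmann/json.hpp>
using json = nlohmann::ordered_json;

// one user message carrying text, an image and an audio clip
static json make_multimodal_body() {
    json text_part  = { {"type", "text"},        {"text", "describe the image and transcribe the audio"} };
    json image_part = { {"type", "image_url"},   {"image_url",   { {"url", "data:image/png;base64,<...>"} }} };
    json audio_part = { {"type", "input_audio"}, {"input_audio", { {"data", "<base64 wav bytes>"}, {"format", "wav"} }} };

    json msg = { {"role", "user"}, {"content", json::array({ text_part, image_part, audio_part })} };
    return json{ {"messages", json::array({ msg })} };
}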
@ -723,9 +753,9 @@ static json oaicompat_completion_params_parse(
inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
inputs.grammar = grammar;
inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
inputs.use_jinja = use_jinja;
inputs.use_jinja = opt.use_jinja;
inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE;
inputs.extract_reasoning = opt.reasoning_format != COMMON_REASONING_FORMAT_NONE;
inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) {
throw std::runtime_error("Cannot use custom grammar constraints with tools.");
@ -733,7 +763,7 @@ static json oaicompat_completion_params_parse(
// if the assistant message appears at the end of list, we do not add end-of-turn token
// for ex. this can be useful to modify the reasoning process in reasoning models
bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && prefill_assistant;
bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant;
common_chat_msg last_message;
if (prefill_assistant_message) {
last_message = inputs.messages.back();
@ -749,7 +779,7 @@ static json oaicompat_completion_params_parse(
}
// Apply chat template to the list of messages
auto chat_params = common_chat_templates_apply(tmpls, inputs);
auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);
/* Append assistant prefilled message */
if (prefill_assistant_message) {
@ -1040,7 +1070,7 @@ struct server_tokens {
private: // disallow accessing these members directly, risking out-of-sync
// map a **start** position in tokens to the image chunk
std::unordered_map<llama_pos, mtmd::input_chunk_ptr> map_pos_to_image;
std::unordered_map<llama_pos, mtmd::input_chunk_ptr> map_pos_to_media;
// list of tokens
// it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token
@ -1051,7 +1081,7 @@ private: // disallow accessing these members directly, risking out-of-sync
// for ex. with input of 5 text tokens and 2 images:
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
// pos 0 1 2 3 4 5 6 7 8 9
// map_pos_to_image will contain: {5, img0}, {8, img1}
// map_pos_to_media will contain: {5, img0}, {8, img1}
public:
server_tokens() = default;
@ -1090,15 +1120,15 @@ public:
}
oss << "\n";
oss << "image pos: ";
for (const auto & it : map_pos_to_image) {
for (const auto & it : map_pos_to_media) {
oss << it.first << ", ";
}
return oss.str();
}
const mtmd::input_chunk_ptr & find_chunk(llama_pos pos) const {
auto it = map_pos_to_image.find(pos);
if (it != map_pos_to_image.end()) {
auto it = map_pos_to_media.find(pos);
if (it != map_pos_to_media.end()) {
return it->second;
} else {
throw std::runtime_error("Chunk not found");
@ -1115,16 +1145,15 @@ public:
// will create a copy of the chunk if it contains non-text data
void push_back(const mtmd_input_chunk * chunk) {
auto type = mtmd_input_chunk_get_type(chunk);
if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE || type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
GGML_ASSERT(has_mtmd);
auto img_tokens = mtmd_input_chunk_get_tokens_image(chunk);
const int n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
const int n_pos = mtmd_input_chunk_get_n_pos(chunk);
llama_pos start_pos = tokens.size();
for (int i = 0; i < n_pos; ++i) {
tokens.emplace_back(LLAMA_TOKEN_NULL);
}
mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
map_pos_to_image[start_pos] = std::move(new_chunk);
map_pos_to_media[start_pos] = std::move(new_chunk);
} else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
size_t n_tokens;
auto text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
@ -1169,6 +1198,9 @@ public:
void keep_first(size_t n) {
GGML_ASSERT(n <= tokens.size());
if (has_mtmd) {
if (n == tokens.size()) {
return; // nothing to do
}
// we throw an error if we try to remove a token in the middle of an image
// for ex. with input of 5 text tokens and 2 images:
// [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
@ -1183,10 +1215,10 @@ public:
}
}
// remove all image chunks that are not used anymore
for (auto it = map_pos_to_image.begin(); it != map_pos_to_image.end(); ) {
for (auto it = map_pos_to_media.begin(); it != map_pos_to_media.end(); ) {
llama_pos pos = it->first;
if (pos >= (llama_pos)n) {
it = map_pos_to_image.erase(it);
it = map_pos_to_media.erase(it);
} else {
++it;
}
@ -1217,14 +1249,12 @@ public:
const auto & a_chunk = find_chunk(i);
const auto & b_chunk = b.find_chunk(i);
GGML_ASSERT(a_chunk && b_chunk);
const auto * a_img = mtmd_input_chunk_get_tokens_image(a_chunk.get());
const auto * b_img = mtmd_input_chunk_get_tokens_image(b_chunk.get());
std::string ai_id = mtmd_image_tokens_get_id(a_img);
std::string bi_id = mtmd_image_tokens_get_id(b_img);
size_t a_pos = mtmd_image_tokens_get_n_pos(a_img);
size_t b_pos = mtmd_image_tokens_get_n_pos(b_img);
std::string ai_id = mtmd_input_chunk_get_id(a_chunk.get());
std::string bi_id = mtmd_input_chunk_get_id(b_chunk.get());
size_t a_pos = mtmd_input_chunk_get_n_pos(a_chunk.get());
size_t b_pos = mtmd_input_chunk_get_n_pos(b_chunk.get());
if (ai_id == bi_id && a_pos == b_pos) {
GGML_ASSERT(a_pos > 0 && "Invalid image token"); // should never happen
GGML_ASSERT(a_pos > 0 && "Invalid media chunk"); // should never happen
i += a_pos - 1; // will be +1 by the for loop
continue;
} else {
@ -1250,8 +1280,7 @@ public:
if (t == LLAMA_TOKEN_NULL) {
try {
const auto & chunk = find_chunk(i);
const auto * img_tokens = mtmd_input_chunk_get_tokens_image(chunk.get());
size_t n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
size_t n_pos = mtmd_input_chunk_get_n_pos(chunk.get());
i += n_pos - 1; // will be +1 by the for loop
} catch (const std::exception & e) {
return false;
@ -1270,22 +1299,21 @@ public:
llama_pos n_past,
int32_t seq_id,
llama_pos & n_pos_out) {
auto it = map_pos_to_image.find(n_past);
if (it == map_pos_to_image.end()) {
throw std::runtime_error("Chunk not found");
}
SRV_INF("%s\n", "processing image...");
auto & chunk = find_chunk(n_past);
const char * name = mtmd_input_chunk_get_type(chunk.get()) == MTMD_INPUT_CHUNK_TYPE_IMAGE
? "image" : "audio";
SRV_INF("processing %s...\n", name);
int32_t n_batch = llama_n_batch(ctx);
int64_t t0 = ggml_time_ms();
llama_pos new_n_past = n_past;
int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
it->second.get(), // chunk
chunk.get(),
n_past,
seq_id,
n_batch,
true, // logits last
&new_n_past);
SRV_INF("image processed in %" PRId64 " ms\n", ggml_time_ms() - t0);
SRV_INF("%s processed in %" PRId64 " ms\n", name, ggml_time_ms() - t0);
if (result != 0) {
LOG_ERR("mtmd_helper_eval failed with status %d", result);
n_pos_out = n_past;

View file

@ -1,4 +1,8 @@
import { DocumentTextIcon, XMarkIcon } from '@heroicons/react/24/outline';
import {
DocumentTextIcon,
SpeakerWaveIcon,
XMarkIcon,
} from '@heroicons/react/24/outline';
import { MessageExtra } from '../utils/types';
import { useState } from 'react';
import { classNames } from '../utils/misc';
@ -66,7 +70,11 @@ export default function ChatInputExtraContextItem({
className="w-14 h-14 flex items-center justify-center"
aria-description="Document icon"
>
<DocumentTextIcon className="h-8 w-14 text-base-content/50" />
{item.type === 'audioFile' ? (
<SpeakerWaveIcon className="h-8 w-8 text-gray-500" />
) : (
<DocumentTextIcon className="h-8 w-8 text-gray-500" />
)}
</div>
<div className="text-xs pr-4">
@ -98,6 +106,19 @@ export default function ChatInputExtraContextItem({
src={showingItem.base64Url}
alt={`Preview image for ${showingItem.name}`}
/>
) : showingItem.type === 'audioFile' ? (
<audio
controls
className="w-full"
aria-description={`Audio file ${showingItem.name}`}
>
<source
src={`data:${showingItem.mimeType};base64,${showingItem.base64Data}`}
type={showingItem.mimeType}
aria-description={`Audio file ${showingItem.name}`}
/>
Your browser does not support the audio element.
</audio>
) : (
<div className="overflow-x-auto">
<pre className="whitespace-pre-wrap break-words text-sm">

View file

@ -278,6 +278,13 @@ export default function ChatScreen() {
function ServerInfo() {
const { serverProps } = useAppContext();
const modalities = [];
if (serverProps?.modalities?.audio) {
modalities.push('audio');
}
if (serverProps?.modalities?.vision) {
modalities.push('vision');
}
return (
<div
className="card card-sm shadow-sm border-1 border-base-content/20 text-base-content/70 mb-6"
@ -291,6 +298,13 @@ function ServerInfo() {
<br />
<b>Build</b>: {serverProps?.build_info}
<br />
{modalities.length > 0 ? (
<>
<b>Supported modalities:</b> {modalities.join(', ')}
</>
) : (
''
)}
</p>
</div>
</div>

View file

@ -11,6 +11,7 @@ pdfjs.GlobalWorkerOptions.workerSrc = pdfjsWorkerSrc;
// This file handles uploading extra context items (a.k.a files)
// It allows processing these kinds of files:
// - image files (converted to base64)
// - audio files (converted to base64)
// - text files (including code files)
// - pdf (converted to text)
@ -41,96 +42,73 @@ export function useChatExtraContext(): ChatExtraContextApi {
const isSupportVision = serverProps?.modalities?.vision;
const onFileAdded = (files: File[]) => {
for (const file of files) {
const mimeType = file.type;
console.debug({ mimeType, file });
if (file.size > 10 * 1024 * 1024) {
toast.error('File is too large. Maximum size is 10MB.');
break;
}
if (mimeType.startsWith('image/')) {
if (!isSupportVision) {
toast.error('Multimodal is not supported by this server or model.');
const onFileAdded = async (files: File[]) => {
try {
for (const file of files) {
const mimeType = file.type;
if (file.size > 10 * 1024 * 1024) {
toast.error('File is too large. Maximum size is 10MB.');
break;
}
const reader = new FileReader();
reader.onload = async (event) => {
if (event.target?.result) {
let base64Url = event.target.result as string;
if (mimeType === 'image/svg+xml') {
// Convert SVG to PNG
base64Url = await svgBase64UrlToPngDataURL(base64Url);
}
if (mimeType.startsWith('image/')) {
if (!isSupportVision) {
toast.error('Multimodal is not supported by this server or model.');
break;
}
addItems([
{
let base64Url = await getFileAsBase64(file);
if (mimeType === 'image/svg+xml') {
// Convert SVG to PNG
base64Url = await svgBase64UrlToPngDataURL(base64Url);
}
addItems([
{
type: 'imageFile',
name: file.name,
base64Url,
},
]);
} else if (mimeType.startsWith('video/')) {
toast.error('Video files are not supported yet.');
break;
} else if (mimeType.startsWith('audio/')) {
if (!/mpeg|wav/.test(mimeType)) {
toast.error('Only mp3 and wav audio files are supported.');
break;
}
// plain base64, not a data URL
const base64Data = await getFileAsBase64(file, false);
addItems([
{
type: 'audioFile',
name: file.name,
mimeType,
base64Data,
},
]);
} else if (mimeType.startsWith('application/pdf')) {
if (config.pdfAsImage && !isSupportVision) {
toast(
'Multimodal is not supported, PDF will be converted to text instead of image.'
);
break;
}
if (config.pdfAsImage && isSupportVision) {
// Convert PDF to images
const base64Urls = await convertPDFToImage(file);
addItems(
base64Urls.map((base64Url) => ({
type: 'imageFile',
name: file.name,
base64Url,
},
]);
}
};
reader.readAsDataURL(file);
} else if (
mimeType.startsWith('video/') ||
mimeType.startsWith('audio/')
) {
toast.error('Video and audio files are not supported yet.');
break;
} else if (mimeType.startsWith('application/pdf')) {
if (config.pdfAsImage && !isSupportVision) {
toast(
'Multimodal is not supported, PDF will be converted to text instead of image.'
);
break;
}
const promise =
config.pdfAsImage && isSupportVision
? convertPDFToImage(file).then((base64Urls) => {
addItems(
base64Urls.map((base64Url) => ({
type: 'imageFile',
name: file.name,
base64Url,
}))
);
})
: convertPDFToText(file).then((content) => {
if (isSupportVision) {
toast.success(
'PDF file converted to text. You can also convert it to image, see in Settings.'
);
}
addItems([
{
type: 'textFile',
name: file.name,
content,
},
]);
});
promise.catch((error) => {
console.error(error);
toast.error('Failed to parse PDF file.');
});
break;
} else {
// Because there can be many text file types (like code file), we will not check the mime type
// and will just check if the file is not binary.
const reader = new FileReader();
reader.onload = (event) => {
if (event.target?.result) {
const content = event.target.result as string;
if (!isLikelyNotBinary(content)) {
toast.error('File is binary. Please upload a text file.');
return;
}
}))
);
} else {
// Convert PDF to text
const content = await convertPDFToText(file);
addItems([
{
type: 'textFile',
@ -138,10 +116,40 @@ export function useChatExtraContext(): ChatExtraContextApi {
content,
},
]);
if (isSupportVision) {
toast.success(
'PDF file converted to text. You can also convert it to image, see in Settings.'
);
}
}
};
reader.readAsText(file);
break;
} else {
// Because there can be many text file types (like code file), we will not check the mime type
// and will just check if the file is not binary.
const reader = new FileReader();
reader.onload = (event) => {
if (event.target?.result) {
const content = event.target.result as string;
if (!isLikelyNotBinary(content)) {
toast.error('File is binary. Please upload a text file.');
return;
}
addItems([
{
type: 'textFile',
name: file.name,
content,
},
]);
}
};
reader.readAsText(file);
}
}
} catch (error) {
const message = error instanceof Error ? error.message : String(error);
const errorMessage = `Error processing file: ${message}`;
toast.error(errorMessage);
}
};
@ -154,6 +162,25 @@ export function useChatExtraContext(): ChatExtraContextApi {
};
}
async function getFileAsBase64(file: File, outputUrl = true): Promise<string> {
return new Promise((resolve, reject) => {
const reader = new FileReader();
reader.onload = (event) => {
if (event.target?.result) {
let result = event.target.result as string;
if (!outputUrl) {
// remove base64 url prefix and correct characters
result = result.substring(result.indexOf(',') + 1);
}
resolve(result);
} else {
reject(new Error('Failed to read file.'));
}
};
reader.readAsDataURL(file);
});
}
async function getFileAsBuffer(file: File): Promise<ArrayBuffer> {
return new Promise((resolve, reject) => {
const reader = new FileReader();

View file

@ -89,6 +89,14 @@ export function normalizeMsgsForAPI(messages: Readonly<Message[]>) {
type: 'image_url',
image_url: { url: extra.base64Url },
});
} else if (extra.type === 'audioFile') {
contentArr.push({
type: 'input_audio',
input_audio: {
data: extra.base64Data,
format: /wav/.test(extra.mimeType) ? 'wav' : 'mp3',
},
});
} else {
throw new Error('Unknown extra type');
}

View file

@ -51,6 +51,7 @@ export interface Message {
export type MessageExtra =
| MessageExtraTextFile
| MessageExtraImageFile
| MessageExtraAudioFile
| MessageExtraContext;
export interface MessageExtraTextFile {
@ -65,6 +66,13 @@ export interface MessageExtraImageFile {
base64Url: string;
}
export interface MessageExtraAudioFile {
type: 'audioFile';
name: string;
base64Data: string;
mimeType: string;
}
export interface MessageExtraContext {
type: 'context';
name: string;
@ -79,6 +87,10 @@ export type APIMessageContentPart =
| {
type: 'image_url';
image_url: { url: string };
}
| {
type: 'input_audio';
input_audio: { data: string; format: 'wav' | 'mp3' };
};
export type APIMessage = {
@ -120,6 +132,7 @@ export interface LlamaCppServerProps {
n_ctx: number;
modalities?: {
vision: boolean;
audio: boolean;
};
// TODO: support params
}

View file

@ -579,6 +579,8 @@ int main(int argc, char ** argv) {
params.model = params.vocoder.model;
params.embedding = true;
params.ctx_shift = false; // silence warning
params.n_ubatch = params.n_batch;
common_init_result llama_init_cts = common_init_from_params(params);
@ -1020,8 +1022,8 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
}
GGML_ASSERT(batch.n_tokens == n_codes);
if (llama_decode(ctx_cts, batch) != 0) {
LOG_ERR("%s: llama_decode() failed\n", __func__);
if (llama_encode(ctx_cts, batch) != 0) {
LOG_ERR("%s: llama_encode() failed\n", __func__);
return 1;
}