diff --git a/Makefile b/Makefile
index 40e2a67c4..8b1982823 100644
--- a/Makefile
+++ b/Makefile
@@ -235,7 +235,7 @@ else
HCC := $(ROCM_PATH)/llvm/bin/clang
HCXX := $(ROCM_PATH)/llvm/bin/clang++
endif
- HIPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA -DSD_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
+ HIPFLAGS += -DGGML_USE_HIP -DGGML_HIP_NO_VMM -DGGML_USE_CUDA -DSD_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
HIPLDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
HIPLDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
HIPLDFLAGS += -lhipblas -lamdhip64 -lrocblas
diff --git a/colab.ipynb b/colab.ipynb
index 95d420885..4960bad68 100644
--- a/colab.ipynb
+++ b/colab.ipynb
@@ -48,7 +48,7 @@
"source": [
"#@title v-- Enter your model below and then click this to start Koboldcpp\n",
"\n",
- "Model = \"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\" #@param [\"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\",\"https://huggingface.co/KoboldAI/LLaMA2-13B-Estopia-GGUF/resolve/main/LLaMA2-13B-Estopia.Q4_K_S.gguf\",\"https://huggingface.co/mradermacher/Fimbulvetr-11B-v2-GGUF/resolve/main/Fimbulvetr-11B-v2.Q4_K_S.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-13B-GGUF/resolve/main/mythomax-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/ReMM-SLERP-L2-13B-GGUF/resolve/main/remm-slerp-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Xwin-LM-13B-v0.2-GGUF/resolve/main/xwin-lm-13b-v0.2.Q4_K_M.gguf\",\"https://huggingface.co/mradermacher/mini-magnum-12b-v1.1-GGUF/resolve/main/mini-magnum-12b-v1.1.Q4_K_S.gguf\",\"https://huggingface.co/TheBloke/Stheno-L2-13B-GGUF/resolve/main/stheno-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-Kimiko-v2-13B-GGUF/resolve/main/mythomax-l2-kimiko-v2-13b.Q4_K_M.gguf\",\"https://huggingface.co/bartowski/Rocinante-12B-v1.1-GGUF/resolve/main/Rocinante-12B-v1.1-Q4_K_S.gguf\",\"https://huggingface.co/KoboldAI/Llama-3.1-8B-BookAdventures-GGUF/resolve/main/Llama-3.1-8B-BookAdventures.Q4_K_S.gguf\",\"https://huggingface.co/TheBloke/MistRP-Airoboros-7B-GGUF/resolve/main/mistrp-airoboros-7b.Q4_K_S.gguf\",\"https://huggingface.co/TheBloke/airoboros-mistral2.2-7B-GGUF/resolve/main/airoboros-mistral2.2-7b.Q4_K_S.gguf\",\"https://huggingface.co/concedo/KobbleTinyV2-1.1B-GGUF/resolve/main/KobbleTiny-Q4_K.gguf\",\"https://huggingface.co/grimjim/kukulemon-7B-GGUF/resolve/main/kukulemon-7B.Q8_0.gguf\",\"https://huggingface.co/mradermacher/LemonKunoichiWizardV3-GGUF/resolve/main/LemonKunoichiWizardV3.Q4_K_M.gguf\",\"https://huggingface.co/Lewdiculous/Kunoichi-DPO-v2-7B-GGUF-Imatrix/resolve/main/Kunoichi-DPO-v2-7B-Q4_K_M-imatrix.gguf\",\"https://huggingface.co/mradermacher/L3-8B-Stheno-v3.2-i1-GGUF/resolve/main/L3-8B-Stheno-v3.2.i1-Q4_K_M.gguf\",\"https://huggingface.co/Lewdiculous/Llama-3-Lumimaid-8B-v0.1-OAS-GGUF-IQ-Imatrix/resolve/main/v2-Llama-3-Lumimaid-8B-v0.1-OAS-Q4_K_M-imat.gguf\",\"https://huggingface.co/bartowski/NeuralDaredevil-8B-abliterated-GGUF/resolve/main/NeuralDaredevil-8B-abliterated-Q4_K_M.gguf\",\"https://huggingface.co/bartowski/L3-8B-Lunaris-v1-GGUF/resolve/main/L3-8B-Lunaris-v1-Q4_K_M.gguf\",\"https://huggingface.co/mradermacher/L3-Umbral-Mind-RP-v2.0-8B-GGUF/resolve/main/L3-Umbral-Mind-RP-v2.0-8B.Q4_K_M.gguf\",\"https://huggingface.co/bartowski/TheDrummer_Cydonia-24B-v2-GGUF/resolve/main/TheDrummer_Cydonia-24B-v2-Q4_K_S.gguf\"]{allow-input: true}\n",
+ "Model = \"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\" #@param [\"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\",\"https://huggingface.co/KoboldAI/LLaMA2-13B-Estopia-GGUF/resolve/main/LLaMA2-13B-Estopia.Q4_K_S.gguf\",\"https://huggingface.co/mradermacher/Fimbulvetr-11B-v2-GGUF/resolve/main/Fimbulvetr-11B-v2.Q4_K_S.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-13B-GGUF/resolve/main/mythomax-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/ReMM-SLERP-L2-13B-GGUF/resolve/main/remm-slerp-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Xwin-LM-13B-v0.2-GGUF/resolve/main/xwin-lm-13b-v0.2.Q4_K_M.gguf\",\"https://huggingface.co/mradermacher/mini-magnum-12b-v1.1-GGUF/resolve/main/mini-magnum-12b-v1.1.Q4_K_S.gguf\",\"https://huggingface.co/TheBloke/Stheno-L2-13B-GGUF/resolve/main/stheno-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-Kimiko-v2-13B-GGUF/resolve/main/mythomax-l2-kimiko-v2-13b.Q4_K_M.gguf\",\"https://huggingface.co/bartowski/Rocinante-12B-v1.1-GGUF/resolve/main/Rocinante-12B-v1.1-Q4_K_S.gguf\",\"https://huggingface.co/KoboldAI/Llama-3.1-8B-BookAdventures-GGUF/resolve/main/Llama-3.1-8B-BookAdventures.Q4_K_S.gguf\",\"https://huggingface.co/TheBloke/MistRP-Airoboros-7B-GGUF/resolve/main/mistrp-airoboros-7b.Q4_K_S.gguf\",\"https://huggingface.co/TheBloke/airoboros-mistral2.2-7B-GGUF/resolve/main/airoboros-mistral2.2-7b.Q4_K_S.gguf\",\"https://huggingface.co/concedo/KobbleTinyV2-1.1B-GGUF/resolve/main/KobbleTiny-Q4_K.gguf\",\"https://huggingface.co/grimjim/kukulemon-7B-GGUF/resolve/main/kukulemon-7B.Q8_0.gguf\",\"https://huggingface.co/mradermacher/LemonKunoichiWizardV3-GGUF/resolve/main/LemonKunoichiWizardV3.Q4_K_M.gguf\",\"https://huggingface.co/Lewdiculous/Kunoichi-DPO-v2-7B-GGUF-Imatrix/resolve/main/Kunoichi-DPO-v2-7B-Q4_K_M-imatrix.gguf\",\"https://huggingface.co/mradermacher/L3-8B-Stheno-v3.2-i1-GGUF/resolve/main/L3-8B-Stheno-v3.2.i1-Q4_K_M.gguf\",\"https://huggingface.co/Lewdiculous/Llama-3-Lumimaid-8B-v0.1-OAS-GGUF-IQ-Imatrix/resolve/main/v2-Llama-3-Lumimaid-8B-v0.1-OAS-Q4_K_M-imat.gguf\",\"https://huggingface.co/bartowski/NeuralDaredevil-8B-abliterated-GGUF/resolve/main/NeuralDaredevil-8B-abliterated-Q4_K_M.gguf\",\"https://huggingface.co/bartowski/L3-8B-Lunaris-v1-GGUF/resolve/main/L3-8B-Lunaris-v1-Q4_K_M.gguf\",\"https://huggingface.co/mradermacher/L3-Umbral-Mind-RP-v2.0-8B-GGUF/resolve/main/L3-Umbral-Mind-RP-v2.0-8B.Q4_K_M.gguf\",\"https://huggingface.co/bartowski/TheDrummer_Cydonia-24B-v2-GGUF/resolve/main/TheDrummer_Cydonia-24B-v2-Q4_K_S.gguf\",\"https://huggingface.co/bartowski/PocketDoc_Dans-PersonalityEngine-V1.2.0-24b-GGUF/resolve/main/PocketDoc_Dans-PersonalityEngine-V1.2.0-24b-IQ4_XS.gguf\"]{allow-input: true}\n",
"Layers = 99 #@param [99]{allow-input: true}\n",
"ContextSize = 4096 #@param [4096,8192] {allow-input: true}\n",
"FlashAttention = True #@param {type:\"boolean\"}\n",
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 8b7c75d85..6358a94e9 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -699,6 +699,9 @@ class Model:
if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
# ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
res = "deepseek-r1-qwen"
+ if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e":
+ # ref: https://huggingface.co/Xenova/gpt-4o
+ res = "gpt-4o"
if res is None:
logger.warning("\n")
@@ -2512,7 +2515,8 @@ class Phi3MiniModel(Model):
rms_eps = self.find_hparam(["rms_norm_eps"])
max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
- rope_dims = n_embd // n_head
+ rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
+ rope_dims = int(rot_pct * n_embd) // n_head
self.gguf_writer.add_context_length(max_pos_embds)
self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
@@ -2536,7 +2540,8 @@ class Phi3MiniModel(Model):
n_head = self.find_hparam(["num_attention_heads", "n_head"])
max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
- rope_dims = n_embd // n_head
+ rot_pct = self.hparams.get("partial_rotary_factor", 1.0)
+ rope_dims = int(rot_pct * n_embd) // n_head
# write rope scaling for long context (128k) model
rope_scaling = self.find_hparam(['rope_scaling'], True)
@@ -2565,7 +2570,7 @@ class Phi3MiniModel(Model):
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
- raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+ raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}. long_factors = {len(long_factors)}, short_factors = {len(short_factors)}.')
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32))
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32))
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index fa4989a80..07d3ce0e4 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -109,6 +109,7 @@ models = [
{"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
{"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
{"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
+ {"name": "gpt-4o", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", },
]
@@ -131,6 +132,10 @@ def download_model(model):
files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
+ if name == "gpt-4o":
+ # Xenova/gpt-4o is tokenizer-only, it does not contain config.json
+ files = ["tokenizer.json", "tokenizer_config.json"]
+
if tokt == TOKENIZER_TYPE.SPM:
files.append("tokenizer.model")
diff --git a/docs/function-calling.md b/docs/function-calling.md
new file mode 100644
index 000000000..92cb6531a
--- /dev/null
+++ b/docs/function-calling.md
@@ -0,0 +1,390 @@
+# Function Calling
+
+[chat.h](../common/chat.h) (https://github.com/ggml-org/llama.cpp/pull/9639) adds support for [OpenAI-style function calling](https://platform.openai.com/docs/guides/function-calling) and is used in:
+- `llama-server` when started w/ `--jinja` flag
+- `llama-cli` (WIP: https://github.com/ggml-org/llama.cpp/pull/11556)
+
+## Universal support w/ Native & Generic handlers
+
+Function calling is supported for all models (see https://github.com/ggml-org/llama.cpp/pull/9639):
+
+- Native tool call formats supported:
+ - Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2
+ - Functionary v3.1 / v3.2
+ - Hermes 2/3, Qwen 2.5
+ - Qwen 2.5 Coder (WIP: https://github.com/ggml-org/llama.cpp/pull/12034)
+ - Mistral Nemo
+ - Firefunction v2
+ - Command R7B
+ - DeepSeek R1 (WIP / seems reluctant to call any tools?)
+
+- Generic tool call is supported when the template isn't recognized by native format handlers (you'll see `Chat format: Generic` in the logs).
+ - Use `--chat-template-file` to override the template when appropriate (see examples below)
+ - Generic support may consume more tokens and be less efficient than a model's native format.
+
+
+The table below lists some common templates and the format handler each one maps to:
+
+| Template | Format |
+|----------|--------|
+| Almawave-Velvet-14B.jinja | Hermes 2 Pro |
+| AtlaAI-Selene-1-Mini-Llama-3.1-8B.jinja | Llama 3.x |
+| CohereForAI-aya-expanse-8b.jinja | Generic |
+| CohereForAI-c4ai-command-r-plus-default.jinja | Generic |
+| CohereForAI-c4ai-command-r-plus-rag.jinja | Generic |
+| CohereForAI-c4ai-command-r-plus-tool_use.jinja | Generic |
+| CohereForAI-c4ai-command-r7b-12-2024-default.jinja | Command R7B (extract reasoning) |
+| CohereForAI-c4ai-command-r7b-12-2024-rag.jinja | Command R7B (extract reasoning) |
+| CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja | Command R7B (extract reasoning) |
+| CohereForAI-c4ai-command-r7b-12-2024.jinja | Generic |
+| DavieLion-Llama-3.2-1B-SPIN-iter3.jinja | Generic |
+| Delta-Vector-Rei-12B.jinja | Mistral Nemo |
+| EpistemeAI-Mistral-Nemo-Instruct-12B-Philosophy-Math.jinja | Mistral Nemo |
+| FlofloB-83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit.jinja | Hermes 2 Pro |
+| FlofloB-test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit.jinja | Generic |
+| HelpingAI-HAI-SER.jinja | Generic |
+| HuggingFaceTB-SmolLM2-1.7B-Instruct.jinja | Generic |
+| HuggingFaceTB-SmolLM2-135M-Instruct.jinja | Generic |
+| HuggingFaceTB-SmolLM2-360M-Instruct.jinja | Generic |
+| INSAIT-Institute-BgGPT-Gemma-2-27B-IT-v1.0.jinja | Generic |
+| Ihor-Text2Graph-R1-Qwen2.5-0.5b.jinja | Hermes 2 Pro |
+| Infinigence-Megrez-3B-Instruct.jinja | Generic |
+| Josephgflowers-TinyLlama_v1.1_math_code-world-test-1.jinja | Generic |
+| LGAI-EXAONE-EXAONE-3.5-2.4B-Instruct.jinja | Generic |
+| LGAI-EXAONE-EXAONE-3.5-7.8B-Instruct.jinja | Generic |
+| LatitudeGames-Wayfarer-12B.jinja | Generic |
+| Magpie-Align-Llama-3-8B-Magpie-Align-v0.1.jinja | Generic |
+| Magpie-Align-Llama-3.1-8B-Magpie-Align-v0.1.jinja | Generic |
+| MaziyarPanahi-calme-3.2-instruct-78b.jinja | Generic |
+| MiniMaxAI-MiniMax-Text-01.jinja | Generic |
+| MiniMaxAI-MiniMax-VL-01.jinja | Generic |
+| NaniDAO-deepseek-r1-qwen-2.5-32B-ablated.jinja | DeepSeek R1 (extract reasoning) |
+| NexaAIDev-Octopus-v2.jinja | Generic |
+| NousResearch-Hermes-2-Pro-Llama-3-8B-default.jinja | Generic |
+| NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja | Hermes 2 Pro |
+| NousResearch-Hermes-2-Pro-Mistral-7B-default.jinja | Generic |
+| NousResearch-Hermes-2-Pro-Mistral-7B-tool_use.jinja | Hermes 2 Pro |
+| NousResearch-Hermes-3-Llama-3.1-70B-default.jinja | Generic |
+| NousResearch-Hermes-3-Llama-3.1-70B-tool_use.jinja | Hermes 2 Pro |
+| NovaSky-AI-Sky-T1-32B-Flash.jinja | Hermes 2 Pro |
+| NovaSky-AI-Sky-T1-32B-Preview.jinja | Hermes 2 Pro |
+| OnlyCheeini-greesychat-turbo.jinja | Generic |
+| Orenguteng-Llama-3.1-8B-Lexi-Uncensored-V2.jinja | Llama 3.x |
+| OrionStarAI-Orion-14B-Chat.jinja | Generic |
+| PowerInfer-SmallThinker-3B-Preview.jinja | Generic |
+| PrimeIntellect-INTELLECT-1-Instruct.jinja | Generic |
+| Qwen-QVQ-72B-Preview.jinja | Generic |
+| Qwen-QwQ-32B-Preview.jinja | Hermes 2 Pro |
+| Qwen-Qwen1.5-7B-Chat.jinja | Generic |
+| Qwen-Qwen2-7B-Instruct.jinja | Generic |
+| Qwen-Qwen2-VL-72B-Instruct.jinja | Generic |
+| Qwen-Qwen2-VL-7B-Instruct.jinja | Generic |
+| Qwen-Qwen2.5-0.5B.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-1.5B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-14B-Instruct-1M.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-14B.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-32B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-32B.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-3B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-72B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-7B-Instruct-1M.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-7B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-7B.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-Coder-32B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-Coder-7B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-Math-1.5B.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-Math-7B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-VL-3B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-VL-72B-Instruct.jinja | Hermes 2 Pro |
+| Qwen-Qwen2.5-VL-7B-Instruct.jinja | Hermes 2 Pro |
+| RWKV-Red-Team-ARWKV-7B-Preview-0.1.jinja | Hermes 2 Pro |
+| SakanaAI-TinySwallow-1.5B-Instruct.jinja | Hermes 2 Pro |
+| SakanaAI-TinySwallow-1.5B.jinja | Hermes 2 Pro |
+| Sao10K-70B-L3.3-Cirrus-x1.jinja | Llama 3.x |
+| SentientAGI-Dobby-Mini-Leashed-Llama-3.1-8B.jinja | Llama 3.x |
+| SentientAGI-Dobby-Mini-Unhinged-Llama-3.1-8B.jinja | Llama 3.x |
+| Steelskull-L3.3-Damascus-R1.jinja | Llama 3.x |
+| Steelskull-L3.3-MS-Nevoria-70b.jinja | Llama 3.x |
+| Steelskull-L3.3-Nevoria-R1-70b.jinja | Llama 3.x |
+| THUDM-glm-4-9b-chat.jinja | Generic |
+| THUDM-glm-edge-1.5b-chat.jinja | Generic |
+| Tarek07-Progenitor-V1.1-LLaMa-70B.jinja | Llama 3.x |
+| TheBloke-FusionNet_34Bx2_MoE-AWQ.jinja | Generic |
+| TinyLlama-TinyLlama-1.1B-Chat-v1.0.jinja | Generic |
+| UCLA-AGI-Mistral7B-PairRM-SPPO-Iter3.jinja | Generic |
+| ValiantLabs-Llama3.1-8B-Enigma.jinja | Llama 3.x |
+| abacusai-Fewshot-Metamath-OrcaVicuna-Mistral.jinja | Generic |
+| ai21labs-AI21-Jamba-1.5-Large.jinja | Generic |
+| allenai-Llama-3.1-Tulu-3-405B-SFT.jinja | Generic |
+| allenai-Llama-3.1-Tulu-3-405B.jinja | Generic |
+| allenai-Llama-3.1-Tulu-3-8B.jinja | Generic |
+| arcee-ai-Virtuoso-Lite.jinja | Hermes 2 Pro |
+| arcee-ai-Virtuoso-Medium-v2.jinja | Hermes 2 Pro |
+| arcee-ai-Virtuoso-Small-v2.jinja | Hermes 2 Pro |
+| avemio-GRAG-NEMO-12B-ORPO-HESSIAN-AI.jinja | Generic |
+| bespokelabs-Bespoke-Stratos-7B.jinja | Hermes 2 Pro |
+| bfuzzy1-acheron-m1a-llama.jinja | Generic |
+| bofenghuang-vigogne-2-70b-chat.jinja | Generic |
+| bytedance-research-UI-TARS-72B-DPO.jinja | Generic |
+| bytedance-research-UI-TARS-7B-DPO.jinja | Generic |
+| bytedance-research-UI-TARS-7B-SFT.jinja | Generic |
+| carsenk-phi3.5_mini_exp_825_uncensored.jinja | Generic |
+| cyberagent-DeepSeek-R1-Distill-Qwen-14B-Japanese.jinja | DeepSeek R1 (extract reasoning) |
+| cyberagent-DeepSeek-R1-Distill-Qwen-32B-Japanese.jinja | DeepSeek R1 (extract reasoning) |
+| databricks-dbrx-instruct.jinja | Generic |
+| deepseek-ai-DeepSeek-Coder-V2-Instruct.jinja | Generic |
+| deepseek-ai-DeepSeek-Coder-V2-Lite-Base.jinja | Generic |
+| deepseek-ai-DeepSeek-Coder-V2-Lite-Instruct.jinja | Generic |
+| deepseek-ai-DeepSeek-R1-Distill-Llama-70B.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-R1-Distill-Qwen-14B.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-R1-Distill-Qwen-7B.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-R1-Zero.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-R1.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-V2-Lite.jinja | Generic |
+| deepseek-ai-DeepSeek-V2.5.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-DeepSeek-V3.jinja | DeepSeek R1 (extract reasoning) |
+| deepseek-ai-deepseek-coder-33b-instruct.jinja | Generic |
+| deepseek-ai-deepseek-coder-6.7b-instruct.jinja | Generic |
+| deepseek-ai-deepseek-coder-7b-instruct-v1.5.jinja | Generic |
+| deepseek-ai-deepseek-llm-67b-chat.jinja | Generic |
+| deepseek-ai-deepseek-llm-7b-chat.jinja | Generic |
+| dicta-il-dictalm2.0-instruct.jinja | Generic |
+| ehristoforu-Falcon3-8B-Franken-Basestruct.jinja | Hermes 2 Pro |
+| fireworks-ai-llama-3-firefunction-v2.jinja | FireFunction v2 |
+| godlikehhd-alpaca_data_sampled_ifd_new_5200.jinja | Hermes 2 Pro |
+| godlikehhd-alpaca_data_score_max_0.7_2600.jinja | Hermes 2 Pro |
+| google-gemma-2-27b-it.jinja | Generic |
+| google-gemma-2-2b-it.jinja | Generic |
+| google-gemma-2-2b-jpn-it.jinja | Generic |
+| google-gemma-7b-it.jinja | Generic |
+| huihui-ai-DeepSeek-R1-Distill-Llama-70B-abliterated.jinja | DeepSeek R1 (extract reasoning) |
+| huihui-ai-DeepSeek-R1-Distill-Llama-8B-abliterated.jinja | DeepSeek R1 (extract reasoning) |
+| huihui-ai-DeepSeek-R1-Distill-Qwen-14B-abliterated-v2.jinja | DeepSeek R1 (extract reasoning) |
+| huihui-ai-DeepSeek-R1-Distill-Qwen-32B-abliterated.jinja | DeepSeek R1 (extract reasoning) |
+| huihui-ai-DeepSeek-R1-Distill-Qwen-7B-abliterated-v2.jinja | DeepSeek R1 (extract reasoning) |
+| huihui-ai-Qwen2.5-14B-Instruct-1M-abliterated.jinja | Hermes 2 Pro |
+| ibm-granite-granite-3.1-8b-instruct.jinja | Generic |
+| indischepartij-MiniCPM-3B-OpenHermes-2.5-v2.jinja | Generic |
+| inflatebot-MN-12B-Mag-Mell-R1.jinja | Generic |
+| jinaai-ReaderLM-v2.jinja | Generic |
+| kms7530-chemeng_qwen-math-7b_24_1_100_1_nonmath.jinja | Hermes 2 Pro |
+| knifeayumu-Cydonia-v1.3-Magnum-v4-22B.jinja | Mistral Nemo |
+| langgptai-qwen1.5-7b-chat-sa-v0.1.jinja | Generic |
+| lightblue-DeepSeek-R1-Distill-Qwen-7B-Japanese.jinja | DeepSeek R1 (extract reasoning) |
+| mattshumer-Reflection-Llama-3.1-70B.jinja | Generic |
+| meetkai-functionary-medium-v3.1.jinja | Functionary v3.1 Llama 3.1 |
+| meetkai-functionary-medium-v3.2.jinja | Functionary v3.2 |
+| meta-llama-Llama-2-7b-chat-hf.jinja | Generic |
+| meta-llama-Llama-3.1-8B-Instruct.jinja | Llama 3.x |
+| meta-llama-Llama-3.2-11B-Vision-Instruct.jinja | Llama 3.x |
+| meta-llama-Llama-3.2-1B-Instruct.jinja | Llama 3.x |
+| meta-llama-Llama-3.2-3B-Instruct.jinja | Llama 3.x |
+| meta-llama-Llama-3.3-70B-Instruct.jinja | Llama 3.x |
+| meta-llama-Meta-Llama-3-8B-Instruct.jinja | Generic |
+| meta-llama-Meta-Llama-3.1-8B-Instruct.jinja | Llama 3.x |
+| microsoft-Phi-3-medium-4k-instruct.jinja | Generic |
+| microsoft-Phi-3-mini-4k-instruct.jinja | Generic |
+| microsoft-Phi-3-small-8k-instruct.jinja | Generic |
+| microsoft-Phi-3.5-mini-instruct.jinja | Generic |
+| microsoft-Phi-3.5-vision-instruct.jinja | Generic |
+| microsoft-phi-4.jinja | Generic |
+| migtissera-Tess-3-Mistral-Nemo-12B.jinja | Generic |
+| ministral-Ministral-3b-instruct.jinja | Generic |
+| mistralai-Codestral-22B-v0.1.jinja | Generic |
+| mistralai-Mistral-7B-Instruct-v0.1.jinja | Generic |
+| mistralai-Mistral-7B-Instruct-v0.2.jinja | Generic |
+| mistralai-Mistral-7B-Instruct-v0.3.jinja | Mistral Nemo |
+| mistralai-Mistral-Large-Instruct-2407.jinja | Mistral Nemo |
+| mistralai-Mistral-Large-Instruct-2411.jinja | Generic |
+| mistralai-Mistral-Nemo-Instruct-2407.jinja | Mistral Nemo |
+| mistralai-Mistral-Small-24B-Instruct-2501.jinja | Generic |
+| mistralai-Mixtral-8x7B-Instruct-v0.1.jinja | Generic |
+| mkurman-Qwen2.5-14B-DeepSeek-R1-1M.jinja | Hermes 2 Pro |
+| mlabonne-AlphaMonarch-7B.jinja | Generic |
+| mlx-community-Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32.jinja | Hermes 2 Pro |
+| mlx-community-Qwen2.5-VL-7B-Instruct-8bit.jinja | Hermes 2 Pro |
+| mobiuslabsgmbh-DeepSeek-R1-ReDistill-Qwen-1.5B-v1.1.jinja | DeepSeek R1 (extract reasoning) |
+| netcat420-MFANNv0.20.jinja | Generic |
+| netcat420-MFANNv0.24.jinja | Generic |
+| netease-youdao-Confucius-o1-14B.jinja | Hermes 2 Pro |
+| nvidia-AceMath-7B-RM.jinja | Hermes 2 Pro |
+| nvidia-Eagle2-1B.jinja | Hermes 2 Pro |
+| nvidia-Eagle2-9B.jinja | Hermes 2 Pro |
+| nvidia-Llama-3.1-Nemotron-70B-Instruct-HF.jinja | Llama 3.x |
+| onnx-community-DeepSeek-R1-Distill-Qwen-1.5B-ONNX.jinja | DeepSeek R1 (extract reasoning) |
+| open-thoughts-OpenThinker-7B.jinja | Hermes 2 Pro |
+| openchat-openchat-3.5-0106.jinja | Generic |
+| pankajmathur-orca_mini_v6_8b.jinja | Generic |
+| princeton-nlp-Mistral-7B-Base-SFT-RDPO.jinja | Generic |
+| princeton-nlp-Mistral-7B-Instruct-DPO.jinja | Generic |
+| princeton-nlp-Mistral-7B-Instruct-RDPO.jinja | Generic |
+| prithivMLmods-Bellatrix-Tiny-1.5B-R1.jinja | Hermes 2 Pro |
+| prithivMLmods-Bellatrix-Tiny-1B-R1.jinja | Llama 3.x |
+| prithivMLmods-Bellatrix-Tiny-1B-v3.jinja | Generic |
+| prithivMLmods-Bellatrix-Tiny-3B-R1.jinja | Llama 3.x |
+| prithivMLmods-Blaze-14B-xElite.jinja | Generic |
+| prithivMLmods-Calcium-Opus-14B-Elite2-R1.jinja | Hermes 2 Pro |
+| prithivMLmods-Calme-Ties-78B.jinja | Generic |
+| prithivMLmods-Calme-Ties2-78B.jinja | Generic |
+| prithivMLmods-Calme-Ties3-78B.jinja | Generic |
+| prithivMLmods-ChemQwen2-vL.jinja | Generic |
+| prithivMLmods-GWQ2b.jinja | Generic |
+| prithivMLmods-LatexMind-2B-Codec.jinja | Generic |
+| prithivMLmods-Llama-3.2-6B-AlgoCode.jinja | Llama 3.x |
+| prithivMLmods-Megatron-Opus-14B-Exp.jinja | Hermes 2 Pro |
+| prithivMLmods-Megatron-Opus-14B-Stock.jinja | Hermes 2 Pro |
+| prithivMLmods-Megatron-Opus-7B-Exp.jinja | Hermes 2 Pro |
+| prithivMLmods-Omni-Reasoner-Merged.jinja | Hermes 2 Pro |
+| prithivMLmods-Omni-Reasoner4-Merged.jinja | Hermes 2 Pro |
+| prithivMLmods-Primal-Opus-14B-Optimus-v1.jinja | Hermes 2 Pro |
+| prithivMLmods-QwQ-Math-IO-500M.jinja | Hermes 2 Pro |
+| prithivMLmods-Qwen-7B-Distill-Reasoner.jinja | DeepSeek R1 (extract reasoning) |
+| prithivMLmods-Qwen2.5-1.5B-DeepSeek-R1-Instruct.jinja | Hermes 2 Pro |
+| prithivMLmods-Qwen2.5-14B-DeepSeek-R1-1M.jinja | Hermes 2 Pro |
+| prithivMLmods-Qwen2.5-32B-DeepSeek-R1-Instruct.jinja | Hermes 2 Pro |
+| prithivMLmods-Qwen2.5-7B-DeepSeek-R1-1M.jinja | Hermes 2 Pro |
+| prithivMLmods-Triangulum-v2-10B.jinja | Hermes 2 Pro |
+| qingy2024-Falcon3-2x10B-MoE-Instruct.jinja | Hermes 2 Pro |
+| rubenroy-Zurich-14B-GCv2-5m.jinja | Hermes 2 Pro |
+| rubenroy-Zurich-7B-GCv2-5m.jinja | Hermes 2 Pro |
+| silma-ai-SILMA-Kashif-2B-Instruct-v1.0.jinja | Generic |
+| simplescaling-s1-32B.jinja | Hermes 2 Pro |
+| sometimesanotion-Lamarck-14B-v0.7.jinja | Hermes 2 Pro |
+| sonthenguyen-zephyr-sft-bnb-4bit-DPO-mtbr-180steps.jinja | Generic |
+| sthenno-tempesthenno-icy-0130.jinja | Generic |
+| sumink-qwft.jinja | Hermes 2 Pro |
+| teknium-OpenHermes-2.5-Mistral-7B.jinja | Generic |
+| thirdeyeai-elevate360m.jinja | Generic |
+| tiiuae-Falcon3-10B-Instruct.jinja | Hermes 2 Pro |
+| unsloth-DeepSeek-R1-Distill-Llama-8B-unsloth-bnb-4bit.jinja | DeepSeek R1 (extract reasoning) |
+| unsloth-DeepSeek-R1-Distill-Llama-8B.jinja | DeepSeek R1 (extract reasoning) |
+| unsloth-DeepSeek-R1.jinja | DeepSeek R1 (extract reasoning) |
+| unsloth-Mistral-Small-24B-Instruct-2501-unsloth-bnb-4bit.jinja | Generic |
+| upstage-solar-pro-preview-instruct.jinja | Generic |
+| whyhow-ai-PatientSeek.jinja | Generic |
+| xwen-team-Xwen-72B-Chat.jinja | Hermes 2 Pro |
+| xwen-team-Xwen-7B-Chat.jinja | Hermes 2 Pro |
+
+This table can be generated with:
+
+```bash
+./build/bin/test-chat ../minja/build/tests/*.jinja 2>/dev/null
+```
+
+
+## Usage - requires a tool-aware Jinja template
+
+First, start a server with any model, but make sure it has a tools-enabled template: you can verify this by inspecting the `chat_template` or `chat_template_tool_use` properties in `http://localhost:8080/props`.
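+
+For example, here is a minimal sketch (standard library only) that checks the `/props` endpoint; it assumes the server is already running on the default `localhost:8080`:
+
+```python
+# Sketch: confirm the loaded template is tool-aware before sending tool calls.
+import json
+import urllib.request
+
+with urllib.request.urlopen("http://localhost:8080/props") as resp:
+    props = json.load(resp)
+
+# Either a dedicated tool_use template or a default template that mentions tools works.
+has_tool_template = bool(props.get("chat_template_tool_use")) or "tools" in props.get("chat_template", "")
+print("tool-aware template:", has_tool_template)
+```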
+
+Here are some models known to work (w/ chat template override when needed):
+
+```shell
+# Native support:
+
+llama-server --jinja -fa -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M
+llama-server --jinja -fa -hf bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q6_K_L
+llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M
+llama-server --jinja -fa -hf bartowski/Llama-3.3-70B-Instruct-GGUF:Q4_K_M
+
+# Native support for DeepSeek R1 works best w/ our own template (official template buggy)
+
+llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q6_K_L \
+--chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
+
+llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M \
+--chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
+
+# Native support requires the right template for these GGUFs:
+
+llama-server --jinja -fa -hf bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M \
+--chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-2-Pro-Llama-3-8B tool_use )
+
+llama-server --jinja -fa -hf bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M \
+--chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use )
+
+llama-server --jinja -fa -hf bartowski/firefunction-v2-GGUF -hff firefunction-v2-IQ1_M.gguf \
+--chat-template-file <( python scripts/get_chat_template.py fireworks-ai/llama-3-firefunction-v2 tool_use )
+
+llama-server --jinja -fa -hf bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L \
+--chat-template-file <( python scripts/get_chat_template.py CohereForAI/c4ai-command-r7b-12-2024 tool_use )
+
+# Generic format support
+llama-server --jinja -fa -hf bartowski/phi-4-GGUF:Q4_0
+llama-server --jinja -fa -hf bartowski/gemma-2-2b-it-GGUF:Q8_0
+llama-server --jinja -fa -hf bartowski/c4ai-command-r-v01-GGUF:Q2_K
+```
+
+> [!TIP]
+> If there is no official `tool_use` Jinja template, you may want to set `--chat-template chatml` to use a default that works with many models (YMMV!), or write your own (e.g. we provide a custom [llama-cpp-deepseek-r1.jinja](../models/templates/llama-cpp-deepseek-r1.jinja) for DeepSeek R1 distills)
+
+Test from the CLI with `curl` (or with any library or software that can talk to OpenAI-compatible API backends):
+
+```bash
+curl http://localhost:8080/v1/chat/completions -d '{
+"model": "gpt-3.5-turbo",
+"tools": [
+ {
+ "type":"function",
+ "function":{
+ "name":"python",
+ "description":"Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
+ "parameters":{
+ "type":"object",
+ "properties":{
+ "code":{
+ "type":"string",
+ "description":"The code to run in the ipython interpreter."
+ }
+ },
+ "required":["code"]
+ }
+ }
+ }
+],
+"messages": [
+ {
+ "role": "user",
+ "content": "Print a hello world message with python."
+ }
+]
+}'
+```
+
+Example output:
+
+```json
+{
+"choices": [
+ {
+ "finish_reason": "tool",
+ "index": 0,
+ "message": {
+ "content": null,
+ "tool_calls": [
+ {
+ "name": "python",
+ "arguments": "{\"code\":\" \\nprint(\\\"Hello, World!\\\")\"}"
+ }
+ ],
+ "role": "assistant"
+ }
+ }
+],
+"created": 1727287211,
+"model": "gpt-3.5-turbo",
+"object": "chat.completion",
+"usage": {
+ "completion_tokens": 16,
+ "prompt_tokens": 44,
+ "total_tokens": 60
+},
+"id": "chatcmpl-Htbgh9feMmGM0LEH2hmQvwsCxq3c6Ni8"
+}
+```
+
+
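+The same request can be sent from any OpenAI-compatible client library. Below is a minimal sketch using the official `openai` Python package, assuming the server runs on the default `http://localhost:8080`; the API key is a placeholder, since `llama-server` only enforces one when started with `--api-key`:
+
+```python
+# Sketch: the curl request above, issued via the openai Python client.
+from openai import OpenAI
+
+client = OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")
+
+tools = [{
+    "type": "function",
+    "function": {
+        "name": "python",
+        "description": "Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "code": {"type": "string", "description": "The code to run in the ipython interpreter."}
+            },
+            "required": ["code"],
+        },
+    },
+}]
+
+response = client.chat.completions.create(
+    model="gpt-3.5-turbo",  # model name is not used to select a model on a single-model llama-server
+    messages=[{"role": "user", "content": "Print a hello world message with python."}],
+    tools=tools,
+)
+
+for tool_call in (response.choices[0].message.tool_calls or []):
+    print(tool_call.function.name, tool_call.function.arguments)
+```
+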
diff --git a/examples/llava/README-granitevision.md b/examples/llava/README-granitevision.md
new file mode 100644
index 000000000..f08a21cc1
--- /dev/null
+++ b/examples/llava/README-granitevision.md
@@ -0,0 +1,190 @@
+# Granite Vision
+
+Download the model and point your `GRANITE_MODEL` environment variable to the path.
+
+```bash
+$ git clone https://huggingface.co/ibm-granite/granite-vision-3.2-2b
+$ export GRANITE_MODEL=./granite-vision-3.2-2b
+```
+
+
+### 1. Running llava surgery v2
+First, we need to run the llava surgery script as shown below:
+
+`python llava_surgery_v2.py -C -m $GRANITE_MODEL`
+
+You should see two new files (`llava.clip` and `llava.projector`) written into your model's directory, as shown below.
+
+```bash
+$ ls $GRANITE_MODEL | grep -i llava
+llava.clip
+llava.projector
+```
+
+You should see that the projector and visual encoder have been split out into the llava files. Here is a quick check to make sure they aren't empty:
+```python
+import os
+import torch
+
+MODEL_PATH = os.getenv("GRANITE_MODEL")
+if not MODEL_PATH:
+ raise ValueError("env var GRANITE_MODEL is unset!")
+
+encoder_tensors = torch.load(os.path.join(MODEL_PATH, "llava.clip"))
+projector_tensors = torch.load(os.path.join(MODEL_PATH, "llava.projector"))
+
+assert len(encoder_tensors) > 0
+assert len(projector_tensors) > 0
+```
+
+If you actually inspect the `.keys()` of the loaded tensors, you should see a lot of `vision_model` tensors in the `encoder_tensors`, and 5 tensors (`'multi_modal_projector.linear_1.bias'`, `'multi_modal_projector.linear_1.weight'`, `'multi_modal_projector.linear_2.bias'`, `'multi_modal_projector.linear_2.weight'`, `'image_newline'`) in the multimodal `projector_tensors`.
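+
+For instance, reusing the tensors loaded in the snippet above, a quick optional way to list what was split out is:
+
+```python
+# Optional sanity check: reuses encoder_tensors / projector_tensors from the snippet above.
+vision_keys = [k for k in encoder_tensors.keys() if "vision_model" in k]
+print(f"{len(vision_keys)} vision_model tensors, e.g. {vision_keys[:3]}")
+print("projector tensors:", sorted(projector_tensors.keys()))
+```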
+
+
+### 2. Creating the Visual Component GGUF
+Next, create a new directory to hold the visual components, and copy the llava.clip/projector files, as shown below.
+
+```bash
+$ ENCODER_PATH=$PWD/visual_encoder
+$ mkdir $ENCODER_PATH
+
+$ cp $GRANITE_MODEL/llava.clip $ENCODER_PATH/pytorch_model.bin
+$ cp $GRANITE_MODEL/llava.projector $ENCODER_PATH/
+```
+
+Now, we need to write a config for the visual encoder. In order to convert the model, be sure to use the correct `image_grid_pinpoints`, as these may vary based on the model. You can find the `image_grid_pinpoints` in `$GRANITE_MODEL/config.json`.
+
+```json
+{
+ "_name_or_path": "siglip-model",
+ "architectures": [
+ "SiglipVisionModel"
+ ],
+ "image_grid_pinpoints": [
+ [384,384],
+ [384,768],
+ [384,1152],
+ [384,1536],
+ [384,1920],
+ [384,2304],
+ [384,2688],
+ [384,3072],
+ [384,3456],
+ [384,3840],
+ [768,384],
+ [768,768],
+ [768,1152],
+ [768,1536],
+ [768,1920],
+ [1152,384],
+ [1152,768],
+ [1152,1152],
+ [1536,384],
+ [1536,768],
+ [1920,384],
+ [1920,768],
+ [2304,384],
+ [2688,384],
+ [3072,384],
+ [3456,384],
+ [3840,384]
+ ],
+ "mm_patch_merge_type": "spatial_unpad",
+ "hidden_size": 1152,
+ "image_size": 384,
+ "intermediate_size": 4304,
+ "model_type": "siglip_vision_model",
+ "num_attention_heads": 16,
+ "num_hidden_layers": 27,
+ "patch_size": 14,
+ "layer_norm_eps": 1e-6,
+ "hidden_act": "gelu_pytorch_tanh",
+ "projection_dim": 0,
+ "vision_feature_layer": [-24, -20, -12, -1]
+}
+```
+
+At this point you should have something like this:
+```bash
+$ ls $ENCODER_PATH
+config.json llava.projector pytorch_model.bin
+```
+
+Now convert the components to GGUF. Note that we also override the image mean/std dev to `[0.5, 0.5, 0.5]` since we use the SigLIP visual encoder; in the transformers model, you can find these numbers in the `preprocessor_config.json`.
+```bash
+$ python convert_image_encoder_to_gguf.py \
+ -m $ENCODER_PATH \
+ --llava-projector $ENCODER_PATH/llava.projector \
+ --output-dir $ENCODER_PATH \
+ --clip-model-is-vision \
+ --clip-model-is-siglip \
+ --image-mean 0.5 0.5 0.5 \
+ --image-std 0.5 0.5 0.5
+```
+
+This will create the first GGUF file at `$ENCODER_PATH/mmproj-model-f16.gguf`; we will refer to the absolute path of this file as `$VISUAL_GGUF_PATH`.
+
+
+### 3. Creating the LLM GGUF
+The granite vision model contains a granite LLM as its language model. For now, the easiest way to get the GGUF for the LLM is by loading the composite model in `transformers` and exporting the LLM so that it can be directly converted with the normal conversion path.
+
+First, set `LLM_EXPORT_PATH` to the path where the `transformers` LLM should be exported.
+```bash
+$ export LLM_EXPORT_PATH=$PWD/granite_vision_llm
+```
+
+```python
+import os
+import transformers
+
+MODEL_PATH = os.getenv("GRANITE_MODEL")
+if not MODEL_PATH:
+ raise ValueError("env var GRANITE_MODEL is unset!")
+
+LLM_EXPORT_PATH = os.getenv("LLM_EXPORT_PATH")
+if not LLM_EXPORT_PATH:
+ raise ValueError("env var LLM_EXPORT_PATH is unset!")
+
+tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
+
+# NOTE: granite vision support was added to transformers very recently (4.49);
+# if you get size mismatches, your version is too old.
+# If you are running with an older version, set `ignore_mismatched_sizes=True`
+# as shown below; it won't be loaded correctly, but the LLM part of the model that
+# we are exporting will be loaded correctly.
+model = transformers.AutoModelForImageTextToText.from_pretrained(MODEL_PATH, ignore_mismatched_sizes=True)
+
+tokenizer.save_pretrained(LLM_EXPORT_PATH)
+model.language_model.save_pretrained(LLM_EXPORT_PATH)
+```
+
+Now you can convert the exported LLM to GGUF with the normal converter in the root of the llama.cpp project.
+```bash
+$ LLM_GGUF_PATH=$LLM_EXPORT_PATH/granite_llm.gguf
+...
+$ python convert_hf_to_gguf.py --outfile $LLM_GGUF_PATH $LLM_EXPORT_PATH
+```
+
+
+### 4. Quantization
+If you want to quantize the LLM, you can do so with `llama-quantize` as you would any other LLM. For example:
+```bash
+$ ./build/bin/llama-quantize $LLM_EXPORT_PATH/granite_llm.gguf $LLM_EXPORT_PATH/granite_llm_q4_k_m.gguf Q4_K_M
+$ LLM_GGUF_PATH=$LLM_EXPORT_PATH/granite_llm_q4_k_m.gguf
+```
+
+Note that currently you cannot quantize the visual encoder because granite vision models use SigLIP as the visual encoder, which has tensor dimensions that are not divisible by 32.
+
+
+### 5. Running the Model in llama.cpp
+Build llama.cpp normally; you should have a target binary named `llama-llava-cli`, to which you can pass the two GGUF files built above. As an example, we pass the llama.cpp banner.
+
+```bash
+$ ./build/bin/llama-llava-cli -m $LLM_GGUF_PATH \
+ --mmproj $VISUAL_GGUF_PATH \
+ --image ./media/llama0-banner.png \
+ -c 16384 \
+ -p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n<|user|>\n\\nWhat does the text in this image say?\n<|assistant|>\n" \
+ --temp 0
+```
+
+Sample output: `The text in the image reads "LLAMA C++ Can it run DOOM Llama?"`
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 3ddd4ce5b..ef5de2df1 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -43,6 +43,7 @@
#include