From 56d7a9f81274717fe035db192f315a049261c7fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Tue, 4 Mar 2025 17:19:39 +0100
Subject: [PATCH 01/19] main: allow preloading conversation with -p and add
 -st / --single-turn (#12145)

* Add chat template formatting to -no-cnv
* only enable prompt formatting if explicitly enabled
* add -st / --single-turn
* add --single-turn and -p in conversation mode
* fix -sys + -p
* reword warning
* small readability change and fix (long) outdated example usage
* only activate single turn in conversation mode
---
 common/arg.cpp         |  9 +++++++
 common/common.h        |  2 ++
 examples/main/main.cpp | 55 +++++++++++++++++++++++++++++++-----------
 3 files changed, 52 insertions(+), 14 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 8773caaef..3e549ede0 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -949,6 +949,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(common_arg(
+        {"-st", "--single-turn"},
+        "run conversation for a single turn only, then exit when done\n"
+        "will not be interactive if first turn is predefined with --prompt\n"
+        "(default: false)",
+        [](common_params & params) {
+            params.single_turn = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"-i", "--interactive"},
         string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
diff --git a/common/common.h b/common/common.h
index 615d179de..f4b4a96fb 100644
--- a/common/common.h
+++ b/common/common.h
@@ -328,6 +328,8 @@ struct common_params {
     bool warmup        = true;  // warmup run
     bool check_tensors = false; // validate tensor data
 
+    bool single_turn   = false; // single turn chat conversation
+
     ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
     ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
 
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index acf79a892..4e0c69473 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -45,8 +45,8 @@ static void print_usage(int argc, char ** argv) {
     (void) argc;
 
     LOG("\nexample usage:\n");
-    LOG("\n  text generation:     %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
-    LOG("\n  chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
+    LOG("\n  text generation:     %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128 -no-cnv\n", argv[0]);
+    LOG("\n  chat (conversation): %s -m your_model.gguf -sys \"You are a helpful assistant\"\n", argv[0]);
     LOG("\n");
 }
 
@@ -217,8 +217,8 @@ int main(int argc, char ** argv) {
     // print chat template example in conversation mode
     if (params.conversation_mode) {
        if (params.enable_chat_template) {
-            if (!params.prompt.empty()) {
-                LOG_WRN("*** User-specified prompt in conversation mode will be ignored, did you mean to set --system-prompt (-sys) instead?\n");
+            if (!params.prompt.empty() && params.system_prompt.empty()) {
+                LOG_WRN("*** User-specified prompt will pre-start conversation, did you mean to set --system-prompt (-sys) instead?\n");
             }
 
             LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja).c_str());
@@ -265,7 +265,7 @@ int main(int argc, char ** argv) {
 
     std::vector<llama_token> embd_inp;
 
-    bool waiting_for_first_input = params.conversation_mode && params.enable_chat_template && params.system_prompt.empty();
+    bool waiting_for_first_input = false;
     auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) {
         common_chat_msg new_msg;
         new_msg.role = role;
@@ -276,22 +276,34 @@ int main(int argc, char ** argv) {
         return formatted;
     };
 
+    std::string prompt;
     {
-        std::string prompt;
-
         if (params.conversation_mode && params.enable_chat_template) {
-            // format the system prompt in conversation mode (will use template default if empty)
-            prompt = params.system_prompt;
+            if (!params.system_prompt.empty()) {
+                // format the system prompt (will use template default if empty)
+                chat_add_and_format("system", params.system_prompt);
+            }
 
-            if (!prompt.empty()) {
-                prompt = chat_add_and_format("system", prompt);
+            if (!params.prompt.empty()) {
+                // format and append the user prompt
+                chat_add_and_format("user", params.prompt);
+            } else {
+                waiting_for_first_input = true;
+            }
+
+            if (!params.system_prompt.empty() || !params.prompt.empty()) {
+                common_chat_templates_inputs inputs;
+                inputs.messages = chat_msgs;
+                inputs.add_generation_prompt = !params.prompt.empty();
+
+                prompt = common_chat_templates_apply(chat_templates.get(), inputs).prompt;
             }
         } else {
             // otherwise use the prompt as is
             prompt = params.prompt;
         }
     }
 
-    if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
+    if (params.interactive_first || !prompt.empty() || session_tokens.empty()) {
         LOG_DBG("tokenize the prompt\n");
         embd_inp = common_tokenize(ctx, prompt, true, true);
     } else {
@@ -304,7 +316,7 @@ int main(int argc, char ** argv) {
     }
 
     // Should not run without any tokens
-    if (!params.conversation_mode && embd_inp.empty()) {
+    if (!waiting_for_first_input && embd_inp.empty()) {
         if (add_bos) {
             embd_inp.push_back(llama_vocab_bos(vocab));
             LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
@@ -364,7 +376,12 @@ int main(int argc, char ** argv) {
     }
 
     if (params.conversation_mode) {
-        params.interactive_first = true;
+        if (params.single_turn && !params.prompt.empty()) {
+            params.interactive = false;
+            params.interactive_first = false;
+        } else {
+            params.interactive_first = true;
+        }
     }
 
     // enable interactive mode if interactive start is specified
@@ -808,6 +825,11 @@ int main(int argc, char ** argv) {
             if (params.conversation_mode && !waiting_for_first_input) {
                 const auto id = common_sampler_last(smpl);
                 assistant_ss << common_token_to_piece(ctx, id, false);
+
+                if (!prompt.empty()) {
+                    prompt.clear();
+                    is_interacting = false;
+                }
             }
 
             if ((n_past > 0 || waiting_for_first_input) && is_interacting) {
@@ -905,6 +927,11 @@ int main(int argc, char ** argv) {
                     common_sampler_reset(smpl);
                 }
                 is_interacting = false;
+
+                if (waiting_for_first_input && params.single_turn) {
+                    params.interactive = false;
+                    params.interactive_first = false;
+                }
                 waiting_for_first_input = false;
             }
         }

From 20a9b8f5e1380243ed03aeb50ae1bf94b8d68501 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 4 Mar 2025 18:42:44 +0200
Subject: [PATCH 02/19] readme : fix roadmap link (#12185)

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index f7f1a521e..ffb59a422 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 [![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
 
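In practice, the two features from [PATCH 01/19] combine as follows: with a prompt preloaded via -p in conversation mode, -st/--single-turn answers that one turn and exits instead of dropping into interactive chat. A minimal standalone sketch of the decision logic, distilled from the main.cpp hunks above (the struct is a hypothetical stand-in for the relevant common_params fields, not code from the patch):

    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-in for the common_params fields touched by the patch. */
    struct turn_params {
        bool conversation_mode;
        bool single_turn;        /* -st / --single-turn */
        bool has_prompt;         /* -p "..." was given  */
        bool interactive;
        bool interactive_first;
    };

    /* Mirrors the logic added at @@ -364,7 +376,12 @@: a predefined prompt
     * plus --single-turn disables interactivity entirely, so the program
     * answers the first turn and exits; otherwise conversation mode still
     * starts interactive-first. */
    static void apply_conversation_flags(struct turn_params * p) {
        if (p->conversation_mode) {
            if (p->single_turn && p->has_prompt) {
                p->interactive       = false;
                p->interactive_first = false;
            } else {
                p->interactive_first = true;
            }
        }
    }

    int main(void) {
        struct turn_params p = { true, true, true, true, false };
        apply_conversation_flags(&p);
        printf("interactive=%d interactive_first=%d\n", p.interactive, p.interactive_first);
        return 0;
    }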
-[Roadmap](https://github.com/users/ggml-org/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) +[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++ From 5bbe6a9fe9a8796a9389c85accec89dbc4d91e39 Mon Sep 17 00:00:00 2001 From: mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com> Date: Tue, 4 Mar 2025 17:53:26 +0100 Subject: [PATCH 03/19] ggml : portability fixes for VS 2017 (#12150) * Add include files for std::min/max and std::toupper/tolower * win32: move _USE_MATH_DEFINES before includes to ensure M_PI is defined * Use GGML_RESTRICT instead of "restrict" keyword everywhere, and use "__restrict" in MSVC plain C mode * win32: only use __restrict in MSVC if C11/C17 support is not enabled --------- Co-authored-by: Marcus Groeber --- common/ngram-cache.cpp | 1 + common/sampling.cpp | 1 + common/speculative.cpp | 1 + examples/embedding/embedding.cpp | 1 + examples/lookahead/lookahead.cpp | 1 + examples/parallel/parallel.cpp | 1 + examples/passkey/passkey.cpp | 1 + examples/quantize/quantize.cpp | 1 + examples/tts/tts.cpp | 4 +- ggml/include/ggml.h | 6 +- ggml/src/ggml-backend-reg.cpp | 1 + ggml/src/ggml-backend.cpp | 1 + ggml/src/ggml-cpu/ggml-cpu-quants.c | 792 ++++++++++++++-------------- ggml/src/ggml-cpu/ggml-cpu.c | 26 +- ggml/src/ggml-quants.c | 228 ++++---- ggml/src/ggml.c | 6 +- src/llama-chat.cpp | 1 + src/llama-kv-cache.h | 1 + src/llama-mmap.cpp | 1 + src/llama-vocab.cpp | 1 + 20 files changed, 547 insertions(+), 529 deletions(-) diff --git a/common/ngram-cache.cpp b/common/ngram-cache.cpp index a057ae45f..d1a4d84c4 100644 --- a/common/ngram-cache.cpp +++ b/common/ngram-cache.cpp @@ -7,6 +7,7 @@ #include #include #include +#include void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector & inp, int nnew, bool print_progress) { diff --git a/common/sampling.cpp b/common/sampling.cpp index 37a0d9c85..972fbe9da 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -4,6 +4,7 @@ #include #include +#include // the ring buffer works similarly to std::deque, but with a fixed capacity // TODO: deduplicate with llama-impl.h diff --git a/common/speculative.cpp b/common/speculative.cpp index b1fff27a5..1bac3a1ce 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -5,6 +5,7 @@ #include "sampling.h" #include +#include #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5 diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 38d22c90f..3dd9f2b07 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -4,6 +4,7 @@ #include "llama.h" #include +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index 2f0898e62..b9e8de694 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -7,6 +7,7 @@ #include #include #include +#include struct ngram_data { bool active = false; diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 
7ef43d5e1..be18909ed 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -12,6 +12,7 @@ #include #include #include +#include // trim whitespace from the beginning and end of a string static std::string trim(const std::string & str) { diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 5953928d4..fa8519051 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -7,6 +7,7 @@ #include #include #include +#include static void print_usage(int, char ** argv) { LOG("\nexample usage:\n"); diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 8d47b17b6..a4468b169 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -8,6 +8,7 @@ #include #include #include +#include struct quant_option { std::string name; diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index 8108bac8c..c658f3182 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -1,3 +1,5 @@ +#define _USE_MATH_DEFINES // For M_PI on MSVC + #include "arg.h" #include "common.h" #include "sampling.h" @@ -5,8 +7,6 @@ #include "llama.h" #include "json.hpp" -#define _USE_MATH_DEFINES // For M_PI on MSVC - #include #include #include diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index dd0c6a96e..2e5076d36 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2140,7 +2140,11 @@ extern "C" { # define GGML_RESTRICT # endif #else -# define GGML_RESTRICT restrict +# if defined (_MSC_VER) && (__STDC_VERSION__ < 201112L) +# define GGML_RESTRICT __restrict +# else +# define GGML_RESTRICT restrict +# endif #endif typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 23ddf41d4..d0d68becd 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #ifdef _WIN32 # define WIN32_LEAN_AND_MEAN diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 184f99af5..273075f4e 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #ifdef __APPLE__ #include diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c index 2679b71ff..c30ac0318 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-quants.c +++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c @@ -719,28 +719,28 @@ static inline __m128i packNibbles( __m256i bytes ) { } #endif //__loongarch_asx -void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) { +void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { quantize_row_q4_0_ref(x, y, k); } -void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) { +void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { quantize_row_q4_1_ref(x, y, k); } -void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) { +void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { quantize_row_q5_0_ref(x, y, k); } -void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) { +void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { quantize_row_q5_1_ref(x, y, k); } -void quantize_row_q8_0(const float * restrict x, 
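The GGML_RESTRICT hunk in ggml/include/ggml.h above explains the hundreds of mechanical restrict -> GGML_RESTRICT renames that follow, so a short illustration may help. This sketch only assumes the macro logic shown in that hunk; scale_add is a hypothetical helper, not ggml code. Restrict-qualified pointers promise the compiler that the arrays never alias, which legalizes vectorization, and MSVC in plain C mode predates C11's restrict keyword, hence the __restrict fallback:

    #include <stddef.h>

    /* Same selection as the ggml.h hunk: pre-C11 MSVC only understands the
     * vendor spelling __restrict; everywhere else use standard restrict. */
    #if defined(_MSC_VER) && (__STDC_VERSION__ < 201112L)
    #    define GGML_RESTRICT __restrict
    #else
    #    define GGML_RESTRICT restrict
    #endif

    /* Hypothetical example: since dst and src cannot alias, the compiler may
     * unroll and vectorize this loop without runtime overlap checks. */
    static void scale_add(float * GGML_RESTRICT dst, const float * GGML_RESTRICT src, float s, size_t n) {
        for (size_t i = 0; i < n; ++i) {
            dst[i] += s * src[i];
        }
    }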
void * restrict vy, int64_t k) { +void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(QK8_0 == 32); assert(k % QK8_0 == 0); const int nb = k / QK8_0; - block_q8_0 * restrict y = vy; + block_q8_0 * GGML_RESTRICT y = vy; #if defined(__ARM_NEON) for (int i = 0; i < nb; i++) { @@ -1050,11 +1050,11 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) #endif } -void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) { +void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(k % QK8_1 == 0); const int nb = k / QK8_1; - block_q8_1 * restrict y = vy; + block_q8_1 * GGML_RESTRICT y = vy; #if defined(__ARM_NEON) for (int i = 0; i < nb; i++) { @@ -1428,8 +1428,8 @@ static inline int nearest_int(float fval) { return (i & 0x007fffff) - 0x00400000; } -static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type, - const float * restrict qw) { +static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type, + const float * GGML_RESTRICT qw) { float max = 0; float amax = 0; for (int i = 0; i < n; ++i) { @@ -1497,7 +1497,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * return scale; } -static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) { +static float make_q3_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, bool do_rmse) { float max = 0; float amax = 0; for (int i = 0; i < n; ++i) { @@ -1556,7 +1556,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * return 1/iscale; } -static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, +static float make_qkx1_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, int ntry, float alpha) { float min = x[0]; float max = x[0]; @@ -1599,8 +1599,8 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t return scale; } -static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights, - uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux, +static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights, + uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux, float rmin, float rdelta, int nstep, bool use_mad) { float min = x[0]; float max = x[0]; @@ -1680,7 +1680,7 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f return scale; } -static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) { +static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d, uint8_t * GGML_RESTRICT m) { if (j < 4) { *d = q[j] & 63; *m = q[j + 4] & 63; } else { @@ -1691,51 +1691,51 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * //========================- 2-bit (de)-quantization -void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) { +void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { quantize_row_q2_K_ref(x, vy, k); } //========================= 3-bit (de)-quantization -void 
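For readers following quantize_row_q8_0 above: on scalar builds it defers to the _ref implementation, whose scheme is simple enough to sketch standalone. Assumptions in this sketch: a float scale stands in for ggml's half-precision d, and the struct is illustrative rather than ABI-identical to block_q8_0:

    #include <math.h>
    #include <stdint.h>

    #define QK8_0 32

    /* Illustrative block layout; ggml stores d as ggml_half. */
    typedef struct {
        float  d;          /* per-block scale  */
        int8_t qs[QK8_0];  /* quantized values */
    } block_q8_0_f;

    /* One symmetric scale per 32 values: d = max|x| / 127, q = round(x / d). */
    static void quantize_block_q8_0(const float * x, block_q8_0_f * y) {
        float amax = 0.0f;
        for (int j = 0; j < QK8_0; ++j) {
            const float ax = fabsf(x[j]);
            if (ax > amax) amax = ax;
        }
        const float d  = amax / 127.0f;
        const float id = d != 0.0f ? 1.0f/d : 0.0f;
        y->d = d;
        for (int j = 0; j < QK8_0; ++j) {
            y->qs[j] = (int8_t) roundf(x[j] * id);
        }
    }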
quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) { +void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { quantize_row_q3_K_ref(x, vy, k); } // ====================== 4-bit (de)-quantization -void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) { +void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(k % QK_K == 0); - block_q4_K * restrict y = vy; + block_q4_K * GGML_RESTRICT y = vy; quantize_row_q4_K_ref(x, y, k); } // ====================== 5-bit (de)-quantization -void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) { +void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(k % QK_K == 0); - block_q5_K * restrict y = vy; + block_q5_K * GGML_RESTRICT y = vy; quantize_row_q5_K_ref(x, y, k); } // ====================== 6-bit (de)-quantization -void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) { +void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(k % QK_K == 0); - block_q6_K * restrict y = vy; + block_q6_K * GGML_RESTRICT y = vy; quantize_row_q6_K_ref(x, y, k); } // ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs) -void quantize_row_tq1_0(const float * restrict x, void * restrict vy, int64_t k) { +void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(k % QK_K == 0); - block_tq1_0 * restrict y = vy; + block_tq1_0 * GGML_RESTRICT y = vy; quantize_row_tq1_0_ref(x, y, k); } -void quantize_row_tq2_0(const float * restrict x, void * restrict vy, int64_t k) { +void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(k % QK_K == 0); - block_tq2_0 * restrict y = vy; + block_tq2_0 * GGML_RESTRICT y = vy; quantize_row_tq2_0_ref(x, y, k); } @@ -1743,11 +1743,11 @@ static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -1 //===================================== Q8_K ============================================== -void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) { +void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { #ifdef __wasm_simd128__ assert(k % QK_K == 0); const int64_t nb = k / QK_K; - block_q8_K * restrict yc = y; // Cast to proper type + block_q8_K * GGML_RESTRICT yc = y; // Cast to proper type for (int i = 0; i < nb; i++) { const float * x_block = x + i * QK_K; @@ -1909,7 +1909,7 @@ static inline __m128i get_scale_shuffle(int i) { } #endif -void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; @@ -1924,23 +1924,23 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r UNUSED(by); UNUSED(bs); - const block_q4_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_q4_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; #if defined(__ARM_FEATURE_MATMUL_INT8) if (nrc == 2) { - const block_q4_0 * restrict vx0 = vx; - const block_q4_0 * restrict vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx); - const block_q8_0 * restrict vy0 = vy; - const 
block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by); + const block_q4_0 * GGML_RESTRICT vx0 = vx; + const block_q4_0 * GGML_RESTRICT vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx); + const block_q8_0 * GGML_RESTRICT vy0 = vy; + const block_q8_0 * GGML_RESTRICT vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by); float32x4_t sumv0 = vdupq_n_f32(0.0f); for (int i = 0; i < nb; i++) { - const block_q4_0 * restrict b_x0 = &vx0[i]; - const block_q4_0 * restrict b_x1 = &vx1[i]; - const block_q8_0 * restrict b_y0 = &vy0[i]; - const block_q8_0 * restrict b_y1 = &vy1[i]; + const block_q4_0 * GGML_RESTRICT b_x0 = &vx0[i]; + const block_q4_0 * GGML_RESTRICT b_x1 = &vx1[i]; + const block_q8_0 * GGML_RESTRICT b_y0 = &vy0[i]; + const block_q8_0 * GGML_RESTRICT b_y1 = &vy1[i]; const uint8x16_t m4b = vdupq_n_u8(0x0F); const int8x16_t s8b = vdupq_n_s8(0x8); @@ -2017,10 +2017,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r const svbool_t ph4 = svptrue_pat_b32(SV_VL4); for (; ib + 1 < nb; ib += 2) { - const block_q4_0 * restrict x0 = &x[ib + 0]; - const block_q4_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib + 0]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; // load x const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); @@ -2063,10 +2063,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16); for (; ib + 1 < nb; ib += 2) { - const block_q4_0 * restrict x0 = &x[ib + 0]; - const block_q4_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib + 0]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; // load x const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); @@ -2104,10 +2104,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r const svbool_t pl16 = svnot_b_z(ph32, ph16); for (; ib + 1 < nb; ib += 2) { - const block_q4_0 * restrict x0 = &x[ib + 0]; - const block_q4_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib + 0]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; // load x const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs); @@ -2144,10 +2144,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r float32x4_t sumv1 = vdupq_n_f32(0.0f); for (; ib + 1 < nb; ib += 2) { - const block_q4_0 * restrict x0 = &x[ib + 0]; - const block_q4_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib + 0]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + const block_q4_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); const int8x16_t s8b = vdupq_n_s8(0x8); @@ -2189,10 +2189,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t 
bs, const void * r const v128_t s8b = wasm_i8x16_splat(0x8); for (; ib + 1 < nb; ib += 2) { - const block_q4_0 * restrict x0 = &x[ib]; - const block_q4_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + const block_q4_0 * GGML_RESTRICT x0 = &x[ib]; + const block_q4_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; // Load and process x0 v128_t v0_0 = wasm_v128_load(x0->qs); @@ -2609,7 +2609,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r *s = sumf; } -void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_1; const int nb = n / qk; @@ -2624,24 +2624,24 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r UNUSED(by); UNUSED(bs); - const block_q4_1 * restrict x = vx; - const block_q8_1 * restrict y = vy; + const block_q4_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; #if defined(__ARM_FEATURE_MATMUL_INT8) if (nrc == 2) { - const block_q4_1 * restrict vx0 = vx; - const block_q4_1 * restrict vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx); - const block_q8_1 * restrict vy0 = vy; - const block_q8_1 * restrict vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by); + const block_q4_1 * GGML_RESTRICT vx0 = vx; + const block_q4_1 * GGML_RESTRICT vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx); + const block_q8_1 * GGML_RESTRICT vy0 = vy; + const block_q8_1 * GGML_RESTRICT vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by); float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t summs0 = vdupq_n_f32(0.0f); for (int i = 0; i < nb; i++) { - const block_q4_1 * restrict b_x0 = &vx0[i]; - const block_q4_1 * restrict b_x1 = &vx1[i]; - const block_q8_1 * restrict b_y0 = &vy0[i]; - const block_q8_1 * restrict b_y1 = &vy1[i]; + const block_q4_1 * GGML_RESTRICT b_x0 = &vx0[i]; + const block_q4_1 * GGML_RESTRICT b_x1 = &vx1[i]; + const block_q8_1 * GGML_RESTRICT b_y0 = &vy0[i]; + const block_q8_1 * GGML_RESTRICT b_y1 = &vy1[i]; float32_t summs_t[4] = { GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s), @@ -2715,10 +2715,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r float summs = 0; for (; ib + 1 < nb; ib += 2) { - const block_q4_1 * restrict x0 = &x[ib + 0]; - const block_q4_1 * restrict x1 = &x[ib + 1]; - const block_q8_1 * restrict y0 = &y[ib + 0]; - const block_q8_1 * restrict y1 = &y[ib + 1]; + const block_q4_1 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q4_1 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1]; summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s); @@ -2931,7 +2931,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r *s = sumf; } -void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { 
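Every SIMD branch in these dot kernels computes the same quantity as the scalar fallback, so a scalar sketch of the Q4_0 x Q8_0 block dot product is a useful reference point (float scales and struct layouts illustrative, as before):

    #include <stdint.h>

    #define QK4_0 32

    typedef struct { float d; uint8_t qs[QK4_0/2]; } block_q4_0_f; /* two 4-bit values per byte */
    typedef struct { float d; int8_t  qs[QK4_0];   } block_q8_0_f;

    /* Q4_0 packs element j in the low nibble and element j + 16 in the high
     * nibble, both with a +8 bias; one block pair contributes
     * d_x * d_y * sum_j (q4[j] - 8) * q8[j]. */
    static float vec_dot_q4_0_q8_0_scalar(int nblocks, const block_q4_0_f * x, const block_q8_0_f * y) {
        float sumf = 0.0f;
        for (int ib = 0; ib < nblocks; ++ib) {
            int sumi = 0;
            for (int j = 0; j < QK4_0/2; ++j) {
                const int v0 = (x[ib].qs[j] & 0x0F) - 8; /* low nibble:  element j      */
                const int v1 = (x[ib].qs[j] >>   4) - 8; /* high nibble: element j + 16 */
                sumi += v0 * y[ib].qs[j] + v1 * y[ib].qs[j + QK4_0/2];
            }
            sumf += x[ib].d * y[ib].d * (float) sumi;
        }
        return sumf;
    }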
const int qk = QK8_0; const int nb = n / qk; @@ -2946,8 +2946,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r UNUSED(by); UNUSED(bs); - const block_q5_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_q5_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -2960,10 +2960,10 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r uint64_t tmp1[4]; for (; ib + 1 < nb; ib += 2) { - const block_q5_0 * restrict x0 = &x[ib]; - const block_q5_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + const block_q5_0 * GGML_RESTRICT x0 = &x[ib]; + const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -3024,8 +3024,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r // TODO: check if unrolling this is better for (; ib < nb; ++ib) { - const block_q5_0 * restrict x0 = &x[ib]; - const block_q8_0 * restrict y0 = &y[ib]; + const block_q5_0 * GGML_RESTRICT x0 = &x[ib]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; const v128_t m4b = wasm_i8x16_splat(0x0F); @@ -3286,7 +3286,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r *s = sumf; } -void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_1; const int nb = n / qk; @@ -3301,8 +3301,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r UNUSED(by); UNUSED(bs); - const block_q5_1 * restrict x = vx; - const block_q8_1 * restrict y = vy; + const block_q5_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -3318,10 +3318,10 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r uint64_t tmp1[4]; for (; ib + 1 < nb; ib += 2) { - const block_q5_1 * restrict x0 = &x[ib]; - const block_q5_1 * restrict x1 = &x[ib + 1]; - const block_q8_1 * restrict y0 = &y[ib]; - const block_q8_1 * restrict y1 = &y[ib + 1]; + const block_q5_1 * GGML_RESTRICT x0 = &x[ib]; + const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_1 * GGML_RESTRICT y0 = &y[ib]; + const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -3387,8 +3387,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r // TODO: check if unrolling this is better for (; ib < nb; ++ib) { - const block_q5_1 * restrict x0 = &x[ib]; - const block_q8_1 * restrict y0 = &y[ib]; + const block_q5_1 * GGML_RESTRICT x0 = &x[ib]; + const block_q8_1 * GGML_RESTRICT y0 = &y[ib]; summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s); @@ -3660,7 +3660,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r *s = sumf; } -void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT 
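Q5_0 and Q5_1 extend the nibble packing with a 32-bit qh field carrying each element's fifth bit, which is what the tmp0/tmp1 table lookups above reassemble in vector registers. A scalar sketch of the Q5_0 reconstruction, assuming the usual bit assignment (bit j of qh for element j, bit j + 16 for element j + 16):

    #include <stdint.h>
    #include <string.h>

    #define QK5_0 32

    /* Rebuild 5-bit values: low nibble plus the qh bit as bit 4, biased by -16. */
    static void dequant_block_q5_0(float d, const uint8_t * qs, const uint8_t * qh_bytes, float * y) {
        uint32_t qh;
        memcpy(&qh, qh_bytes, sizeof(qh));
        for (int j = 0; j < QK5_0/2; ++j) {
            const uint8_t xh0 = ((qh >> (j +  0)) << 4) & 0x10; /* bit j      -> bit 4 */
            const uint8_t xh1 = ((qh >> (j + 12))     ) & 0x10; /* bit j + 16 -> bit 4 */
            y[j]           = d * (int8_t)(((qs[j] & 0x0F) | xh0) - 16);
            y[j + QK5_0/2] = d * (int8_t)(((qs[j] >>   4) | xh1) - 16);
        }
    }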
vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_0; const int nb = n / qk; @@ -3675,24 +3675,24 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r UNUSED(by); UNUSED(bs); - const block_q8_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_q8_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; #if defined(__ARM_FEATURE_MATMUL_INT8) if (nrc == 2) { - const block_q8_0 * restrict vx0 = vx; - const block_q8_0 * restrict vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx); - const block_q8_0 * restrict vy0 = vy; - const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by); + const block_q8_0 * GGML_RESTRICT vx0 = vx; + const block_q8_0 * GGML_RESTRICT vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx); + const block_q8_0 * GGML_RESTRICT vy0 = vy; + const block_q8_0 * GGML_RESTRICT vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by); float32x4_t sumv0 = vdupq_n_f32(0.0f); for (int i = 0; i < nb; i++) { - const block_q8_0 * restrict b_x0 = &vx0[i]; - const block_q8_0 * restrict b_y0 = &vy0[i]; + const block_q8_0 * GGML_RESTRICT b_x0 = &vx0[i]; + const block_q8_0 * GGML_RESTRICT b_y0 = &vy0[i]; - const block_q8_0 * restrict b_x1 = &vx1[i]; - const block_q8_0 * restrict b_y1 = &vy1[i]; + const block_q8_0 * GGML_RESTRICT b_x1 = &vx1[i]; + const block_q8_0 * GGML_RESTRICT b_y1 = &vy1[i]; const int8x16_t x0_l = vld1q_s8(b_x0->qs); const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16); @@ -3757,10 +3757,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r const svbool_t pl16 = svptrue_pat_b32(SV_VL4); for (; ib + 1 < nb; ib += 2) { - const block_q8_0 * restrict x0 = &x[ib + 0]; - const block_q8_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib + 0]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; // load x const svint8_t qx0_0 = svld1_s8(ph16, x0->qs); @@ -3788,10 +3788,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r { //printf("sve256"); for (; ib + 1 < nb; ib += 2) { - const block_q8_0 * restrict x0 = &x[ib + 0]; - const block_q8_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib + 0]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; // load x const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs); @@ -3824,10 +3824,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r svfloat32_t sumv00 = svdup_n_f32(0.0f); for (; ib + 1 < nb; ib += 2) { - const block_q8_0 * restrict x0 = &x[ib + 0]; - const block_q8_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib + 0]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; //load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits // and add them to make one 64 element vector @@ -3867,10 +3867,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float 
* restrict s, size_t bs, const void * r float32x4_t sumv1 = vdupq_n_f32(0.0f); for (; ib + 1 < nb; ib += 2) { - const block_q8_0 * restrict x0 = &x[ib + 0]; - const block_q8_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib + 0]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + const block_q8_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q8_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; const int8x16_t x0_0 = vld1q_s8(x0->qs); const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); @@ -3897,8 +3897,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r v128_t sumv = wasm_f32x4_splat(0.0f); for (; ib < nb; ++ib) { - const block_q8_0 * restrict x0 = &x[ib]; - const block_q8_0 * restrict y0 = &y[ib]; + const block_q8_0 * GGML_RESTRICT x0 = &x[ib]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; const v128_t x0_0 = wasm_v128_load(x0->qs); const v128_t x0_1 = wasm_v128_load(x0->qs + 16); @@ -4080,15 +4080,15 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r *s = sumf; } -void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); - const block_tq1_0 * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_tq1_0 * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -4403,15 +4403,15 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * restrict s, size_t bs, const void * #endif } -void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); - const block_tq2_0 * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_tq2_0 * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -4575,15 +4575,15 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * restrict s, size_t bs, const void * #endif } -void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); UNUSED(by); UNUSED(bs); - const block_q2_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q2_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -4603,9 +4603,9 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8_sv = y[i].qs; - const uint8_t * restrict sc = x[i].scales; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8_sv = y[i].qs; + const 
uint8_t * GGML_RESTRICT sc = x[i].scales; svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_b32(), sc); const svint32_t mins_sv_1 = svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), mins_and_scales_sve, 4)); @@ -4748,9 +4748,9 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8_sv = y[i].qs; - const uint8_t * restrict sc = x[i].scales; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8_sv = y[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; const svuint32_t mins_and_scales_sve = svld1ub_u32(svptrue_pat_b32(SV_VL8), sc); sc += 8; const svint32_t scales_sv = svreinterpret_s32_u32(svand_u32_m(svptrue_pat_b32(SV_VL8), mins_and_scales_sve, m4s)); @@ -4847,9 +4847,9 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - const uint8_t * restrict sc = x[i].scales; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; const uint8x16_t mins_and_scales = vld1q_u8(sc); const uint8x16_t scales = vandq_u8(mins_and_scales, m4); @@ -4912,8 +4912,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); const __m128i scales8 = _mm_and_si128(mins_and_scales, m4); @@ -4979,8 +4979,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // load mins and scales from block_q2_K.scales[QK_K/16] const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); @@ -5306,8 +5306,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r vector signed int vsumi6 = v0; vector signed int vsumi7 = v0; - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; for (int j = 0; j < QK_K/128; ++j) { __builtin_prefetch(q2, 0, 1); @@ -5398,8 +5398,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m128i mins_and_scales128 = __lsx_vld((const __m128i*)x[i].scales, 0); const __m128i scales128 = __lsx_vandi_b(mins_and_scales128, 0xf); @@ -5492,7 +5492,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, 
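A reading aid for the q2_K kernels above: each byte of x[i].scales packs a 4-bit scale in the low nibble and a 4-bit minimum in the high nibble, applied per 16-element sub-block, so each weight is w = d*(sc & 0xF)*q - dmin*(sc >> 4) with q in 0..3. The helper below is a hypothetical distillation (2-bit values widened to one per byte for clarity):

    #include <stdint.h>

    /* One q2_K sub-block: 16 two-bit values sharing one scale/min byte. */
    static void dequant_q2_K_subblock(float d, float dmin, uint8_t sc,
                                      const uint8_t * q2, float * w) {
        const float dl = d    * (float)(sc & 0xF); /* effective scale   */
        const float ml = dmin * (float)(sc >> 4);  /* effective minimum */
        for (int j = 0; j < 16; ++j) {
            w[j] = dl * (float)(q2[j] & 3) - ml;
        }
    }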
size_t bs, const void * r #endif } -void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -5503,8 +5503,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint32_t kmask1 = 0x03030303; const uint32_t kmask2 = 0x0f0f0f0f; - const block_q3_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q3_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -5529,9 +5529,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3_sv = x[i].qs; - const uint8_t * restrict qh_sv = x[i].hmask; - const int8_t * restrict q8_sv = y[i].qs; + const uint8_t * GGML_RESTRICT q3_sv = x[i].qs; + const uint8_t * GGML_RESTRICT qh_sv = x[i].hmask; + const int8_t * GGML_RESTRICT q8_sv = y[i].qs; // Set up scales memcpy(aux, x[i].scales, 12); @@ -5705,9 +5705,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict qh = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); @@ -5791,8 +5791,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // Set up scales memcpy(aux, x[i].scales, 12); @@ -5896,8 +5896,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // Set up scales aux = (const uint32_t *)x[i].scales; @@ -6030,9 +6030,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // Process blocks with SIMD int8_t * a = aux8; @@ -6119,9 +6119,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict qh = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(aux, x[i].scales, 12); utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); @@ -6261,8 +6261,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void 
* r vector signed int vsumi6 = v0; vector signed int vsumi7 = v0; - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; for (int j = 0; j < QK_K/128; ++j) { __builtin_prefetch(q3, 0, 1); @@ -6375,8 +6375,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // Set up scales memcpy(aux, x[i].scales, 12); __m128i scales128 = lsx_set_w( @@ -6461,11 +6461,11 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; uint8_t m = 1; for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; @@ -6508,7 +6508,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r } -void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -6516,8 +6516,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r UNUSED(by); UNUSED(bs); - const block_q4_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q4_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -6552,8 +6552,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * scales = (const uint8_t *)utmp; - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const int vector_length = ggml_cpu_get_sve_cnt()*8; const svuint8_t m4b = svdup_n_u8(0xf); @@ -6640,8 +6640,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * scales = (const uint8_t *)utmp; - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; int32_t sumi1 = 0; int32_t sumi2 = 0; @@ -6679,8 +6679,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Corrected sign - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // Process scales and mins memcpy(utmp, x[i].scales, 12); @@ -6692,7 +6692,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r // Sum mins * q8sums int32_t sumi = 0; - const int16_t * 
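The q4_K paths here fold the per-sub-block minimums into a single pass over y[i].bsums before the main loop; the underlying per-weight arithmetic is w = d*sc*q - dmin*m, where the 6-bit (sc, m) pairs come from get_scale_min_k4 earlier in this file. A simplified, hypothetical sketch (float super-scales; 4-bit values widened to one per byte):

    #include <stdint.h>

    /* One q4_K sub-block: 32 four-bit values sharing a 6-bit scale sc and a
     * 6-bit minimum m, under per-superblock scales d and dmin. */
    static void dequant_q4_K_subblock(float d, float dmin, uint8_t sc, uint8_t m,
                                      const uint8_t * q4, float * w) {
        for (int j = 0; j < 32; ++j) {
            /* The dot kernels compute the "- dmin * m" part once per
             * sub-block from the precomputed q8 sums (y[i].bsums). */
            w[j] = d * (float) sc * (float)(q4[j] & 0xF) - dmin * (float) m;
        }
    }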
restrict q8sums = y[i].bsums; + const int16_t * GGML_RESTRICT q8sums = y[i].bsums; const uint8_t * m = (const uint8_t *)&utmp[2]; for (int j = 0; j < 16; j += 2) { sumi += (q8sums[j] + q8sums[j+1]) * m[j/2]; @@ -6791,8 +6791,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r utmp[2] = uaux; utmp[0] &= kmask1; - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0])); @@ -6850,8 +6850,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -6951,8 +6951,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl); sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; vl = 32; @@ -7053,8 +7053,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r vector signed int vsumi2 = v0; vector signed int vsumi3 = v0; - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; for (int j = 0; j < QK_K/64; j+=2) { __builtin_prefetch(q4, 0, 1); @@ -7145,8 +7145,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r utmp[2] = uaux; utmp[0] &= kmask1; - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m128i mins_and_scales128 = lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]); const __m128i mins128 = __lsx_vexth_h_b(mins_and_scales128); @@ -7228,8 +7228,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r sumf -= dmin * (v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]); const uint8_t * scales = (const uint8_t *)utmp; - const uint8_t * restrict x0 = x[i].qs; - const int8_t * restrict y0 = y[i].qs; + const uint8_t * GGML_RESTRICT x0 = x[i].qs; + const int8_t * GGML_RESTRICT y0 = y[i].qs; int32_t sumi1 = 0; int32_t sumi2 = 0; @@ -7277,10 +7277,10 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; for (int j = 0; j < QK_K/64; ++j) { for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); a += 32; @@ -7323,7 +7323,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r #endif } -void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, 
size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -7331,8 +7331,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r UNUSED(by); UNUSED(bs); - const block_q5_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q5_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -7374,9 +7374,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const uint8_t * scales = (const uint8_t *)utmp; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); @@ -7421,8 +7421,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r float summs = 0.f; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); @@ -7505,8 +7505,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -7597,9 +7597,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Fixed sign - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // Process scales and mins memcpy(utmp, x[i].scales, 12); @@ -7611,7 +7611,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r // Sum mins * q8sums int32_t sumi_mins = 0; - const int16_t * restrict q8sums = y[i].bsums; + const int16_t * GGML_RESTRICT q8sums = y[i].bsums; const uint8_t * m = (const uint8_t *)&utmp[2]; for (int j = 0; j < 16; j += 2) { sumi_mins += (q8sums[j] + q8sums[j+1]) * m[j/2]; @@ -7715,9 +7715,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r vl = 8; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; @@ -7856,8 +7856,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r vector signed int vsumi2 = v0; vector signed int vsumi3 = v0; - const 
uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; for (int j = 0; j < QK_K/64; ++j) { __builtin_prefetch(q5, 0, 1); @@ -7929,8 +7929,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); @@ -8039,9 +8039,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]; const uint8_t * scales = (const uint8_t *)utmp; - const uint8_t * restrict x0l = x[i].qs; - const uint8_t * restrict x0h = x[i].qh; - const int8_t * restrict y0 = y[i].qs; + const uint8_t * GGML_RESTRICT x0l = x[i].qs; + const uint8_t * GGML_RESTRICT x0h = x[i].qh; + const int8_t * GGML_RESTRICT y0 = y[i].qs; v_xh[0] = vec_xl(0 , x0h); v_xh[1] = vec_xl(16, x0h); @@ -8094,11 +8094,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; uint8_t m = 1; for (int j = 0; j < QK_K/64; ++j) { for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); @@ -8145,7 +8145,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r #endif } -void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -8153,8 +8153,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r UNUSED(by); UNUSED(bs); - const block_q6_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q6_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -8174,11 +8174,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d_all = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * GGML_RESTRICT scale = x[i].scales; const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums); const int8x16_t scales = vld1q_s8(scale); @@ -8265,9 +8265,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const 
uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); @@ -8343,9 +8343,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // handle the q6_k -32 offset separately using bsums const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums); @@ -8444,8 +8444,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { // Unpack 6-bit quantized data into aux8 (unchanged) - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; int8_t * a = aux8; for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { @@ -8459,8 +8459,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r qh += 32; } - const int8_t * restrict a_ptr = aux8; - const int8_t * restrict q8 = y[i].qs; + const int8_t * GGML_RESTRICT a_ptr = aux8; + const int8_t * GGML_RESTRICT q8 = y[i].qs; v128_t acc0 = wasm_i32x4_splat(0); v128_t acc1 = wasm_i32x4_splat(0); @@ -8523,11 +8523,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * GGML_RESTRICT scale = x[i].scales; size_t vl; @@ -8629,10 +8629,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r vector signed int vsumi6 = v0; vector signed int vsumi7 = v0; - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict qs = x[i].scales; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT qs = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; for (int j = 0; j < QK_K/128; ++j) { __builtin_prefetch(q6, 0, 0); @@ -8748,9 +8748,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m128i scales128 = __lsx_vld((const __m128i*)x[i].scales, 0); const v16i8 shuffle_mask = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; @@ -8816,11 +8816,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; ++i) { const float d_all = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict x0l = x[i].ql; - const uint8_t * restrict x0h = x[i].qh; - const int8_t * restrict y0 = y[i].qs; + const uint8_t * GGML_RESTRICT x0l = x[i].ql; + const uint8_t * 
GGML_RESTRICT x0h = x[i].qh; + const int8_t * GGML_RESTRICT y0 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * GGML_RESTRICT scale = x[i].scales; const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums); const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums); @@ -8931,11 +8931,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; @@ -9003,7 +9003,7 @@ static const int8_t keven_signs_q2xs[1024] = { }; #endif -void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -9011,8 +9011,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void UNUSED(by); UNUSED(bs); - const block_iq2_xxs * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_iq2_xxs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -9030,8 +9030,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void float sumf = 0; for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; float sumf1 = 0, sumf2 = 0; for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { q8b = ggml_vld1q_s8_x4(q8); q8 += 64; @@ -9067,8 +9067,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; __m256i sumi1 = _mm256_setzero_si256(); __m256i sumi2 = _mm256_setzero_si256(); for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { @@ -9108,8 +9108,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; __m128i sumi1_0 = _mm_setzero_si128(); __m128i sumi1_1 = _mm_setzero_si128(); __m128i sumi2_0 = _mm_setzero_si128(); @@ -9173,8 +9173,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void vector signed int vsumi2 = v0; vector signed int vsumi3 = v0; - const uint16_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * 
GGML_RESTRICT q8 = y[i].qs; for (int j = 0; j < QK_K/32; j += 2) { __builtin_prefetch(q2, 0, 1); @@ -9250,8 +9250,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void __m256 accumf = (__m256)__lasx_xvldi(0); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; __m256i sumi1 = __lasx_xvldi(0); __m256i sumi2 = __lasx_xvldi(0); for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { @@ -9291,8 +9291,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void // // for (int i = 0; i < nb; ++i) { // const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; -// const uint16_t * restrict q2 = x[i].qs; -// const int8_t * restrict q8 = y[i].qs; +// const uint16_t * GGML_RESTRICT q2 = x[i].qs; +// const int8_t * GGML_RESTRICT q8 = y[i].qs; // // float sumf1 = 0, sumf2 = 0; // @@ -9340,8 +9340,8 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void float sumf = 0.f; for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; int32_t bsum = 0; for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { memcpy(aux32, q2, 2*sizeof(uint32_t)); @@ -9364,7 +9364,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void #endif } -void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -9372,8 +9372,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * UNUSED(by); UNUSED(bs); - const block_iq2_xs * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_iq2_xs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -9390,8 +9390,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * float sumf = 0; for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const uint8x8_t scales8 = vld1_u8(x[i].scales); const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf)); const uint8x8_t scales_h = vshr_n_u8(scales8, 4); @@ -9468,8 +9468,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(&aux64, x[i].scales, 8); __m128i stmp = _mm_set1_epi64x(aux64); @@ -9589,8 +9589,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * 
restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(&aux64, x[i].scales, 8); __m128i stmp = _mm_set1_epi64x(aux64); @@ -9744,8 +9744,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * __m256 accumf = (__m256)__lasx_xvldi(0); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(&aux64, x[i].scales, 8); __m128i stmp = __lsx_vreplgr2vr_d(aux64); @@ -9842,9 +9842,9 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * vector signed int vsumi2 = v0; vector signed int vsumi3 = v0; - const uint16_t * restrict q2 = x[i].qs; - const uint8_t * restrict sc = x[i].scales; - const int8_t * restrict q8 = y[i].qs; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; for (int j = 0; j < QK_K/64; ++j) { __builtin_prefetch(q2, 0, 1); @@ -9914,9 +9914,9 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * float sumf = 0.f; for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint16_t * restrict q2 = x[i].qs; - const uint8_t * restrict sc = x[i].scales; - const int8_t * restrict q8 = y[i].qs; + const uint16_t * GGML_RESTRICT q2 = x[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; int32_t bsum = 0; for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1; @@ -9949,7 +9949,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * #endif } -void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -9957,8 +9957,8 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * UNUSED(by); UNUSED(bs); - const block_iq2_s * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_iq2_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -9984,10 +9984,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict qs = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8); - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); + const int8_t * GGML_RESTRICT q8 = y[i].qs; int sumi1 = 0, sumi2 = 0; for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { @@ -10058,10 +10058,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict qs = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const uint16_t * 
restrict signs = (const uint16_t *)(x[i].qs + QK_K/8); - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(&aux64, x[i].scales, 8); const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1); @@ -10131,10 +10131,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict qs = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8); - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(&aux64, x[i].scales, 8); const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1); @@ -10229,11 +10229,11 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * vector signed int vsumi2 = v0; vector signed int vsumi3 = v0; - const uint8_t * restrict q2 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8); - const uint8_t * restrict sc = x[i].scales; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); + const uint8_t * GGML_RESTRICT sc = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; for (int j = 0; j < QK_K/32; j += 2) { __builtin_prefetch(q2, 0, 1); @@ -10330,10 +10330,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * __m256 accumf = (__m256)__lasx_xvldi(0); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict qs = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8); - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); + const int8_t * GGML_RESTRICT q8 = y[i].qs; __m128i tmp1; memcpy(&aux64, x[i].scales, 8); @@ -10427,7 +10427,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * } -void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -10435,8 +10435,8 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void UNUSED(by); UNUSED(bs); - const block_iq3_xxs * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_iq3_xxs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -10452,9 +10452,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * 
restrict s, size_t bs, const void float sumf = 0; for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict gas = x[i].qs + QK_K/4; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; float sumf1 = 0, sumf2 = 0; for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { q8b = ggml_vld1q_s8_x4(q8); q8 += 64; @@ -10490,9 +10490,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict gas = x[i].qs + QK_K/4; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; __m256i sumi1 = _mm256_setzero_si256(); __m256i sumi2 = _mm256_setzero_si256(); for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { @@ -10535,9 +10535,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict gas = x[i].qs + QK_K/4; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; __m128i sumi1_0 = _mm_setzero_si128(); __m128i sumi1_1 = _mm_setzero_si128(); __m128i sumi2_0 = _mm_setzero_si128(); @@ -10604,9 +10604,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void vector signed int vsumi2 = v0; vector signed int vsumi3 = v0; - const uint8_t * restrict q3 = x[i].qs; - const uint32_t * restrict signs = (const uint32_t *)(x[i].qs + QK_K/4); - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint32_t * GGML_RESTRICT signs = (const uint32_t *)(x[i].qs + QK_K/4); + const int8_t * GGML_RESTRICT q8 = y[i].qs; #pragma GCC unroll 1 for (int j = 0; j < QK_K/32; j += 2) { @@ -10678,9 +10678,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void __m256 accumf = (__m256)__lasx_xvldi(0); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict gas = x[i].qs + QK_K/4; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; __m256i sumi1 = __lasx_xvldi(0); __m256i sumi2 = __lasx_xvldi(0); for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { @@ -10723,9 +10723,9 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void float sumf = 0.f; for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict gas = x[i].qs + QK_K/4; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; + const int8_t * GGML_RESTRICT q8 = y[i].qs; int32_t bsum = 0; for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { memcpy(&aux32, gas, sizeof(uint32_t)); gas += 
sizeof(uint32_t); @@ -10750,7 +10750,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void #endif } -void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -10758,8 +10758,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * UNUSED(by); UNUSED(bs); - const block_iq3_s * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_iq3_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -10796,10 +10796,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * float sumf = 0; for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict qs = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const uint16_t * restrict signs = (const uint16_t *)x[i].signs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(scales32, x[i].scales, 4); scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101; @@ -10878,10 +10878,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict qs = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const uint16_t * restrict signs = (const uint16_t *)x[i].signs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; __m256i sumi1 = _mm256_setzero_si256(); __m256i sumi2 = _mm256_setzero_si256(); for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { @@ -10963,10 +10963,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict qs = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const uint16_t * restrict signs = (const uint16_t *)x[i].signs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; __m128i sumi1_0 = _mm_setzero_si128(); __m128i sumi1_1 = _mm_setzero_si128(); __m128i sumi2_0 = _mm_setzero_si128(); @@ -11064,11 +11064,11 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const uint16_t * restrict signs = (const uint16_t *)(x[i].signs); - const uint8_t * restrict sc = x[i].scales; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * 
GGML_RESTRICT signs = (const uint16_t *)(x[i].signs); + const uint8_t * GGML_RESTRICT sc = x[i].scales; + const int8_t * GGML_RESTRICT q8 = y[i].qs; vector signed int vsumi0 = v0; vector signed int vsumi1 = v0; @@ -11175,10 +11175,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * __m256 accumf = (__m256)__lasx_xvldi(0); for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict qs = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const uint16_t * restrict signs = (const uint16_t *)x[i].signs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; __m256i sumi1 = __lasx_xvldi(0); __m256i sumi2 = __lasx_xvldi(0); for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { @@ -11236,10 +11236,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * float sumf = 0.f; for (int i = 0; i < nb; ++i) { const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict qs = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const uint8_t * restrict signs = x[i].signs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT qs = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const uint8_t * GGML_RESTRICT signs = x[i].signs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; int32_t bsum = 0; for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1; @@ -11291,7 +11291,7 @@ static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { } #endif -void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -11299,8 +11299,8 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void UNUSED(by); UNUSED(bs); - const block_iq1_s * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_iq1_s * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -11458,10 +11458,10 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void vector signed int vsumi3 = vec_splats((int32_t)0); vector signed int vsumi8 = vec_splats((int32_t)0); - const uint8_t * restrict q1 = x[i].qs; - const uint16_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - const int16_t * restrict qs = y[i].bsums; + const uint8_t * GGML_RESTRICT q1 = x[i].qs; + const uint16_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + const int16_t * GGML_RESTRICT qs = y[i].bsums; for (int j = 0; j < QK_K/32; j += 2) { __builtin_prefetch(q1, 0, 1); @@ -11622,7 +11622,7 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void #endif } -void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); 
UNUSED(nrc); @@ -11630,8 +11630,8 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void UNUSED(by); UNUSED(bs); - const block_iq1_m * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_iq1_m * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -11912,7 +11912,7 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void #endif } -void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -11921,8 +11921,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * assert(n % QK4_NL == 0); static_assert(QK4_NL == QK8_0, "QK4_NL and QK8_0 must be the same"); - const block_iq4_nl * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_iq4_nl * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; const int nb = n / QK4_NL; @@ -12097,8 +12097,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * const uint8x16_t v_m = vec_splat_u8(0x0F); for (; ib < nb; ++ib) { - const block_iq4_nl * restrict x0 = &x[ib]; - const block_q8_0 * restrict y0 = &y[ib]; + const block_iq4_nl * GGML_RESTRICT x0 = &x[ib]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; const uint8x16_t v_x = vec_xl(0, x0->qs); int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m); @@ -12126,7 +12126,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * *s = sumf; } -void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { +void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -12134,8 +12134,8 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * UNUSED(bs); assert(n % QK_K == 0); - const block_iq4_xs * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_iq4_xs * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -12292,9 +12292,9 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * uint16_t h = x[ibl].scales_h; - const uint8_t * restrict q4 = x[ibl].qs; - const uint8_t * restrict sc = x[ibl].scales_l; - const int8_t * restrict q8 = y[ibl].qs; + const uint8_t * GGML_RESTRICT q4 = x[ibl].qs; + const uint8_t * GGML_RESTRICT sc = x[ibl].scales_l; + const int8_t * GGML_RESTRICT q8 = y[ibl].qs; for (int ib = 0; ib < QK_K/64; ib ++ ) { __builtin_prefetch(q4, 0, 1); @@ -12398,8 +12398,8 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * float sumf = 0; for (int ibl = 0; ibl < nb; ++ibl) { - const uint8_t * restrict q4 = x[ibl].qs; - const int8_t * restrict q8 = y[ibl].qs; + const uint8_t * GGML_RESTRICT q4 = x[ibl].qs; + const int8_t * GGML_RESTRICT q8 = y[ibl].qs; uint16_t h = x[ibl].scales_h; @@ -12479,12 +12479,12 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * // ============================ 4-bit non-linear quants -void quantize_row_iq4_nl(const float * restrict x, void * restrict y, 
int64_t k) { +void quantize_row_iq4_nl(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { assert(k % QK4_NL == 0); quantize_row_iq4_nl_ref(x, y, k); } -void quantize_row_iq4_xs(const float * restrict x, void * restrict y, int64_t k) { +void quantize_row_iq4_xs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); quantize_iq4_xs(x, y, 1, k, NULL); } diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 9ab24522c..2a5463fdf 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -247,9 +247,9 @@ typedef pthread_t ggml_thread_t; static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); -static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); -static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc); -static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc); +static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc); +static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc); +static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc); static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { [GGML_TYPE_F32] = { @@ -1451,7 +1451,7 @@ inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp } } -static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) { +static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -1494,7 +1494,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * *s = sumf; } -static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc) { +static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -1562,7 +1562,7 @@ static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t *s = sumf; } -static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) { +static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); UNUSED(bx); @@ -1606,10 +1606,10 @@ static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * // compute GGML_VEC_DOT_UNROLL dot products at once // xs - x row stride in bytes -inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) { +inline static void 
ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) { ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 }; - ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL]; + ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL]; for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { x[i] = (ggml_fp16_t *) ((char *) xv + i*xs); @@ -1659,7 +1659,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * re } } -inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) { +inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) { #if defined(GGML_SIMD) const int np = (n & ~(GGML_F32_STEP - 1)); @@ -1690,7 +1690,7 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float #endif } -inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const ggml_fp16_t * restrict x, const float v) { +inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) { #if defined(GGML_SIMD) const int np = (n & ~(GGML_F16_STEP - 1)); @@ -1722,10 +1722,10 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const } // xs and vs are byte strides of x and v -inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) { +inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) { - const float * restrict x[GGML_VEC_MAD_UNROLL]; - const float * restrict v[GGML_VEC_MAD_UNROLL]; + const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL]; + const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL]; for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) { x[i] = (const float *) ((const char *) xv + i*xs); diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 7918388ae..ac918a60d 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -28,7 +28,7 @@ #define UNUSED GGML_UNUSED // reference implementation for deterministic creation of model files -void quantize_row_q4_0_ref(const float * restrict x, block_q4_0 * restrict y, int64_t k) { +void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k) { static const int qk = QK4_0; assert(k % qk == 0); @@ -65,7 +65,7 @@ void quantize_row_q4_0_ref(const float * restrict x, block_q4_0 * restrict y, in } } -void quantize_row_q4_1_ref(const float * restrict x, block_q4_1 * restrict y, int64_t k) { +void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k) { const int qk = QK4_1; assert(k % qk == 0); @@ -102,7 +102,7 @@ void quantize_row_q4_1_ref(const float * restrict x, block_q4_1 * restrict y, in } } -void quantize_row_q5_0_ref(const float * restrict x, block_q5_0 * restrict y, int64_t k) { +void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k) { static const int qk = QK5_0; assert(k % qk == 0); @@ -146,7 +146,7 @@ void quantize_row_q5_0_ref(const float * restrict x, block_q5_0 * restrict y, in } } -void quantize_row_q5_1_ref(const float * restrict x, block_q5_1 * restrict y, int64_t k) { +void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k) { const int qk = QK5_1; assert(k % qk == 
0); @@ -191,7 +191,7 @@ void quantize_row_q5_1_ref(const float * restrict x, block_q5_1 * restrict y, in } // reference implementation for deterministic creation of model files -void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, int64_t k) { +void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k) { assert(k % QK8_0 == 0); const int nb = k / QK8_0; @@ -217,7 +217,7 @@ void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, in } // reference implementation for deterministic creation of model files -void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, int64_t k) { +void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k) { assert(QK8_1 == 32); assert(k % QK8_1 == 0); const int nb = k / QK8_1; @@ -252,7 +252,7 @@ void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, in } } -void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int64_t k) { +void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { static const int qk = QK4_0; assert(k % qk == 0); @@ -272,7 +272,7 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int6 } } -void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int64_t k) { +void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { static const int qk = QK4_1; assert(k % qk == 0); @@ -293,7 +293,7 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int6 } } -void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int64_t k) { +void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { static const int qk = QK5_0; assert(k % qk == 0); @@ -319,7 +319,7 @@ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int6 } } -void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int64_t k) { +void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { static const int qk = QK5_1; assert(k % qk == 0); @@ -346,7 +346,7 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int6 } } -void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int64_t k) { +void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { static const int qk = QK8_0; assert(k % qk == 0); @@ -376,8 +376,8 @@ static inline int nearest_int(float fval) { return (i & 0x007fffff) - 0x00400000; } -static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type, - const float * restrict qw) { +static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type, + const float * GGML_RESTRICT qw) { float max = 0; float amax = 0; for (int i = 0; i < n; ++i) { @@ -445,7 +445,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * return scale; } -static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) { +static float make_q3_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, bool do_rmse) { float max = 0; float amax = 0; for (int i = 0; i < n; ++i) { @@ -504,7 +504,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * return 
1/iscale; } -static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, +static float make_qkx1_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, int ntry, float alpha) { float min = x[0]; float max = x[0]; @@ -547,8 +547,8 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t return scale; } -static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights, - uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux, +static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights, + uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux, float rmin, float rdelta, int nstep, bool use_mad) { float min = x[0]; float max = x[0]; @@ -628,7 +628,7 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f return scale; } -static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) { +static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d, uint8_t * GGML_RESTRICT m) { if (j < 4) { *d = q[j] & 63; *m = q[j + 4] & 63; } else { @@ -639,7 +639,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * //========================- 2-bit (de)-quantization -void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, int64_t k) { +void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -709,7 +709,7 @@ void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, in } } -void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int64_t k) { +void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -741,8 +741,8 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int6 } } -static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights, - uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux, +static float make_qkx3_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights, + uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux, float rmin, float rdelta, int nstep, bool use_mad) { float min = x[0]; float max = x[0]; @@ -824,7 +824,7 @@ static float make_qkx3_quants(int n, int nmax, const float * restrict x, const f return scale; } -static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, const float * quant_weights) { +static float make_qp_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, const float * quant_weights) { float max = 0; for (int i = 0; i < n; ++i) { max = MAX(max, x[i]); @@ -897,7 +897,7 @@ static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t * return sumlx/suml2; } -static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) { +static void quantize_row_q2_K_impl(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k, const float * GGML_RESTRICT quant_weights) { GGML_ASSERT(quant_weights); 
assert(k % QK_K == 0); const int nb = k / QK_K; @@ -917,7 +917,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j]; float sigma2 = sumx2/QK_K; for (int j = 0; j < QK_K/16; ++j) { - const float * restrict qw = quant_weights + QK_K * i + 16*j; + const float * GGML_RESTRICT qw = quant_weights + QK_K * i + 16*j; for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]); for (int l = 0; l < QK_K/16; ++l) sw[j] += weight[l]; scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false); @@ -959,7 +959,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri } } -size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row); if (!quant_weights) { quantize_row_q2_K_ref(src, dst, (int64_t)nrow*n_per_row); @@ -977,7 +977,7 @@ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nr //========================= 3-bit (de)-quantization -void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, int64_t k) { +void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1053,7 +1053,7 @@ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, in } } -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) { +void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1067,8 +1067,8 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int6 const float d_all = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; + const uint8_t * GGML_RESTRICT q = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; uint8_t m = 1; memcpy(aux, x[i].scales, 12); @@ -1103,7 +1103,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int6 } } -static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) { +static void quantize_row_q3_K_impl(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t n_per_row, const float * GGML_RESTRICT quant_weights) { assert(n_per_row % QK_K == 0); const int nb = n_per_row / QK_K; @@ -1187,7 +1187,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri } } -size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row); if (!quant_weights) { quantize_row_q3_K_ref(src, dst, (int64_t)nrow*n_per_row); @@ -1205,7 +1205,7 @@ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nr // ====================== 4-bit (de)-quantization -void quantize_row_q4_K_ref(const float * restrict x, block_q4_K * restrict y, int64_t k) { +void 
quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1277,7 +1277,7 @@ void quantize_row_q4_K_ref(const float * restrict x, block_q4_K * restrict y, in } } -void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int64_t k) { +void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1301,7 +1301,7 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int6 } } -static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) { +static void quantize_row_q4_K_impl(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) { assert(n_per_row % QK_K == 0); const int64_t nb = n_per_row / QK_K; @@ -1374,7 +1374,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri } } -size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row); if (!quant_weights) { quantize_row_q4_K_ref(src, dst, (int64_t)nrow*n_per_row); @@ -1392,7 +1392,7 @@ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nr // ====================== 5-bit (de)-quantization -void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, int64_t k) { +void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -1454,8 +1454,8 @@ void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, in } } - uint8_t * restrict qh = y[i].qh; - uint8_t * restrict ql = y[i].qs; + uint8_t * GGML_RESTRICT qh = y[i].qh; + uint8_t * GGML_RESTRICT ql = y[i].qs; memset(qh, 0, QK_K/8); uint8_t m1 = 1, m2 = 2; @@ -1479,7 +1479,7 @@ void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, in } } -void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int64_t k) { +void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -1506,7 +1506,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int6 } } -static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) { +static void quantize_row_q5_K_impl(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) { assert(n_per_row % QK_K == 0); const int64_t nb = n_per_row / QK_K; @@ -1573,8 +1573,8 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri } } - uint8_t * restrict qh = y[i].qh; - uint8_t * restrict ql = y[i].qs; + uint8_t * GGML_RESTRICT qh = y[i].qh; + uint8_t * GGML_RESTRICT ql = y[i].qs; memset(qh, 0, QK_K/8); uint8_t m1 = 1, m2 = 2; @@ -1599,7 +1599,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri } } -size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t 
quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row); if (!quant_weights) { quantize_row_q5_K_ref(src, dst, (int64_t)nrow*n_per_row); @@ -1617,7 +1617,7 @@ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nr // ====================== 6-bit (de)-quantization -void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, int64_t k) { +void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -1667,8 +1667,8 @@ void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, in } } - uint8_t * restrict ql = y[i].ql; - uint8_t * restrict qh = y[i].qh; + uint8_t * GGML_RESTRICT ql = y[i].ql; + uint8_t * GGML_RESTRICT qh = y[i].qh; for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { const uint8_t q1 = L[j + l + 0] & 0xF; @@ -1687,16 +1687,16 @@ void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, in } } -void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int64_t k) { +void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; for (int i = 0; i < nb; i++) { const float d = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict ql = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict sc = x[i].scales; + const uint8_t * GGML_RESTRICT ql = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT sc = x[i].scales; for (int n = 0; n < QK_K; n += 128) { for (int l = 0; l < 32; ++l) { @@ -1718,7 +1718,7 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int6 } } -static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) { +static void quantize_row_q6_K_impl(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) { assert(n_per_row % QK_K == 0); const int64_t nb = n_per_row / QK_K; @@ -1781,8 +1781,8 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri } } - uint8_t * restrict ql = y[i].ql; - uint8_t * restrict qh = y[i].qh; + uint8_t * GGML_RESTRICT ql = y[i].ql; + uint8_t * GGML_RESTRICT qh = y[i].qh; for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { const uint8_t q1 = L[j + l + 0] & 0xF; @@ -1802,7 +1802,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri } } -size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row); if (!quant_weights) { quantize_row_q6_K_ref(src, dst, (int64_t)nrow*n_per_row); @@ -1818,7 +1818,7 @@ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nr return nrow * row_size; } -static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int64_t n_per_row, const float * quant_weights) { +static void quantize_row_q4_0_impl(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t n_per_row, const float * 
quant_weights) { static_assert(QK4_0 == 32, "QK4_0 must be 32"); if (!quant_weights) { @@ -1846,7 +1846,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri } } -size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { if (!quant_weights) { quantize_row_q4_0_ref(src, dst, (int64_t)nrow*n_per_row); return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row); @@ -1861,7 +1861,7 @@ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nr return nrow * row_size; } -static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int64_t n_per_row, const float * quant_weights) { +static void quantize_row_q4_1_impl(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) { static_assert(QK4_1 == 32, "QK4_1 must be 32"); if (!quant_weights) { @@ -1891,7 +1891,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri } } -size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { if (!quant_weights) { quantize_row_q4_1_ref(src, dst, (int64_t)nrow*n_per_row); return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row); @@ -1906,7 +1906,7 @@ size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nr return nrow * row_size; } -static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int64_t n_per_row, const float * quant_weights) { +static void quantize_row_q5_0_impl(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) { static_assert(QK5_0 == 32, "QK5_0 must be 32"); if (!quant_weights) { @@ -1945,7 +1945,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri } } -size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { if (!quant_weights) { quantize_row_q5_0_ref(src, dst, (int64_t)nrow*n_per_row); return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row); @@ -1960,7 +1960,7 @@ size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nr return nrow * row_size; } -static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int64_t n_per_row, const float * quant_weights) { +static void quantize_row_q5_1_impl(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) { static_assert(QK5_1 == 32, "QK5_1 must be 32"); if (!quant_weights) { @@ -1998,7 +1998,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri } } -size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { if (!quant_weights) { quantize_row_q5_1_ref(src, dst, 
(int64_t)nrow*n_per_row); return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row); @@ -2013,7 +2013,7 @@ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nr return nrow * row_size; } -size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { (void)quant_weights; // not used const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row); quantize_row_q8_0_ref(src, dst, (int64_t)nrow*n_per_row); @@ -2022,7 +2022,7 @@ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nr // ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs) -void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y, int64_t k) { +void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2088,7 +2088,7 @@ void quantize_row_tq1_0_ref(const float * restrict x, block_tq1_0 * restrict y, } } -void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y, int64_t k) { +void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2120,21 +2120,21 @@ void quantize_row_tq2_0_ref(const float * restrict x, block_tq2_0 * restrict y, } } -size_t quantize_tq1_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { (void)quant_weights; // not used const size_t row_size = ggml_row_size(GGML_TYPE_TQ1_0, n_per_row); quantize_row_tq1_0_ref(src, dst, (int64_t)nrow*n_per_row); return nrow * row_size; } -size_t quantize_tq2_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { (void)quant_weights; // not used const size_t row_size = ggml_row_size(GGML_TYPE_TQ2_0, n_per_row); quantize_row_tq2_0_ref(src, dst, (int64_t)nrow*n_per_row); return nrow * row_size; } -void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, int64_t k) { +void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2173,7 +2173,7 @@ void dequantize_row_tq1_0(const block_tq1_0 * restrict x, float * restrict y, in } } -void dequantize_row_tq2_0(const block_tq2_0 * restrict x, float * restrict y, int64_t k) { +void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2194,7 +2194,7 @@ void dequantize_row_tq2_0(const block_tq2_0 * restrict x, float * restrict y, in // ====================== "True" 2-bit (de)-quantization -void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) { +void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2222,7 +2222,7 @@ void dequantize_row_iq2_xxs(const 
block_iq2_xxs * restrict x, float * restrict y // ====================== 2.3125 bpw (de)-quantization -void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int64_t k) { +void dequantize_row_iq2_xs(const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2249,7 +2249,7 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, // ====================== 2.5625 bpw (de)-quantization -void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int64_t k) { +void dequantize_row_iq2_s(const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2281,7 +2281,7 @@ void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, in // ====================== 3.0625 bpw (de)-quantization -void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int64_t k) { +void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2313,7 +2313,7 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y // ====================== 3.3125 bpw (de)-quantization -void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int64_t k) { +void dequantize_row_iq3_s(const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2356,7 +2356,7 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in // ====================== 1.5625 bpw (de)-quantization -void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int64_t k) { +void dequantize_row_iq1_s(const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2381,7 +2381,7 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in } } -void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int64_t k) { +void dequantize_row_iq1_m(const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2433,7 +2433,7 @@ void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, in static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; -void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int64_t k) { +void dequantize_row_iq4_nl(const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK4_NL == 0); const int64_t nb = k / QK4_NL; @@ -2451,7 +2451,7 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, } } -void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int64_t k) { +void dequantize_row_iq4_xs(const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2476,7 +2476,7 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, //===================================== Q8_K ============================================== -void quantize_row_q8_K_ref(const float * restrict x, block_q8_K * restrict y, int64_t k) { +void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k) { assert(k 
% QK_K == 0); const int64_t nb = k / QK_K; @@ -2515,7 +2515,7 @@ void quantize_row_q8_K_ref(const float * restrict x, block_q8_K * restrict y, in } } -void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int64_t k) { +void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; @@ -2927,8 +2927,8 @@ void iq2xs_free_impl(enum ggml_type type) { } } -static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid, - const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) { +static int iq2_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid, + const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, int8_t * GGML_RESTRICT L) { int num_neighbors = neighbours[0]; GGML_ASSERT(num_neighbors > 0); float best_d2 = FLT_MAX; @@ -2951,7 +2951,7 @@ static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const u return grid_index; } -static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) { +static void quantize_row_iq2_xxs_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) { const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS); @@ -3124,7 +3124,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict } } -static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) { +static void quantize_row_iq2_xs_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) { const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS); @@ -3304,7 +3304,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v } } -size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); int64_t nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -3316,7 +3316,7 @@ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t return nrow * nblock * sizeof(block_iq2_xxs); } -size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_iq2_xs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); int64_t nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -3521,8 +3521,8 @@ void iq3xs_free_impl(int grid_size) { } } -static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const uint32_t * restrict grid, - const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) { +static int iq3_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint32_t * GGML_RESTRICT grid, + const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, int8_t * GGML_RESTRICT L) { int num_neighbors = neighbours[0]; GGML_ASSERT(num_neighbors > 0); float best_d2 = FLT_MAX; @@ -3545,8 +3545,8 @@ static int 
iq3_find_best_neighbour(const uint16_t * restrict neighbours, const u return grid_index; } -static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int64_t n, - const float * restrict quant_weights) { +static void quantize_row_iq3_xxs_impl(int grid_size, const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, + const float * GGML_RESTRICT quant_weights) { const int gindex = iq3_data_index(grid_size); @@ -3758,7 +3758,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v } } -size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); int64_t nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -3770,13 +3770,13 @@ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t return nrow * nblock * sizeof(block_iq3_xxs); } -void quantize_row_iq3_xxs_ref(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) { +void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); quantize_row_iq3_xxs_impl(256, x, y, k, NULL); } -static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, void * restrict vy, int n, - const float * restrict quant_weights, +static void quantize_row_iq3_s_impl(int block_size, const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int n, + const float * GGML_RESTRICT quant_weights, float * scales, float * weight, float * xval, @@ -3958,7 +3958,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo } #define IQ3S_BLOCK_SIZE 32 -size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_iq3_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); int64_t nblock = n_per_row/QK_K; float scales[QK_K/IQ3S_BLOCK_SIZE]; @@ -3980,7 +3980,7 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t n return nrow * nblock * sizeof(block_iq3_s); } -void quantize_row_iq3_s_ref(const float * restrict x, block_iq3_s * restrict y, int64_t k) { +void quantize_row_iq3_s_ref(const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); quantize_iq3_s(x, y, 1, k, NULL); } @@ -3988,8 +3988,8 @@ void quantize_row_iq3_s_ref(const float * restrict x, block_iq3_s * restrict y, // =================================== 1.5 bpw =================================================== -static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid, - const float * restrict xval, const float * restrict weight, float * scale, int8_t * restrict L, int ngrid) { +static int iq1_find_best_neighbour(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid, + const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float * scale, int8_t * GGML_RESTRICT L, int ngrid) { int num_neighbors = neighbours[0]; GGML_ASSERT(num_neighbors > 0); float best_score = -FLT_MAX; @@ -4048,8 +4048,8 @@ static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const u return grid_index; } -static int 
iq1_find_best_neighbour2(const uint16_t * restrict neighbours, const uint64_t * restrict grid, - const float * restrict xval, const float * restrict weight, float scale, const float * restrict xg, int8_t * restrict L, int ngrid) { +static int iq1_find_best_neighbour2(const uint16_t * GGML_RESTRICT neighbours, const uint64_t * GGML_RESTRICT grid, + const float * GGML_RESTRICT xval, const float * GGML_RESTRICT weight, float scale, const float * GGML_RESTRICT xg, int8_t * GGML_RESTRICT L, int ngrid) { int num_neighbors = neighbours[0]; GGML_ASSERT(num_neighbors > 0); float best_score = FLT_MAX; @@ -4113,7 +4113,7 @@ static int iq1_sort_helper(const void * left, const void * right) { #define IQ1S_BLOCK_SIZE 32 #define IQ1M_BLOCK_SIZE 16 -static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights, +static void quantize_row_iq1_s_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights, float * scales, float * weight, float * sumx, @@ -4271,7 +4271,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy } } -size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_iq1_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); float scales[QK_K/IQ1S_BLOCK_SIZE]; float weight[IQ1S_BLOCK_SIZE]; @@ -4291,7 +4291,7 @@ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t n return nrow * nblock * sizeof(block_iq1_s); } -static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights, +static void quantize_row_iq1_m_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights, float * scales, float * weight, float * pairs, @@ -4539,7 +4539,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy } } -size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_iq1_m(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); float scales[QK_K/IQ1M_BLOCK_SIZE]; float weight[IQ1M_BLOCK_SIZE]; @@ -4570,7 +4570,7 @@ static inline int best_index_int8(int n, const int8_t * val, float x) { return x - val[mu-1] < val[mu] - x ? 
mu-1 : mu; } -static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * restrict x, +static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * GGML_RESTRICT x, ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l, float * scales, float * weight, uint8_t * L, const int8_t * values, @@ -4681,7 +4681,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block } } -size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_iq4_nl(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK4_NL == 0); int64_t nblock = n_per_row/QK4_NL; char * qrow = (char *)dst; @@ -4703,8 +4703,8 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t return nrow * nblock * sizeof(block_iq4_nl); } -//void quantize_row_iq4_nl_ref(const float * restrict x, void * restrict vy, int64_t k) { -void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y, int64_t k) { +//void quantize_row_iq4_nl_ref(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { +void quantize_row_iq4_nl_ref(const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k) { GGML_ASSERT(k%QK4_NL == 0); int64_t nblock = k/QK4_NL; uint8_t L[QK4_NL]; @@ -4719,7 +4719,7 @@ void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y } } -size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_iq4_xs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); int64_t nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -4739,14 +4739,14 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t return nrow * nblock * sizeof(block_iq4_xs); } -void quantize_row_iq4_xs_ref(const float * restrict x, block_iq4_xs * restrict y, int64_t k) { +void quantize_row_iq4_xs_ref(const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); quantize_iq4_xs(x, y, 1, k, NULL); } // =============================== 2.5625 bpw -static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) { +static void quantize_row_iq2_s_impl(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t n, const float * GGML_RESTRICT quant_weights) { const int gindex = iq2_data_index(GGML_TYPE_IQ2_S); @@ -4914,7 +4914,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy } } -size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { +size_t quantize_iq2_s(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); int64_t nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -4926,7 +4926,7 @@ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t n return nrow * nblock * sizeof(block_iq2_s); } -void quantize_row_iq2_s_ref(const float * restrict x, block_iq2_s * restrict y, int64_t k) { +void quantize_row_iq2_s_ref(const float * GGML_RESTRICT 
x, block_iq2_s * GGML_RESTRICT y, int64_t k) { assert(k % QK_K == 0); quantize_iq2_s(x, y, 1, k, NULL); } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 7fc06724e..084240331 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -565,9 +565,9 @@ FILE * ggml_fopen(const char * fname, const char * mode) { #endif } -static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); -static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc); -static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc); +static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc); +static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc); +static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc); static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { [GGML_TYPE_I8] = { diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp index 028a64794..de4458696 100644 --- a/src/llama-chat.cpp +++ b/src/llama-chat.cpp @@ -4,6 +4,7 @@ #include #include +#include #if __cplusplus >= 202000L #define LU8(x) (const char*)(u8##x) diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 1ed688e3b..1ce0850ec 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -6,6 +6,7 @@ #include #include +#include struct llama_kv_cell { llama_pos pos = -1; diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index b716630a8..c13b3755a 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #ifdef __has_include #if __has_include() diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 163ff64f7..a708d8b88 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -16,6 +16,7 @@ #include #include #include +#include // // helpers From a057897ad4e48e3df0354256e0cc80dadcb57595 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Wed, 5 Mar 2025 06:30:31 +0100 Subject: [PATCH 04/19] llama : add xcframework build script (#11996) * llama : add xcframework build script This commit adds a script to build an XCFramework for Apple ios, macos, visionos, and tvos platforms. The generated XCFramework can then be added to a project and used in the same way as a regular framework. The llama.swiftui example project has been updated to use the XCFramework and can be started using the following command: ```console $ open examples/llama.swiftui/llama.swiftui.xcodeproj/ ``` Refs: https://github.com/ggml-org/llama.cpp/issues/10747 * examples : remove llama.cpp (source dir ref) from project.pbxproj This commit removes the reference to llama.cpp from the project.pbxproj file since Package.swift has been removed. * ci : updated build.yml to use build-xcframework.sh * ci : add xcframework build to github releases This commit adds the ability to create a GitHub release with the xcframework build artifact. * scripts : add apple app validation scripts This commit adds scripts that can validate the iOS, macOS, tvOS, and VisionOS applications. 
The scripts create a simple test app project, copy the llama.xcframework to the test project, build and archive the app, create an IPA from the archive, and validate the IPA using altool. The motivation for this is to provide some basic validation and hopefully avoid having to manually validate apps in Xcode. * llama : remove Package.swift This commit removes the Package.swift file, as we are now building an XCFramework for the project. * llama : remove Sources and spm-headers directories * llama : use TargetConditionals.h for visionOS/tvOS --- .github/workflows/build.yml | 33 +- Package.swift | 19 - Sources/llama/llama.h | 4 - Sources/llama/module.modulemap | 5 - build-xcframework.sh | 519 +++++++++++ examples/llama.swiftui/README.md | 15 + .../llama.swiftui.xcodeproj/project.pbxproj | 30 +- scripts/apple/validate-apps.sh | 5 + scripts/apple/validate-ios.sh | 820 ++++++++++++++++++ scripts/apple/validate-macos.sh | 781 +++++++++++++++++ scripts/apple/validate-tvos.sh | 813 +++++++++++++++++ scripts/apple/validate-visionos.sh | 811 +++++++++++++++++ spm-headers/ggml-alloc.h | 1 - spm-headers/ggml-backend.h | 1 - spm-headers/ggml-cpp.h | 1 - spm-headers/ggml-cpu.h | 1 - spm-headers/ggml-metal.h | 1 - spm-headers/ggml.h | 1 - spm-headers/llama.h | 1 - src/llama-mmap.cpp | 11 +- 20 files changed, 3822 insertions(+), 51 deletions(-) delete mode 100644 Package.swift delete mode 100644 Sources/llama/llama.h delete mode 100644 Sources/llama/module.modulemap create mode 100755 build-xcframework.sh create mode 100755 scripts/apple/validate-apps.sh create mode 100755 scripts/apple/validate-ios.sh create mode 100755 scripts/apple/validate-macos.sh create mode 100755 scripts/apple/validate-tvos.sh create mode 100755 scripts/apple/validate-visionos.sh delete mode 120000 spm-headers/ggml-alloc.h delete mode 120000 spm-headers/ggml-backend.h delete mode 120000 spm-headers/ggml-cpp.h delete mode 120000 spm-headers/ggml-cpu.h delete mode 120000 spm-headers/ggml-metal.h delete mode 120000 spm-headers/ggml.h delete mode 120000 spm-headers/llama.h diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b96e1f50a..b0c180f26 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -710,12 +710,36 @@ jobs: -DLLAMA_BUILD_SERVER=OFF \ -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - sudo cmake --install build --config Release - name: xcodebuild for swift package id: xcodebuild run: | - xcodebuild -scheme llama-Package -destination "${{ matrix.destination }}" + ./build-xcframework.sh + + - name: Determine tag name + id: tag + shell: bash + run: | + BUILD_NUMBER="$(git rev-list --count HEAD)" + SHORT_HASH="$(git rev-parse --short=7 HEAD)" + if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then + echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT + else + SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') + echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT + fi + + - name: Pack artifacts + id: pack_artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + run: | + zip -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework + + - name: Upload artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-xcframework.zip 
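For reference, the "Determine tag name" step above derives the artifact name from the commit count and the short hash of HEAD; the console sketch below simply mirrors that logic locally (the branch name and resulting values are illustrative, not taken from the repository):

```console
# mirror of the tag-name logic in the workflow step above (example values only)
$ BUILD_NUMBER="$(git rev-list --count HEAD)"
$ SHORT_HASH="$(git rev-parse --short=7 HEAD)"
$ echo "b${BUILD_NUMBER}"                              # master builds, e.g. b4837
$ SAFE_NAME=$(echo "feature/my-branch" | tr '/' '-')
$ echo "${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}"   # branch builds, e.g. feature-my-branch-b4837-abc1234
```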
windows-msys2: runs-on: windows-latest @@ -1336,15 +1360,14 @@ jobs: -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - sudo cmake --install build --config Release - name: xcodebuild for swift package id: xcodebuild run: | - xcodebuild -scheme llama-Package -destination 'generic/platform=iOS' + ./build-xcframework.sh - name: Build Xcode project - run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build + run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build android-build: runs-on: ubuntu-latest diff --git a/Package.swift b/Package.swift deleted file mode 100644 index 01c996d24..000000000 --- a/Package.swift +++ /dev/null @@ -1,19 +0,0 @@ -// swift-tools-version:5.5 - -import PackageDescription - -let package = Package( - name: "llama", - platforms: [ - .macOS(.v12), - .iOS(.v14), - .watchOS(.v4), - .tvOS(.v14) - ], - products: [ - .library(name: "llama", targets: ["llama"]), - ], - targets: [ - .systemLibrary(name: "llama", pkgConfig: "llama"), - ] -) diff --git a/Sources/llama/llama.h b/Sources/llama/llama.h deleted file mode 100644 index 41725880e..000000000 --- a/Sources/llama/llama.h +++ /dev/null @@ -1,4 +0,0 @@ -#pragma once - -#include - diff --git a/Sources/llama/module.modulemap b/Sources/llama/module.modulemap deleted file mode 100644 index d010555b1..000000000 --- a/Sources/llama/module.modulemap +++ /dev/null @@ -1,5 +0,0 @@ -module llama [system] { - header "llama.h" - link "llama" - export * -} diff --git a/build-xcframework.sh b/build-xcframework.sh new file mode 100755 index 000000000..37833dc4e --- /dev/null +++ b/build-xcframework.sh @@ -0,0 +1,519 @@ +#!/bin/bash +# +# Options +IOS_MIN_OS_VERSION=16.4 +MACOS_MIN_OS_VERSION=13.3 +VISIONOS_MIN_OS_VERSION=1.0 +TVOS_MIN_OS_VERSION=16.4 + +BUILD_SHARED_LIBS=OFF +LLAMA_BUILD_EXAMPLES=OFF +LLAMA_BUILD_TESTS=OFF +LLAMA_BUILD_SERVER=OFF +GGML_METAL=ON +GGML_METAL_EMBED_LIBRARY=ON +GGML_BLAS_DEFAULT=ON +GGML_METAL_USE_BF16=ON +GGML_OPENMP=OFF + +COMMON_C_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g" +COMMON_CXX_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g" + +# Common options for all builds +COMMON_CMAKE_ARGS=( + -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_REQUIRED=NO + -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY="" + -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_ALLOWED=NO + -DCMAKE_XCODE_ATTRIBUTE_DEBUG_INFORMATION_FORMAT="dwarf-with-dsym" + -DCMAKE_XCODE_ATTRIBUTE_GCC_GENERATE_DEBUGGING_SYMBOLS=YES + -DCMAKE_XCODE_ATTRIBUTE_COPY_PHASE_STRIP=NO + -DCMAKE_XCODE_ATTRIBUTE_STRIP_INSTALLED_PRODUCT=NO + -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml + -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} + -DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES} + -DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS} + -DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER} + -DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY} + -DGGML_BLAS_DEFAULT=${GGML_BLAS_DEFAULT} + -DGGML_METAL=${GGML_METAL} + -DGGML_METAL_USE_BF16=${GGML_METAL_USE_BF16} + -DGGML_NATIVE=OFF + -DGGML_OPENMP=${GGML_OPENMP} +) + +check_required_tool() { + local tool=$1 + local install_message=$2 + + if ! 
command -v $tool &> /dev/null; then + echo "Error: $tool is required but not found." + echo "$install_message" + exit 1 + fi +} +echo "Checking for required tools..." +check_required_tool "cmake" "Please install CMake 3.28.0 or later (brew install cmake)" +check_required_tool "xcodebuild" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)" +check_required_tool "libtool" "Please install libtool which should be available with Xcode Command Line Tools (CLT). Make sure Xcode CLT is installed (xcode-select --install)" +check_required_tool "dsymutil" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)" + +set -e + +## Clean up previous builds +rm -rf build-apple +rm -rf build-ios-sim +rm -rf build-ios-device +rm -rf build-macos +rm -rf build-visionos +rm -rf build-visionos-sim +rm -rf build-tvos-sim +rm -rf build-tvos-device + +# Setup the xcframework build directory structure +setup_framework_structure() { + local build_dir=$1 + local min_os_version=$2 + local platform=$3 # "ios", "macos", "visionos", or "tvos" + local framework_name="llama" + + echo "Creating ${platform}-style framework structure for ${build_dir}" + + if [[ "$platform" == "macos" ]]; then + # macOS versioned structure uses versioned directories + mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Headers + mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Modules + mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Resources + + # Create symbolic links + ln -sf A ${build_dir}/framework/${framework_name}.framework/Versions/Current + ln -sf Versions/Current/Headers ${build_dir}/framework/${framework_name}.framework/Headers + ln -sf Versions/Current/Modules ${build_dir}/framework/${framework_name}.framework/Modules + ln -sf Versions/Current/Resources ${build_dir}/framework/${framework_name}.framework/Resources + ln -sf Versions/Current/${framework_name} ${build_dir}/framework/${framework_name}.framework/${framework_name} + + # Set header and module paths + local header_path=${build_dir}/framework/${framework_name}.framework/Versions/A/Headers/ + local module_path=${build_dir}/framework/${framework_name}.framework/Versions/A/Modules/ + else + # iOS/VisionOS/tvOS use a flat structure + mkdir -p ${build_dir}/framework/${framework_name}.framework/Headers + mkdir -p ${build_dir}/framework/${framework_name}.framework/Modules + + # Remove any existing structure to ensure clean build + rm -rf ${build_dir}/framework/${framework_name}.framework/Versions + + # Set header and module paths + local header_path=${build_dir}/framework/${framework_name}.framework/Headers/ + local module_path=${build_dir}/framework/${framework_name}.framework/Modules/ + fi + + # Copy all required headers (common for all platforms) + cp include/llama.h ${header_path} + cp ggml/include/ggml.h ${header_path} + cp ggml/include/ggml-alloc.h ${header_path} + cp ggml/include/ggml-backend.h ${header_path} + cp ggml/include/ggml-metal.h ${header_path} + cp ggml/include/ggml-cpu.h ${header_path} + cp ggml/include/ggml-blas.h ${header_path} + cp ggml/include/gguf.h ${header_path} + + # Create module map (common for all platforms) + cat > ${module_path}module.modulemap << EOF +framework module llama { + header "llama.h" + header "ggml.h" + header "ggml-alloc.h" + header "ggml-backend.h" + header "ggml-metal.h" + header "ggml-cpu.h" + header "ggml-blas.h" + header "gguf.h" + + link "c++" + link framework "Accelerate" + link framework "Metal" + link framework 
"Foundation" + + export * +} +EOF + + # Platform-specific settings for Info.plist + local platform_name="" + local sdk_name="" + local supported_platform="" + + case "$platform" in + "ios") + platform_name="iphoneos" + sdk_name="iphoneos${min_os_version}" + supported_platform="iPhoneOS" + local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist" + local device_family=' UIDeviceFamily + + 1 + 2 + ' + ;; + "macos") + platform_name="macosx" + sdk_name="macosx${min_os_version}" + supported_platform="MacOSX" + local plist_path="${build_dir}/framework/${framework_name}.framework/Versions/A/Resources/Info.plist" + local device_family="" + ;; + "visionos") + platform_name="xros" + sdk_name="xros${min_os_version}" + supported_platform="XRPlatform" + local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist" + local device_family="" + ;; + "tvos") + platform_name="appletvos" + sdk_name="appletvos${min_os_version}" + supported_platform="AppleTVOS" + local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist" + local device_family=' UIDeviceFamily + + 3 + ' + ;; + esac + + # Create Info.plist + cat > ${plist_path} << EOF + + + + + CFBundleDevelopmentRegion + en + CFBundleExecutable + llama + CFBundleIdentifier + org.ggml.llama + CFBundleInfoDictionaryVersion + 6.0 + CFBundleName + llama + CFBundlePackageType + FMWK + CFBundleShortVersionString + 1.0 + CFBundleVersion + 1 + MinimumOSVersion + ${min_os_version} + CFBundleSupportedPlatforms + + ${supported_platform} + ${device_family} + DTPlatformName + ${platform_name} + DTSDKName + ${sdk_name} + + +EOF +} + +# Create dynamic libraries from static libraries. +combine_static_libraries() { + local build_dir="$1" + local release_dir="$2" + local platform="$3" # "ios", "macos", "visionos", or "tvos" + local is_simulator="$4" + local base_dir="$(pwd)" + local framework_name="llama" + + # Determine output path based on platform + local output_lib="" + if [[ "$platform" == "macos" ]]; then + # macOS uses versioned structure + output_lib="${build_dir}/framework/${framework_name}.framework/Versions/A/${framework_name}" + else + # iOS, visionOS, and tvOS use a directory flat structure + output_lib="${build_dir}/framework/${framework_name}.framework/${framework_name}" + fi + + local libs=( + "${base_dir}/${build_dir}/src/${release_dir}/libllama.a" + "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml.a" + "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-base.a" + "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a" + "${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a" + "${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a" + ) + + # Create temporary directory for processing + local temp_dir="${base_dir}/${build_dir}/temp" + mkdir -p "${temp_dir}" + + # Since we have multiple architectures libtool will find object files that do not + # match the target architecture. We suppress these warnings. + libtool -static -o "${temp_dir}/combined.a" "${libs[@]}" 2> /dev/null + + # Determine SDK, architectures, and install_name based on platform and simulator flag. 
+ local sdk="" + local archs="" + local min_version_flag="" + local install_name="" + + case "$platform" in + "ios") + if [[ "$is_simulator" == "true" ]]; then + sdk="iphonesimulator" + archs="arm64 x86_64" + min_version_flag="-mios-simulator-version-min=${IOS_MIN_OS_VERSION}" + else + sdk="iphoneos" + archs="arm64" + min_version_flag="-mios-version-min=${IOS_MIN_OS_VERSION}" + fi + install_name="@rpath/llama.framework/llama" + ;; + "macos") + sdk="macosx" + archs="arm64 x86_64" + min_version_flag="-mmacosx-version-min=${MACOS_MIN_OS_VERSION}" + install_name="@rpath/llama.framework/Versions/Current/llama" + ;; + "visionos") + if [[ "$is_simulator" == "true" ]]; then + sdk="xrsimulator" + archs="arm64 x86_64" + min_version_flag="-mtargetos=xros${VISIONOS_MIN_OS_VERSION}-simulator" + else + sdk="xros" + archs="arm64" + min_version_flag="-mtargetos=xros${VISIONOS_MIN_OS_VERSION}" + fi + # Use flat structure for visionOS, same as iOS + install_name="@rpath/llama.framework/llama" + ;; + "tvos") + if [[ "$is_simulator" == "true" ]]; then + sdk="appletvsimulator" + archs="arm64 x86_64" + min_version_flag="-mtvos-simulator-version-min=${TVOS_MIN_OS_VERSION}" + else + sdk="appletvos" + archs="arm64" + min_version_flag="-mtvos-version-min=${TVOS_MIN_OS_VERSION}" + fi + install_name="@rpath/llama.framework/llama" + ;; + esac + + # Build architecture flags + local arch_flags="" + for arch in $archs; do + arch_flags+=" -arch $arch" + done + + # Create dynamic library + echo "Creating dynamic library for ${platform}." + xcrun -sdk $sdk clang++ -dynamiclib \ + -isysroot $(xcrun --sdk $sdk --show-sdk-path) \ + $arch_flags \ + $min_version_flag \ + -Wl,-force_load,"${temp_dir}/combined.a" \ + -framework Foundation -framework Metal -framework Accelerate \ + -install_name "$install_name" \ + -o "${base_dir}/${output_lib}" + + # Platform-specific post-processing for device builds + if [[ "$is_simulator" == "false" ]]; then + if command -v vtool &>/dev/null; then + case "$platform" in + "ios") + echo "Marking binary as a framework binary for iOS..." + vtool -set-build-version ios ${IOS_MIN_OS_VERSION} ${IOS_MIN_OS_VERSION} -replace \ + -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}" + ;; + "visionos") + echo "Marking binary as a framework binary for visionOS..." + vtool -set-build-version xros ${VISIONOS_MIN_OS_VERSION} ${VISIONOS_MIN_OS_VERSION} -replace \ + -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}" + ;; + "tvos") + echo "Marking binary as a framework binary for tvOS..." + vtool -set-build-version tvos ${TVOS_MIN_OS_VERSION} ${TVOS_MIN_OS_VERSION} -replace \ + -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}" + ;; + esac + else + echo "Warning: vtool not found. Binary may not pass App Store validation." + fi + fi + + echo "Creating properly formatted dSYM..." 
+ # Create a separate directory for dSYMs for all platforms + mkdir -p "${base_dir}/${build_dir}/dSYMs" + + # iOS and visionOS style dSYM (flat structure) + if [[ "$platform" == "ios" || "$platform" == "visionos" || "$platform" == "tvos" ]]; then + # Generate dSYM in the dSYMs directory + xcrun dsymutil "${base_dir}/${output_lib}" -o "${base_dir}/${build_dir}/dSYMs/llama.dSYM" + + # Create a copy of the binary that will be stripped + cp "${base_dir}/${output_lib}" "${temp_dir}/binary_to_strip" + + # Strip debug symbols from the copy + xcrun strip -S "${temp_dir}/binary_to_strip" -o "${temp_dir}/stripped_lib" + + # Replace the original with the stripped version + mv "${temp_dir}/stripped_lib" "${base_dir}/${output_lib}" + else + # macOS style dSYM + # First strip debug info to a separate file + xcrun strip -S "${base_dir}/${output_lib}" -o "${temp_dir}/stripped_lib" + + # Generate dSYM in the dSYMs directory + xcrun dsymutil "${base_dir}/${output_lib}" -o "${base_dir}/${build_dir}/dSYMs/llama.dSYM" + + # Replace original binary with stripped version + mv "${temp_dir}/stripped_lib" "${base_dir}/${output_lib}" + fi + + # Remove any automatically generated dSYM files in the framework structure as they will + # otherwise case Invalid Bundle Structure validation errors. + if [ -d "${base_dir}/${output_lib}.dSYM" ]; then + echo "Removing generated dSYM file in framework structure: ${base_dir}/${output_lib}.dSYM" + rm -rf "${base_dir}/${output_lib}.dSYM" + fi + + # Clean up + rm -rf "${temp_dir}" +} + +echo "Building for iOS simulator..." +cmake -B build-ios-sim -G Xcode \ + "${COMMON_CMAKE_ARGS[@]}" \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \ + -DIOS=ON \ + -DCMAKE_SYSTEM_NAME=iOS \ + -DCMAKE_OSX_SYSROOT=iphonesimulator \ + -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \ + -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphonesimulator \ + -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ + -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ + -S . +cmake --build build-ios-sim --config Release -- -quiet + +echo "Building for iOS devices..." +cmake -B build-ios-device -G Xcode \ + "${COMMON_CMAKE_ARGS[@]}" \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \ + -DCMAKE_OSX_SYSROOT=iphoneos \ + -DCMAKE_OSX_ARCHITECTURES="arm64" \ + -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \ + -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ + -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ + -S . +cmake --build build-ios-device --config Release -- -quiet + +echo "Building for macOS..." +cmake -B build-macos -G Xcode \ + "${COMMON_CMAKE_ARGS[@]}" \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=${MACOS_MIN_OS_VERSION} \ + -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \ + -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ + -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ + -S . +cmake --build build-macos --config Release -- -quiet + +echo "Building for visionOS..." +cmake -B build-visionos -G Xcode \ + "${COMMON_CMAKE_ARGS[@]}" \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=${VISIONOS_MIN_OS_VERSION} \ + -DCMAKE_OSX_ARCHITECTURES="arm64" \ + -DCMAKE_SYSTEM_NAME=visionOS \ + -DCMAKE_OSX_SYSROOT=xros \ + -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \ + -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \ + -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \ + -S . +cmake --build build-visionos --config Release -- -quiet + +echo "Building for visionOS simulator..." 
+cmake -B build-visionos-sim -G Xcode \ + "${COMMON_CMAKE_ARGS[@]}" \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=${VISIONOS_MIN_OS_VERSION} \ + -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \ + -DCMAKE_SYSTEM_NAME=visionOS \ + -DCMAKE_OSX_SYSROOT=xrsimulator \ + -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \ + -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \ + -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \ + -S . +cmake --build build-visionos-sim --config Release -- -quiet + +# Add tvOS builds (might need the same u_int definitions as watchOS and visionOS) +echo "Building for tvOS simulator..." +cmake -B build-tvos-sim -G Xcode \ + "${COMMON_CMAKE_ARGS[@]}" \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=${TVOS_MIN_OS_VERSION} \ + -DCMAKE_SYSTEM_NAME=tvOS \ + -DCMAKE_OSX_SYSROOT=appletvsimulator \ + -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \ + -DGGML_METAL=ON \ + -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvsimulator \ + -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ + -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ + -S . +cmake --build build-tvos-sim --config Release -- -quiet + +echo "Building for tvOS devices..." +cmake -B build-tvos-device -G Xcode \ + "${COMMON_CMAKE_ARGS[@]}" \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=${TVOS_MIN_OS_VERSION} \ + -DCMAKE_SYSTEM_NAME=tvOS \ + -DCMAKE_OSX_SYSROOT=appletvos \ + -DCMAKE_OSX_ARCHITECTURES="arm64" \ + -DGGML_METAL=ON \ + -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvos \ + -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ + -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ + -S . +cmake --build build-tvos-device --config Release -- -quiet + +# Setup frameworks and copy binaries and headers +echo "Setting up framework structures..." +setup_framework_structure "build-ios-sim" ${IOS_MIN_OS_VERSION} "ios" +setup_framework_structure "build-ios-device" ${IOS_MIN_OS_VERSION} "ios" +setup_framework_structure "build-macos" ${MACOS_MIN_OS_VERSION} "macos" +setup_framework_structure "build-visionos" ${VISIONOS_MIN_OS_VERSION} "visionos" +setup_framework_structure "build-visionos-sim" ${VISIONOS_MIN_OS_VERSION} "visionos" +setup_framework_structure "build-tvos-sim" ${TVOS_MIN_OS_VERSION} "tvos" +setup_framework_structure "build-tvos-device" ${TVOS_MIN_OS_VERSION} "tvos" + +# Create dynamic libraries from static libraries +echo "Creating dynamic libraries from static libraries..." +combine_static_libraries "build-ios-sim" "Release-iphonesimulator" "ios" "true" +combine_static_libraries "build-ios-device" "Release-iphoneos" "ios" "false" +combine_static_libraries "build-macos" "Release" "macos" "false" +combine_static_libraries "build-visionos" "Release-xros" "visionos" "false" +combine_static_libraries "build-visionos-sim" "Release-xrsimulator" "visionos" "true" +combine_static_libraries "build-tvos-sim" "Release-appletvsimulator" "tvos" "true" +combine_static_libraries "build-tvos-device" "Release-appletvos" "tvos" "false" + +# Create XCFramework with correct debug symbols paths +echo "Creating XCFramework..." 
+xcodebuild -create-xcframework \
+    -framework $(pwd)/build-ios-sim/framework/llama.framework \
+    -debug-symbols $(pwd)/build-ios-sim/dSYMs/llama.dSYM \
+    -framework $(pwd)/build-ios-device/framework/llama.framework \
+    -debug-symbols $(pwd)/build-ios-device/dSYMs/llama.dSYM \
+    -framework $(pwd)/build-macos/framework/llama.framework \
+    -debug-symbols $(pwd)/build-macos/dSYMs/llama.dSYM \
+    -framework $(pwd)/build-visionos/framework/llama.framework \
+    -debug-symbols $(pwd)/build-visionos/dSYMs/llama.dSYM \
+    -framework $(pwd)/build-visionos-sim/framework/llama.framework \
+    -debug-symbols $(pwd)/build-visionos-sim/dSYMs/llama.dSYM \
+    -framework $(pwd)/build-tvos-device/framework/llama.framework \
+    -debug-symbols $(pwd)/build-tvos-device/dSYMs/llama.dSYM \
+    -framework $(pwd)/build-tvos-sim/framework/llama.framework \
+    -debug-symbols $(pwd)/build-tvos-sim/dSYMs/llama.dSYM \
+    -output $(pwd)/build-apple/llama.xcframework
diff --git a/examples/llama.swiftui/README.md b/examples/llama.swiftui/README.md
index f717886d6..5b0ee9472 100644
--- a/examples/llama.swiftui/README.md
+++ b/examples/llama.swiftui/README.md
@@ -5,6 +5,21 @@ point for more advanced projects.
 For usage instructions and performance stats, check the following discussion: https://github.com/ggml-org/llama.cpp/discussions/4508
+
+### Building
+First, llama.cpp needs to be built and an XCFramework needs to be created. This can be done by running
+the following script from the llama.cpp project root:
+```console
+$ ./build-xcframework.sh
+```
+Open the `llama.swiftui.xcodeproj` project in Xcode and you should be able to build and run the app on
+a simulator or a real device.
+
+To use the framework with a different project, the XCFramework can be added to the project by
+adding `build-ios/llama.xcframework` by dragging and dropping it into the project navigator, or
+by manually selecting the framework in the "Frameworks, Libraries, and Embedded Content" section
+of the project settings.
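As a quick sanity check before embedding the framework, the generated bundle can also be inspected from the command line; this is a minimal sketch assuming the default `build-apple` output location produced by `build-xcframework.sh` (the slice directory name shown is illustrative and may differ on your machine):

```console
# list the platform slices packaged into the XCFramework (output path assumed from build-xcframework.sh)
$ ls build-apple/llama.xcframework
# print the XCFramework manifest describing each slice
$ plutil -p build-apple/llama.xcframework/Info.plist
# check the architectures in one slice (directory name is illustrative)
$ lipo -info build-apple/llama.xcframework/macos-arm64_x86_64/llama.framework/llama
```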
+ ![image](https://github.com/ggml-org/llama.cpp/assets/1991296/2b40284f-8421-47a2-b634-74eece09a299) Video demonstration: diff --git a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj index ff3d108b2..6f08fe220 100644 --- a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj +++ b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj @@ -7,7 +7,6 @@ objects = { /* Begin PBXBuildFile section */ - 1809696D2D05A39F00400EE8 /* llama in Frameworks */ = {isa = PBXBuildFile; productRef = 1809696C2D05A39F00400EE8 /* llama */; }; 549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; }; 79E1D9CD2B4CD16E005F8E46 /* InputButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */; }; 7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; }; @@ -18,9 +17,25 @@ 8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; }; 8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; }; 8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; }; + DD84C9FD2D747FED007778EC /* llama.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = DD84C9FC2D747FED007778EC /* llama.xcframework */; }; + DD84C9FE2D747FED007778EC /* llama.xcframework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = DD84C9FC2D747FED007778EC /* llama.xcframework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; }; F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */; }; /* End PBXBuildFile section */ +/* Begin PBXCopyFilesBuildPhase section */ + DD84C9FF2D747FED007778EC /* Embed Frameworks */ = { + isa = PBXCopyFilesBuildPhase; + buildActionMask = 2147483647; + dstPath = ""; + dstSubfolderSpec = 10; + files = ( + DD84C9FE2D747FED007778EC /* llama.xcframework in Embed Frameworks */, + ); + name = "Embed Frameworks"; + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXCopyFilesBuildPhase section */ + /* Begin PBXFileReference section */ 549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; }; 79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InputButton.swift; sourceTree = ""; }; @@ -33,6 +48,7 @@ 8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = ""; }; 8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = ""; }; 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = ""; }; + DD84C9FC2D747FED007778EC /* llama.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = llama.xcframework; path = 
"../../build-apple/llama.xcframework"; sourceTree = ""; }; DF2D2FE72B4A59BE00FCB72D /* llama.cpp */ = {isa = PBXFileReference; lastKnownFileType = wrapper; name = llama.cpp; path = ../..; sourceTree = ""; }; F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LoadCustomButton.swift; sourceTree = ""; }; /* End PBXFileReference section */ @@ -42,9 +58,9 @@ isa = PBXFrameworksBuildPhase; buildActionMask = 2147483647; files = ( - 1809696D2D05A39F00400EE8 /* llama in Frameworks */, 549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */, 8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */, + DD84C9FD2D747FED007778EC /* llama.xcframework in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -86,6 +102,7 @@ 8A39BE082AC7601000BFEB40 /* Frameworks */ = { isa = PBXGroup; children = ( + DD84C9FC2D747FED007778EC /* llama.xcframework */, 549479CA2AC9E16000E0F78B /* Metal.framework */, 8A39BE092AC7601000BFEB40 /* Accelerate.framework */, ); @@ -144,6 +161,7 @@ 8A1C836F2AC328BD0096AF73 /* Sources */, 8A1C83702AC328BD0096AF73 /* Frameworks */, 8A1C83712AC328BD0096AF73 /* Resources */, + DD84C9FF2D747FED007778EC /* Embed Frameworks */, ); buildRules = ( ); @@ -151,7 +169,6 @@ ); name = llama.swiftui; packageProductDependencies = ( - 1809696C2D05A39F00400EE8 /* llama */, ); productName = llama.swiftui; productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */; @@ -427,13 +444,6 @@ defaultConfigurationName = Release; }; /* End XCConfigurationList section */ - -/* Begin XCSwiftPackageProductDependency section */ - 1809696C2D05A39F00400EE8 /* llama */ = { - isa = XCSwiftPackageProductDependency; - productName = llama; - }; -/* End XCSwiftPackageProductDependency section */ }; rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */; } diff --git a/scripts/apple/validate-apps.sh b/scripts/apple/validate-apps.sh new file mode 100755 index 000000000..a571aa6fc --- /dev/null +++ b/scripts/apple/validate-apps.sh @@ -0,0 +1,5 @@ +#!/bin/bash +./scripts/apple/validate-ios.sh +./scripts/apple/validate-macos.sh +./scripts/apple/validate-visionos.sh +./scripts/apple/validate-tvos.sh diff --git a/scripts/apple/validate-ios.sh b/scripts/apple/validate-ios.sh new file mode 100755 index 000000000..7bda1b972 --- /dev/null +++ b/scripts/apple/validate-ios.sh @@ -0,0 +1,820 @@ +#!/bin/bash +# validate-ios.sh - Validate iOS Application with embedded llama.xcframework using SwiftUI + +# Authentication options (optional) (can be set via environment variables) +# To use: export APPLE_ID=your.email@example.com +# export APPLE_PASSWORD=your-app-specific-password +# ./validate-ios.sh +APPLE_ID=${APPLE_ID:-""} +APPLE_PASSWORD=${APPLE_PASSWORD:-""} + +# Ensure the script exits on error +set -e + +# Function to print usage instructions +print_usage() { + echo "Usage: ./validate-ios.sh [OPTIONS]" + echo "" + echo "Options:" + echo " --help Show this help message" + echo " --apple-id EMAIL Apple ID email for validation" + echo " --apple-password PWD App-specific password for Apple ID" + echo "" + echo "Environment variables:" + echo " APPLE_ID Apple ID email for validation" + echo " APPLE_PASSWORD App-specific password for Apple ID" + echo "" + echo "Notes:" + echo " - Command line options take precedence over environment variables" + echo " - Authentication is optional. 
If not provided, alternative validation will be performed" + echo " - For APPLE_PASSWORD, use an app-specific password generated at https://appleid.apple.com/account/manage" +} + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + --help) + print_usage + exit 0 + ;; + --apple-id) + APPLE_ID="$2" + shift 2 + ;; + --apple-password) + APPLE_PASSWORD="$2" + shift 2 + ;; + *) + echo "Unknown option: $1" + print_usage + exit 1 + ;; + esac +done + +# Function to clean up in case of error +cleanup() { + # Don't clean up temp files on error to help with debugging + echo "===== iOS Validation Process Failed =====" + exit 1 +} + +# Set up trap to call cleanup function on error +trap cleanup ERR + +set -e # Exit on any error + +ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../.." && pwd )" +BUILD_DIR="${ROOT_DIR}/validation-builds/ios" + +# Configuration +APP_NAME="iOSLlamaTest" +BUNDLE_ID="org.ggml.iOSLlamaTest" +XCFRAMEWORK_PATH="${ROOT_DIR}/build-apple/llama.xcframework" +TEMP_DIR="${BUILD_DIR}/temp" +ARCHIVE_PATH="${BUILD_DIR}/${APP_NAME}.xcarchive" +IPA_PATH="${BUILD_DIR}/${APP_NAME}.ipa" +VALIDATION_DIR="${BUILD_DIR}/validation" + +# Create necessary directories +mkdir -p "${BUILD_DIR}" +mkdir -p "${TEMP_DIR}" +mkdir -p "${VALIDATION_DIR}" + +echo "===== iOS Validation Process Started =====" + +# 1. Create a simple test app project +echo "Creating test iOS app project..." +mkdir -p "${TEMP_DIR}/${APP_NAME}/${APP_NAME}" +cat > "${TEMP_DIR}/${APP_NAME}/${APP_NAME}/Info.plist" << EOF + + + + + CFBundleDevelopmentRegion + en + CFBundleExecutable + ${APP_NAME} + CFBundleIdentifier + ${BUNDLE_ID} + CFBundleInfoDictionaryVersion + 6.0 + CFBundleName + ${APP_NAME} + CFBundlePackageType + APPL + CFBundleShortVersionString + 1.0 + CFBundleVersion + 1 + LSRequiresIPhoneOS + + UILaunchScreen + + UIRequiredDeviceCapabilities + + armv7 + + UISupportedInterfaceOrientations + + UIInterfaceOrientationPortrait + + + +EOF + +# Create SwiftUI app files +mkdir -p "${TEMP_DIR}/${APP_NAME}/${APP_NAME}/Sources" + +# Create App.swift +cat > "${TEMP_DIR}/${APP_NAME}/${APP_NAME}/Sources/App.swift" << EOF +import SwiftUI +import llama + +@main +struct LlamaTestApp: App { + var body: some Scene { + WindowGroup { + ContentView() + } + } +} +EOF + +# Create ContentView.swift +cat > "${TEMP_DIR}/${APP_NAME}/${APP_NAME}/Sources/ContentView.swift" << EOF +import SwiftUI +import llama + +struct ContentView: View { + // Test that we can initialize a llama context params struct + let params = llama_context_default_params() + + var body: some View { + VStack(spacing: 20) { + Text("Llama Framework Test") + .font(.largeTitle) + .padding() + + Text("llama_context_default_params() created successfully") + .font(.headline) + .multilineTextAlignment(.center) + .padding() + + // Display some param values to confirm the framework is working + Text("n_ctx: \(params.n_ctx)") + .font(.body) + + Text("n_batch: \(params.n_batch)") + .font(.body) + + Spacer() + } + .padding() + } +} + +struct ContentView_Previews: PreviewProvider { + static var previews: some View { + ContentView() + } +} +EOF + +# Create project.pbxproj, fixing the framework search paths issues +mkdir -p "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj" +cat > "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << 'EOF' +// !$*UTF8*$! 
+{ + archiveVersion = 1; + classes = { + }; + objectVersion = 54; + objects = { + +/* Begin PBXBuildFile section */ + 11111111111111111111111 /* App.swift in Sources */ = {isa = PBXBuildFile; fileRef = 22222222222222222222222; }; + 33333333333333333333333 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 44444444444444444444444; }; + 55555555555555555555555 /* llama.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 66666666666666666666666; }; + 77777777777777777777777 /* llama.xcframework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = 66666666666666666666666; }; +/* End PBXBuildFile section */ + +/* Begin PBXCopyFilesBuildPhase section */ + 88888888888888888888888 /* Embed Frameworks */ = { + isa = PBXCopyFilesBuildPhase; + buildActionMask = 2147483647; + dstPath = ""; + dstSubfolderSpec = 10; + files = ( + 77777777777777777777777 /* llama.xcframework in Embed Frameworks */, + ); + name = "Embed Frameworks"; + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXCopyFilesBuildPhase section */ + +/* Begin PBXFileReference section */ +EOF + +# Continue with the project.pbxproj file, using the APP_NAME variable appropriately +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << EOF + 99999999999999999999999 /* ${APP_NAME}.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = "${APP_NAME}.app"; sourceTree = BUILT_PRODUCTS_DIR; }; + 22222222222222222222222 /* App.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = App.swift; sourceTree = ""; }; + 44444444444444444444444 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; + AAAAAAAAAAAAAAAAAAAAAAA /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; + 66666666666666666666666 /* llama.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; path = llama.xcframework; sourceTree = ""; }; +/* End PBXFileReference section */ +EOF + +# Add the rest of the project file with fixed framework search paths +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << 'EOF' +/* Begin PBXFrameworksBuildPhase section */ + BBBBBBBBBBBBBBBBBBBBBBBB /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + 55555555555555555555555 /* llama.xcframework in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ +EOF + +# Continue with the project.pbxproj file, using the APP_NAME variable appropriately +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << EOF + CCCCCCCCCCCCCCCCCCCCCCCC /* Products */ = { + isa = PBXGroup; + children = ( + 99999999999999999999999 /* ${APP_NAME}.app */, + ); + name = Products; + sourceTree = ""; + }; +EOF + +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << 'EOF' + DDDDDDDDDDDDDDDDDDDDDDDD /* Frameworks */ = { + isa = PBXGroup; + children = ( + 66666666666666666666666 /* llama.xcframework */, + ); + name = Frameworks; + sourceTree = ""; + }; + EEEEEEEEEEEEEEEEEEEEEEEE = { + isa = PBXGroup; + children = ( + FFFFFFFFFFFFFFFFFFFFFFFF /* iOSLlamaTest */, + CCCCCCCCCCCCCCCCCCCCCCCC /* Products */, + DDDDDDDDDDDDDDDDDDDDDDDD /* Frameworks */, + ); + sourceTree = ""; + }; + FFFFFFFFFFFFFFFFFFFFFFFF /* iOSLlamaTest */ = { + isa = PBXGroup; + children = 
( + 1111111111111111111111AA /* Sources */, + AAAAAAAAAAAAAAAAAAAAAAA /* Info.plist */, + ); + path = "iOSLlamaTest"; + sourceTree = ""; + }; + 1111111111111111111111AA /* Sources */ = { + isa = PBXGroup; + children = ( + 22222222222222222222222 /* App.swift */, + 44444444444444444444444 /* ContentView.swift */, + ); + path = Sources; + sourceTree = ""; + }; +/* End PBXGroup section */ +EOF + +# Continue with the project.pbxproj file, using the APP_NAME variable appropriately +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << EOF +/* Begin PBXNativeTarget section */ + 3333333333333333333333AA /* ${APP_NAME} */ = { + isa = PBXNativeTarget; + buildConfigurationList = 4444444444444444444444AA /* Build configuration list for PBXNativeTarget "${APP_NAME}" */; + buildPhases = ( + 5555555555555555555555AA /* Sources */, + BBBBBBBBBBBBBBBBBBBBBBBB /* Frameworks */, + 6666666666666666666666AA /* Resources */, + 88888888888888888888888 /* Embed Frameworks */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = "${APP_NAME}"; + productName = "${APP_NAME}"; + productReference = 99999999999999999999999 /* ${APP_NAME}.app */; + productType = "com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + 7777777777777777777777AA /* Project object */ = { + isa = PBXProject; + attributes = { + LastSwiftUpdateCheck = 1240; + LastUpgradeCheck = 1240; + TargetAttributes = { + 3333333333333333333333AA = { + CreatedOnToolsVersion = 12.4; + }; + }; + }; + buildConfigurationList = 8888888888888888888888AA /* Build configuration list for PBXProject "${APP_NAME}" */; + compatibilityVersion = "Xcode 12.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = EEEEEEEEEEEEEEEEEEEEEEEE; + productRefGroup = CCCCCCCCCCCCCCCCCCCCCCCC /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + 3333333333333333333333AA /* ${APP_NAME} */, + ); + }; +/* End PBXProject section */ +EOF + +# Add the rest of the file with correct FRAMEWORK_SEARCH_PATHS +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << 'EOF' +/* Begin PBXResourcesBuildPhase section */ + 6666666666666666666666AA /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + 5555555555555555555555AA /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 33333333333333333333333 /* ContentView.swift in Sources */, + 11111111111111111111111 /* App.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + 9999999999999999999999AA /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LIBRARY = "libc++"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + 
CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.4; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = iphoneos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + AAAAAAAAAAAAAAAAAAAAABBB /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LIBRARY = "libc++"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + IPHONEOS_DEPLOYMENT_TARGET = 16.4; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SDKROOT = iphoneos; + SWIFT_COMPILATION_MODE = wholemodule; + SWIFT_OPTIMIZATION_LEVEL = "-O"; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + BBBBBBBBBBBBBBBBBBBBBBCCC /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + 
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Manual; + DEVELOPMENT_TEAM = ""; + ENABLE_PREVIEWS = YES; + FRAMEWORK_SEARCH_PATHS = "$(PROJECT_DIR)"; + INFOPLIST_FILE = "iOSLlamaTest/Info.plist"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + PRODUCT_BUNDLE_IDENTIFIER = "org.ggml.iOSLlamaTest"; + PRODUCT_NAME = "$(TARGET_NAME)"; + PROVISIONING_PROFILE_SPECIFIER = ""; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Debug; + }; + CCCCCCCCCCCCCCCCCCCCCCDDD /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Manual; + DEVELOPMENT_TEAM = ""; + ENABLE_PREVIEWS = YES; + FRAMEWORK_SEARCH_PATHS = ( + "$(inherited)", + "$(PROJECT_DIR)", + ); + INFOPLIST_FILE = "iOSLlamaTest/Info.plist"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + PRODUCT_BUNDLE_IDENTIFIER = "org.ggml.iOSLlamaTest"; + PRODUCT_NAME = "$(TARGET_NAME)"; + PROVISIONING_PROFILE_SPECIFIER = ""; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ +EOF + +# Finish the project.pbxproj file +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << EOF +/* Begin XCConfigurationList section */ + 8888888888888888888888AA /* Build configuration list for PBXProject "${APP_NAME}" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 9999999999999999999999AA /* Debug */, + AAAAAAAAAAAAAAAAAAAAABBB /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 4444444444444444444444AA /* Build configuration list for PBXNativeTarget "${APP_NAME}" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + BBBBBBBBBBBBBBBBBBBBBBCCC /* Debug */, + CCCCCCCCCCCCCCCCCCCCCCDDD /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + }; + rootObject = 7777777777777777777777AA /* Project object */; +} +EOF + +# 2. Copy XCFramework to test project +echo "Copying XCFramework to test project..." +cp -R "${XCFRAMEWORK_PATH}" "${TEMP_DIR}/${APP_NAME}/" + +# 3. Build and archive the app +echo "Building and archiving test app..." +cd "${TEMP_DIR}/${APP_NAME}" + +# Create a simple xcscheme file to avoid xcodebuild scheme issues +mkdir -p "${APP_NAME}.xcodeproj/xcshareddata/xcschemes" +cat > "${APP_NAME}.xcodeproj/xcshareddata/xcschemes/${APP_NAME}.xcscheme" << EOF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +EOF + +# Now use xcodebuild with an explicitly defined product name +xcodebuild -project "${APP_NAME}.xcodeproj" -scheme "${APP_NAME}" -sdk iphoneos -configuration Release archive -archivePath "${ARCHIVE_PATH}" CODE_SIGN_IDENTITY="-" CODE_SIGNING_REQUIRED=NO CODE_SIGNING_ALLOWED=NO PRODUCT_NAME="${APP_NAME}" SWIFT_OPTIMIZATION_LEVEL="-Onone" -quiet + +# 4. Create IPA from archive +echo "Creating IPA from archive..." 
+mkdir -p "${TEMP_DIR}/Payload" +cp -R "${ARCHIVE_PATH}/Products/Applications/${APP_NAME}.app" "${TEMP_DIR}/Payload/" + +# Check and log app structure before zipping +echo "App structure:" +ls -la "${TEMP_DIR}/Payload/${APP_NAME}.app/" +echo "Frameworks:" +ls -la "${TEMP_DIR}/Payload/${APP_NAME}.app/Frameworks/" 2>/dev/null || echo "No Frameworks directory found" + +cd "${TEMP_DIR}" +zip -r "${IPA_PATH}" Payload + +# Check embedded provisioning profile +echo "Checking provisioning profile (if any)..." +PROVISIONING_PROFILE=$(find "${ARCHIVE_PATH}/Products/Applications/${APP_NAME}.app" -name "embedded.mobileprovision" 2>/dev/null) +if [ -n "$PROVISIONING_PROFILE" ]; then + echo "Found embedded provisioning profile:" + security cms -D -i "$PROVISIONING_PROFILE" || echo "Unable to decode provisioning profile" +else + echo "No embedded provisioning profile found (expected for ad-hoc builds)" +fi + +# 5. Validate the IPA +echo "Validating IPA..." +VALIDATION_OUTPUT="${VALIDATION_DIR}/validation_output.txt" + +# Check if authentication credentials are provided +AUTH_ARGS="" +if [ -n "$APPLE_ID" ] && [ -n "$APPLE_PASSWORD" ]; then + echo "Using Apple ID authentication for validation..." + AUTH_ARGS="--username \"$APPLE_ID\" --password \"$APPLE_PASSWORD\"" +else + echo "No authentication credentials provided. Will perform basic validation." + echo "To use your personal developer account, you can run the script with:" + echo " APPLE_ID='your.email@example.com' APPLE_PASSWORD='your-app-specific-password' ./validate-ios.sh" + echo "Note: You need to create an app-specific password at https://appleid.apple.com/account/manage" +fi + +# Run validation with detailed output +echo "Running validation with altool..." +if [ -n "$AUTH_ARGS" ]; then + # Use eval to properly handle the quoted arguments + eval "xcrun altool --validate-app -f \"${IPA_PATH}\" --type ios --output-format xml $AUTH_ARGS" 2>&1 | tee "${VALIDATION_OUTPUT}" +else + xcrun altool --validate-app -f "${IPA_PATH}" --type ios --output-format xml 2>&1 | tee "${VALIDATION_OUTPUT}" +fi +VALIDATION_RESULT=$? + +# Final validation result +FINAL_VALIDATION_RESULT=0 + +# Check if validation failed because the app isn't in App Store Connect +if grep -q "No suitable application records were found" "${VALIDATION_OUTPUT}"; then + echo "⚠️ App Store Connect Warning: The app bundle identifier is not found in App Store Connect" + echo "This is expected for apps that haven't been registered in App Store Connect yet." + echo "This doesn't indicate a problem with the build or framework." + + # Perform alternative validation + echo "Performing alternative validation checks..." 
+ + # Check if IPA was created successfully + if [ -f "${IPA_PATH}" ] && [ -s "${IPA_PATH}" ]; then + echo "✅ IPA file created successfully" + else + echo "❌ IPA file not created or empty" + FINAL_VALIDATION_RESULT=1 + fi + + # Check if app binary exists and is executable + if [ -f "${TEMP_DIR}/Payload/${APP_NAME}.app/${APP_NAME}" ] && [ -x "${TEMP_DIR}/Payload/${APP_NAME}.app/${APP_NAME}" ]; then + echo "✅ App binary exists and is executable" + else + echo "❌ App binary missing or not executable" + FINAL_VALIDATION_RESULT=1 + fi + + # Check if framework was properly embedded + if [ -d "${TEMP_DIR}/Payload/${APP_NAME}.app/Frameworks/llama.framework" ]; then + echo "✅ llama.framework properly embedded" + else + echo "❌ llama.framework not properly embedded" + FINAL_VALIDATION_RESULT=1 + fi + + # Check if framework binary exists + if [ -f "${TEMP_DIR}/Payload/${APP_NAME}.app/Frameworks/llama.framework/llama" ]; then + echo "✅ Framework binary exists" + + # Further validate framework by checking architecture + ARCHS=$(lipo -info "${TEMP_DIR}/Payload/${APP_NAME}.app/Frameworks/llama.framework/llama" 2>/dev/null | grep -o "arm64\\|armv7\\|x86_64" | tr '\n' ' ') + if [ -n "$ARCHS" ]; then + echo "✅ Framework architecture(s): $ARCHS" + else + echo "⚠️ Could not determine framework architecture" + fi + else + echo "❌ Framework binary missing" + FINAL_VALIDATION_RESULT=1 + fi + + if [ $FINAL_VALIDATION_RESULT -eq 0 ]; then + echo "✅ Alternative validation PASSED: App built successfully with embedded framework" + else + echo "❌ Alternative validation FAILED: Issues found with the app or framework" + fi +elif grep -q "You must specify authentication credentials" "${VALIDATION_OUTPUT}" && [ -z "$AUTH_ARGS" ]; then + echo "✅ iOS Validation PASSED: IPA successfully validated" + echo "Results saved to ${VALIDATION_OUTPUT}" +else + echo "❌ iOS Validation FAILED: IPA validation found issues" + echo "See validation output at ${VALIDATION_OUTPUT}" + echo "" + echo "==== VALIDATION ERRORS ====" + + # Try to extract specific errors from the output + if grep -q "Error" "${VALIDATION_OUTPUT}"; then + grep -A 5 "Error" "${VALIDATION_OUTPUT}" + else + # If no specific error found, show the whole log + cat "${VALIDATION_OUTPUT}" + fi + + # Additional debugging: check IPA contents + echo "" + echo "==== IPA CONTENTS ====" + mkdir -p "${TEMP_DIR}/ipa_contents" + unzip -q "${IPA_PATH}" -d "${TEMP_DIR}/ipa_contents" + ls -la "${TEMP_DIR}/ipa_contents/Payload/${APP_NAME}.app/" + + # Check for code signing issues + echo "" + echo "==== CODE SIGNING INFO ====" + codesign -vv -d "${TEMP_DIR}/ipa_contents/Payload/${APP_NAME}.app" 2>&1 || echo "Code signing verification failed" + + # Check embedded frameworks + echo "" + echo "==== FRAMEWORK INFO ====" + ls -la "${TEMP_DIR}/ipa_contents/Payload/${APP_NAME}.app/Frameworks/" 2>/dev/null || echo "No Frameworks directory found" +fi + +# Don't clean up on error to allow inspection +if [ $FINAL_VALIDATION_RESULT -ne 0 ]; then + echo "" + echo "Temporary files kept for inspection at: ${TEMP_DIR}" + echo "===== iOS Validation Process Failed =====" + exit 1 +fi + +# Clean up temporary files but keep build artifacts +if [ $FINAL_VALIDATION_RESULT -eq 0 ]; then + echo "Cleaning up temporary files..." 
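+# The rm below is left commented out so the generated project and build products stay on disk
+# for inspection; uncomment it to reclaim the space after a successful run.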
+ #rm -rf "${TEMP_DIR}" +fi + +echo "===== iOS Validation Process Completed =====" +exit $FINAL_VALIDATION_RESULT diff --git a/scripts/apple/validate-macos.sh b/scripts/apple/validate-macos.sh new file mode 100755 index 000000000..6dc28e694 --- /dev/null +++ b/scripts/apple/validate-macos.sh @@ -0,0 +1,781 @@ +#!/bin/bash +# validate-macos.sh - Validate macOS Application with embedded llama.xcframework using SwiftUI + +# Authentication options (optional) (can be set via environment variables) +# To use: export APPLE_ID=your.email@example.com +# export APPLE_PASSWORD=your-app-specific-password +# ./validate-macos.sh +APPLE_ID=${APPLE_ID:-""} +APPLE_PASSWORD=${APPLE_PASSWORD:-""} + +# Ensure the script exits on error +set -e + +# Function to print usage instructions +print_usage() { + echo "Usage: ./validate-macos.sh [OPTIONS]" + echo "" + echo "Options:" + echo " --help Show this help message" + echo " --apple-id EMAIL Apple ID email for validation" + echo " --apple-password PWD App-specific password for Apple ID" + echo "" + echo "Environment variables:" + echo " APPLE_ID Apple ID email for validation" + echo " APPLE_PASSWORD App-specific password for Apple ID" + echo "" + echo "Notes:" + echo " - Command line options take precedence over environment variables" + echo " - Authentication is optional. If not provided, alternative validation will be performed" + echo " - For APPLE_PASSWORD, use an app-specific password generated at https://appleid.apple.com/account/manage" +} + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + --help) + print_usage + exit 0 + ;; + --apple-id) + APPLE_ID="$2" + shift 2 + ;; + --apple-password) + APPLE_PASSWORD="$2" + shift 2 + ;; + *) + echo "Unknown option: $1" + print_usage + exit 1 + ;; + esac +done + +# Function to clean up in case of error +cleanup() { + # Don't clean up temp files on error to help with debugging + echo "===== macOS Validation Process Failed =====" + exit 1 +} + +# Set up trap to call cleanup function on error +trap cleanup ERR + +set -e # Exit on any error + +ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../.." && pwd )" +BUILD_DIR="${ROOT_DIR}/validation-builds/ios" + +# Configuration +APP_NAME="MacOSLlamaTest" +BUNDLE_ID="org.ggml.MacOSLlamaTest" +XCFRAMEWORK_PATH="${ROOT_DIR}/build-apple/llama.xcframework" +TEMP_DIR="${BUILD_DIR}/temp" +ARCHIVE_PATH="${BUILD_DIR}/${APP_NAME}.xcarchive" +APP_PATH="${BUILD_DIR}/${APP_NAME}.app" +ZIP_PATH="${BUILD_DIR}/${APP_NAME}.zip" +VALIDATION_DIR="${BUILD_DIR}/validation" + +# Create necessary directories +mkdir -p "${BUILD_DIR}" +mkdir -p "${TEMP_DIR}" +mkdir -p "${VALIDATION_DIR}" + +echo "===== macOS Validation Process Started =====" + +# 1. Create a simple test app project +echo "Creating test macOS app project..." +mkdir -p "${TEMP_DIR}/${APP_NAME}/${APP_NAME}" +cat > "${TEMP_DIR}/${APP_NAME}/${APP_NAME}/Info.plist" << EOF + + + + + CFBundleDevelopmentRegion + en + CFBundleExecutable + ${APP_NAME} + CFBundleIdentifier + ${BUNDLE_ID} + CFBundleInfoDictionaryVersion + 6.0 + CFBundleName + ${APP_NAME} + CFBundlePackageType + APPL + CFBundleShortVersionString + 1.0 + CFBundleVersion + 1 + LSMinimumSystemVersion + 12.0 + NSHumanReadableCopyright + Copyright © 2025 GGML. All rights reserved. 
+ NSPrincipalClass + NSApplication + + +EOF + +# Create SwiftUI app files +mkdir -p "${TEMP_DIR}/${APP_NAME}/${APP_NAME}/Sources" + +# Create App.swift +cat > "${TEMP_DIR}/${APP_NAME}/${APP_NAME}/Sources/App.swift" << EOF +import SwiftUI +import llama + +@main +struct LlamaTestApp: App { + var body: some Scene { + WindowGroup { + ContentView() + } + } +} +EOF + +# Create ContentView.swift with macOS specific elements +cat > "${TEMP_DIR}/${APP_NAME}/${APP_NAME}/Sources/ContentView.swift" << EOF +import SwiftUI +import llama + +struct ContentView: View { + // Test that we can initialize a llama context params struct + let params = llama_context_default_params() + + var body: some View { + VStack(spacing: 20) { + Text("Llama Framework Test on macOS") + .font(.largeTitle) + .padding() + + Text("llama_context_default_params() created successfully") + .font(.headline) + .multilineTextAlignment(.center) + .padding() + + // Display some param values to confirm the framework is working + Text("n_ctx: \(params.n_ctx)") + .font(.body) + + Text("n_batch: \(params.n_batch)") + .font(.body) + + Spacer() + } + .padding() + .frame(width: 600, height: 400) + } +} + +struct ContentView_Previews: PreviewProvider { + static var previews: some View { + ContentView() + } +} +EOF + +# Create project.pbxproj, fixing the framework search paths issues +mkdir -p "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj" +cat > "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << 'EOF' +// !$*UTF8*$! +{ + archiveVersion = 1; + classes = { + }; + objectVersion = 54; + objects = { + +/* Begin PBXBuildFile section */ + 11111111111111111111111 /* App.swift in Sources */ = {isa = PBXBuildFile; fileRef = 22222222222222222222222; }; + 33333333333333333333333 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 44444444444444444444444; }; + 55555555555555555555555 /* llama.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 66666666666666666666666; }; + 77777777777777777777777 /* llama.xcframework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = 66666666666666666666666; }; +/* End PBXBuildFile section */ + +/* Begin PBXCopyFilesBuildPhase section */ + 88888888888888888888888 /* Embed Frameworks */ = { + isa = PBXCopyFilesBuildPhase; + buildActionMask = 2147483647; + dstPath = ""; + dstSubfolderSpec = 10; + files = ( + 77777777777777777777777 /* llama.xcframework in Embed Frameworks */, + ); + name = "Embed Frameworks"; + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXCopyFilesBuildPhase section */ + +/* Begin PBXFileReference section */ +EOF + +# Continue with the project.pbxproj file, using the APP_NAME variable appropriately +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << EOF + 99999999999999999999999 /* ${APP_NAME}.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = "${APP_NAME}.app"; sourceTree = BUILT_PRODUCTS_DIR; }; + 22222222222222222222222 /* App.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = App.swift; sourceTree = ""; }; + 44444444444444444444444 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; + AAAAAAAAAAAAAAAAAAAAAAA /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; + 66666666666666666666666 /* llama.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; path = llama.xcframework; 
sourceTree = ""; }; +/* End PBXFileReference section */ +EOF + +# Add the rest of the project file with fixed framework search paths +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << 'EOF' +/* Begin PBXFrameworksBuildPhase section */ + BBBBBBBBBBBBBBBBBBBBBBBB /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + 55555555555555555555555 /* llama.xcframework in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ +EOF + +# Continue with the project.pbxproj file, using the APP_NAME variable appropriately +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << EOF + CCCCCCCCCCCCCCCCCCCCCCCC /* Products */ = { + isa = PBXGroup; + children = ( + 99999999999999999999999 /* ${APP_NAME}.app */, + ); + name = Products; + sourceTree = ""; + }; +EOF + +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << 'EOF' + DDDDDDDDDDDDDDDDDDDDDDDD /* Frameworks */ = { + isa = PBXGroup; + children = ( + 66666666666666666666666 /* llama.xcframework */, + ); + name = Frameworks; + sourceTree = ""; + }; + EEEEEEEEEEEEEEEEEEEEEEEE = { + isa = PBXGroup; + children = ( + FFFFFFFFFFFFFFFFFFFFFFFF /* MacOSLlamaTest */, + CCCCCCCCCCCCCCCCCCCCCCCC /* Products */, + DDDDDDDDDDDDDDDDDDDDDDDD /* Frameworks */, + ); + sourceTree = ""; + }; + FFFFFFFFFFFFFFFFFFFFFFFF /* MacOSLlamaTest */ = { + isa = PBXGroup; + children = ( + 1111111111111111111111AA /* Sources */, + AAAAAAAAAAAAAAAAAAAAAAA /* Info.plist */, + ); + path = "MacOSLlamaTest"; + sourceTree = ""; + }; + 1111111111111111111111AA /* Sources */ = { + isa = PBXGroup; + children = ( + 22222222222222222222222 /* App.swift */, + 44444444444444444444444 /* ContentView.swift */, + ); + path = Sources; + sourceTree = ""; + }; +/* End PBXGroup section */ +EOF + +# Continue with the project.pbxproj file, using the APP_NAME variable appropriately +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << EOF +/* Begin PBXNativeTarget section */ + 3333333333333333333333AA /* ${APP_NAME} */ = { + isa = PBXNativeTarget; + buildConfigurationList = 4444444444444444444444AA /* Build configuration list for PBXNativeTarget "${APP_NAME}" */; + buildPhases = ( + 5555555555555555555555AA /* Sources */, + BBBBBBBBBBBBBBBBBBBBBBBB /* Frameworks */, + 6666666666666666666666AA /* Resources */, + 88888888888888888888888 /* Embed Frameworks */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = "${APP_NAME}"; + productName = "${APP_NAME}"; + productReference = 99999999999999999999999 /* ${APP_NAME}.app */; + productType = "com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + 7777777777777777777777AA /* Project object */ = { + isa = PBXProject; + attributes = { + LastSwiftUpdateCheck = 1240; + LastUpgradeCheck = 1240; + TargetAttributes = { + 3333333333333333333333AA = { + CreatedOnToolsVersion = 12.4; + }; + }; + }; + buildConfigurationList = 8888888888888888888888AA /* Build configuration list for PBXProject "${APP_NAME}" */; + compatibilityVersion = "Xcode 12.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = EEEEEEEEEEEEEEEEEEEEEEEE; + productRefGroup = CCCCCCCCCCCCCCCCCCCCCCCC /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + 3333333333333333333333AA /* ${APP_NAME} */, + ); + }; +/* End PBXProject section */ +EOF + +# Add the 
rest of the file with correct FRAMEWORK_SEARCH_PATHS and macOS settings +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << 'EOF' +/* Begin PBXResourcesBuildPhase section */ + 6666666666666666666666AA /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + 5555555555555555555555AA /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 33333333333333333333333 /* ContentView.swift in Sources */, + 11111111111111111111111 /* App.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + 9999999999999999999999AA /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LIBRARY = "libc++"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + MACOSX_DEPLOYMENT_TARGET = 12.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = macosx; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + AAAAAAAAAAAAAAAAAAAAABBB /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LIBRARY = "libc++"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + 
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + MACOSX_DEPLOYMENT_TARGET = 12.0; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SDKROOT = macosx; + SWIFT_COMPILATION_MODE = wholemodule; + SWIFT_OPTIMIZATION_LEVEL = "-O"; + }; + name = Release; + }; + BBBBBBBBBBBBBBBBBBBBBBCCC /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Manual; + COMBINE_HIDPI_IMAGES = YES; + DEVELOPMENT_TEAM = ""; + ENABLE_HARDENED_RUNTIME = YES; + ENABLE_PREVIEWS = YES; + FRAMEWORK_SEARCH_PATHS = "$(PROJECT_DIR)"; + INFOPLIST_FILE = "MacOSLlamaTest/Info.plist"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/../Frameworks", + ); + PRODUCT_BUNDLE_IDENTIFIER = "org.ggml.MacOSLlamaTest"; + PRODUCT_NAME = "$(TARGET_NAME)"; + PROVISIONING_PROFILE_SPECIFIER = ""; + SWIFT_VERSION = 5.0; + }; + name = Debug; + }; + CCCCCCCCCCCCCCCCCCCCCCDDD /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Manual; + COMBINE_HIDPI_IMAGES = YES; + DEVELOPMENT_TEAM = ""; + ENABLE_HARDENED_RUNTIME = YES; + ENABLE_PREVIEWS = YES; + FRAMEWORK_SEARCH_PATHS = ( + "$(inherited)", + "$(PROJECT_DIR)", + ); + INFOPLIST_FILE = "MacOSLlamaTest/Info.plist"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/../Frameworks", + ); + PRODUCT_BUNDLE_IDENTIFIER = "org.ggml.MacOSLlamaTest"; + PRODUCT_NAME = "$(TARGET_NAME)"; + PROVISIONING_PROFILE_SPECIFIER = ""; + SWIFT_VERSION = 5.0; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ +EOF + +# Finish the project.pbxproj file +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << EOF +/* Begin XCConfigurationList section */ + 8888888888888888888888AA /* Build configuration list for PBXProject "${APP_NAME}" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 9999999999999999999999AA /* Debug */, + AAAAAAAAAAAAAAAAAAAAABBB /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 4444444444444444444444AA /* Build configuration list for PBXNativeTarget "${APP_NAME}" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + BBBBBBBBBBBBBBBBBBBBBBCCC /* Debug */, + CCCCCCCCCCCCCCCCCCCCCCDDD /* 
Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + }; + rootObject = 7777777777777777777777AA /* Project object */; +} +EOF + +# 2. Copy XCFramework to test project +echo "Copying XCFramework to test project..." +cp -R "${XCFRAMEWORK_PATH}" "${TEMP_DIR}/${APP_NAME}/" + +# 3. Build and archive the app +echo "Building and archiving test app..." +cd "${TEMP_DIR}/${APP_NAME}" + +# Create a simple xcscheme file to avoid xcodebuild scheme issues +mkdir -p "${APP_NAME}.xcodeproj/xcshareddata/xcschemes" +cat > "${APP_NAME}.xcodeproj/xcshareddata/xcschemes/${APP_NAME}.xcscheme" << EOF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +EOF + +# Now use xcodebuild with an explicitly defined product name for macOS +xcodebuild -project "${APP_NAME}.xcodeproj" -scheme "${APP_NAME}" -sdk macosx -configuration Release archive -archivePath "${ARCHIVE_PATH}" CODE_SIGN_IDENTITY="-" CODE_SIGNING_REQUIRED=NO CODE_SIGNING_ALLOWED=NO PRODUCT_NAME="${APP_NAME}" SWIFT_OPTIMIZATION_LEVEL="-Onone" -quiet + +# 4. Create a package for distribution +echo "Creating distributable package from archive..." +cp -R "${ARCHIVE_PATH}/Products/Applications/${APP_NAME}.app" "${APP_PATH}" + +# Check and log app structure +echo "App structure:" +ls -la "${APP_PATH}" +echo "Frameworks:" +ls -la "${APP_PATH}/Contents/Frameworks/" 2>/dev/null || echo "No Frameworks directory found" + +# Create a zip file for potential distribution +cd "${BUILD_DIR}" +zip -r "${ZIP_PATH}" "${APP_NAME}.app" + +# Check embedded provisioning profile +echo "Checking provisioning profile (if any)..." +PROVISIONING_PROFILE=$(find "${APP_PATH}/Contents" -name "embedded.provisionprofile" 2>/dev/null) +if [ -n "$PROVISIONING_PROFILE" ]; then + echo "Found embedded provisioning profile:" + security cms -D -i "$PROVISIONING_PROFILE" || echo "Unable to decode provisioning profile" +else + echo "No embedded provisioning profile found (expected for ad-hoc builds)" +fi + +# 5. Validate the app +echo "Validating macOS app..." +VALIDATION_OUTPUT="${VALIDATION_DIR}/validation_output.txt" + +# Check if authentication credentials are provided +AUTH_ARGS="" +if [ -n "$APPLE_ID" ] && [ -n "$APPLE_PASSWORD" ]; then + echo "Using Apple ID authentication for validation..." + AUTH_ARGS="--username \"$APPLE_ID\" --password \"$APPLE_PASSWORD\"" +else + echo "No authentication credentials provided. Will perform basic validation." + echo "To use your personal developer account, you can run the script with:" + echo " APPLE_ID='your.email@example.com' APPLE_PASSWORD='your-app-specific-password' ./validate-macos.sh" + echo "Note: You need to create an app-specific password at https://appleid.apple.com/account/manage" +fi + +# For macOS we need to use notarytool or alternative checks because altool doesn't support macOS apps in the same way +echo "Note: For macOS, formal notarization process would require Apple Developer credentials." +echo "Performing alternative validation checks..." 
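+# Unlike the flat iOS bundle layout, a macOS .app nests frameworks under Contents/Frameworks and
+# the framework itself uses the versioned layout (Versions/A/llama), so the checks below look
+# for the binary at that path rather than directly inside llama.framework.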
+ +# Final validation result +FINAL_VALIDATION_RESULT=0 + +# Check if app was created successfully +if [ -d "${APP_PATH}" ] && [ -s "${APP_PATH}/Contents/MacOS/${APP_NAME}" ]; then + echo "✅ App package created successfully" +else + echo "❌ App package not created or binary missing" + FINAL_VALIDATION_RESULT=1 +fi + +# Check if app binary exists and is executable +if [ -f "${APP_PATH}/Contents/MacOS/${APP_NAME}" ] && [ -x "${APP_PATH}/Contents/MacOS/${APP_NAME}" ]; then + echo "✅ App binary exists and is executable" +else + echo "❌ App binary missing or not executable" + FINAL_VALIDATION_RESULT=1 +fi + +# Check if framework was properly embedded +if [ -d "${APP_PATH}/Contents/Frameworks/llama.framework" ]; then + echo "✅ llama.framework properly embedded" +else + echo "❌ llama.framework not properly embedded" + FINAL_VALIDATION_RESULT=1 +fi + +# Check if framework binary exists +if [ -f "${APP_PATH}/Contents/Frameworks/llama.framework/Versions/A/llama" ]; then + echo "✅ Framework binary exists" + + # Further validate framework by checking architecture + ARCHS=$(lipo -info "${APP_PATH}/Contents/Frameworks/llama.framework/Versions/A/llama" 2>/dev/null | grep -o "arm64\\|x86_64" | tr '\n' ' ') + if [ -n "$ARCHS" ]; then + echo "✅ Framework architecture(s): $ARCHS" + else + echo "⚠️ Could not determine framework architecture" + fi +else + echo "❌ Framework binary missing" + FINAL_VALIDATION_RESULT=1 +fi + +# Check code signing +echo "" +echo "==== CODE SIGNING INFO ====" +codesign -vv -d "${APP_PATH}" 2>&1 || echo "Code signing verification not available (expected for ad-hoc builds)" + +if [ $FINAL_VALIDATION_RESULT -eq 0 ]; then + if [ -n "$AUTH_ARGS" ]; then + echo "" + echo "To notarize this app with Apple (requires Apple Developer account):" + echo "xcrun notarytool submit \"${ZIP_PATH}\" --apple-id \"your-apple-id\" --password \"your-app-specific-password\" --team-id \"your-team-id\" --wait" + echo "" + fi + echo "✅ Validation PASSED: macOS app built successfully with embedded framework" +else + echo "❌ Validation FAILED: Issues found with the app or framework" +fi + +# Don't clean up on error to allow inspection +if [ $FINAL_VALIDATION_RESULT -ne 0 ]; then + echo "" + echo "Temporary files kept for inspection at: ${TEMP_DIR}" + echo "===== macOS Validation Process Failed =====" + exit 1 +fi + +# Clean up temporary files but keep build artifacts +if [ $FINAL_VALIDATION_RESULT -eq 0 ]; then + echo "Cleaning up temporary files..." 
+ #rm -rf "${TEMP_DIR}" +fi + +echo "===== macOS Validation Process Completed =====" +echo "App package available at: ${APP_PATH}" +echo "Zipped app available at: ${ZIP_PATH}" +exit $FINAL_VALIDATION_RESULT diff --git a/scripts/apple/validate-tvos.sh b/scripts/apple/validate-tvos.sh new file mode 100755 index 000000000..6120189e8 --- /dev/null +++ b/scripts/apple/validate-tvos.sh @@ -0,0 +1,813 @@ +#!/bin/bash +# validate-tvos.sh - Validate tvOS Application with embedded llama.xcframework using SwiftUI + +# Authentication options (optional) (can be set via environment variables) +# To use: export APPLE_ID=your.email@example.com +# export APPLE_PASSWORD=your-app-specific-password +# ./validate-tvos.sh +APPLE_ID=${APPLE_ID:-""} +APPLE_PASSWORD=${APPLE_PASSWORD:-""} + +# Ensure the script exits on error +set -e + +# Function to print usage instructions +print_usage() { + echo "Usage: ./validate-tvos.sh [OPTIONS]" + echo "" + echo "Options:" + echo " --help Show this help message" + echo " --apple-id EMAIL Apple ID email for validation" + echo " --apple-password PWD App-specific password for Apple ID" + echo "" + echo "Environment variables:" + echo " APPLE_ID Apple ID email for validation" + echo " APPLE_PASSWORD App-specific password for Apple ID" + echo "" + echo "Notes:" + echo " - Command line options take precedence over environment variables" + echo " - Authentication is optional. If not provided, alternative validation will be performed" + echo " - For APPLE_PASSWORD, use an app-specific password generated at https://appleid.apple.com/account/manage" +} + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + --help) + print_usage + exit 0 + ;; + --apple-id) + APPLE_ID="$2" + shift 2 + ;; + --apple-password) + APPLE_PASSWORD="$2" + shift 2 + ;; + *) + echo "Unknown option: $1" + print_usage + exit 1 + ;; + esac +done + +# Function to clean up in case of error +cleanup() { + # Don't clean up temp files on error to help with debugging + echo "===== tvOS Validation Process Failed =====" + exit 1 +} + +# Set up trap to call cleanup function on error +trap cleanup ERR + +set -e # Exit on any error + +ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../.." && pwd )" +BUILD_DIR="${ROOT_DIR}/validation-builds/ios" + +# Configuration +APP_NAME="TVOSLlamaTest" +BUNDLE_ID="org.ggml.TVOSLlamaTest" +XCFRAMEWORK_PATH="${ROOT_DIR}/build-apple/llama.xcframework" +TEMP_DIR="${BUILD_DIR}/temp" +ARCHIVE_PATH="${BUILD_DIR}/${APP_NAME}.xcarchive" +IPA_PATH="${BUILD_DIR}/${APP_NAME}.ipa" +VALIDATION_DIR="${BUILD_DIR}/validation" + +# Create necessary directories +mkdir -p "${BUILD_DIR}" +mkdir -p "${TEMP_DIR}" +mkdir -p "${VALIDATION_DIR}" + +echo "===== tvOS Validation Process Started =====" + +# 1. Create a simple test app project +echo "Creating test tvOS app project..." 
+mkdir -p "${TEMP_DIR}/${APP_NAME}/${APP_NAME}" +cat > "${TEMP_DIR}/${APP_NAME}/${APP_NAME}/Info.plist" << EOF + + + + + CFBundleDevelopmentRegion + en + CFBundleExecutable + ${APP_NAME} + CFBundleIdentifier + ${BUNDLE_ID} + CFBundleInfoDictionaryVersion + 6.0 + CFBundleName + ${APP_NAME} + CFBundlePackageType + APPL + CFBundleShortVersionString + 1.0 + CFBundleVersion + 1 + UIRequiredDeviceCapabilities + + arm64 + + + +EOF + +# Create SwiftUI app files +mkdir -p "${TEMP_DIR}/${APP_NAME}/${APP_NAME}/Sources" + +# Create App.swift +cat > "${TEMP_DIR}/${APP_NAME}/${APP_NAME}/Sources/App.swift" << EOF +import SwiftUI +import llama + +@main +struct LlamaTestApp: App { + var body: some Scene { + WindowGroup { + ContentView() + } + } +} +EOF + +# Create ContentView.swift with tvOS specific elements +cat > "${TEMP_DIR}/${APP_NAME}/${APP_NAME}/Sources/ContentView.swift" << EOF +import SwiftUI +import llama + +struct ContentView: View { + // Test that we can initialize a llama context params struct + let params = llama_context_default_params() + + var body: some View { + VStack(spacing: 40) { + Text("Llama Framework Test on tvOS") + .font(.largeTitle) + .padding() + + Text("llama_context_default_params() created successfully") + .font(.headline) + .multilineTextAlignment(.center) + .padding() + + // Display some param values to confirm the framework is working + Text("n_ctx: \(params.n_ctx)") + .font(.title2) + + Text("n_batch: \(params.n_batch)") + .font(.title2) + + Spacer() + } + .padding(50) + // Larger size suitable for TV display + } +} + +struct ContentView_Previews: PreviewProvider { + static var previews: some View { + ContentView() + } +} +EOF + +# Create project.pbxproj, fixing the framework search paths issues +mkdir -p "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj" +cat > "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << 'EOF' +// !$*UTF8*$! 
+{ + archiveVersion = 1; + classes = { + }; + objectVersion = 54; + objects = { + +/* Begin PBXBuildFile section */ + 11111111111111111111111 /* App.swift in Sources */ = {isa = PBXBuildFile; fileRef = 22222222222222222222222; }; + 33333333333333333333333 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 44444444444444444444444; }; + 55555555555555555555555 /* llama.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 66666666666666666666666; }; + 77777777777777777777777 /* llama.xcframework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = 66666666666666666666666; }; +/* End PBXBuildFile section */ + +/* Begin PBXCopyFilesBuildPhase section */ + 88888888888888888888888 /* Embed Frameworks */ = { + isa = PBXCopyFilesBuildPhase; + buildActionMask = 2147483647; + dstPath = ""; + dstSubfolderSpec = 10; + files = ( + 77777777777777777777777 /* llama.xcframework in Embed Frameworks */, + ); + name = "Embed Frameworks"; + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXCopyFilesBuildPhase section */ + +/* Begin PBXFileReference section */ +EOF + +# Continue with the project.pbxproj file, using the APP_NAME variable appropriately +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << EOF + 99999999999999999999999 /* ${APP_NAME}.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = "${APP_NAME}.app"; sourceTree = BUILT_PRODUCTS_DIR; }; + 22222222222222222222222 /* App.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = App.swift; sourceTree = ""; }; + 44444444444444444444444 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; + AAAAAAAAAAAAAAAAAAAAAAA /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; + 66666666666666666666666 /* llama.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; path = llama.xcframework; sourceTree = ""; }; +/* End PBXFileReference section */ +EOF + +# Add the rest of the project file with fixed framework search paths +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << 'EOF' +/* Begin PBXFrameworksBuildPhase section */ + BBBBBBBBBBBBBBBBBBBBBBBB /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + 55555555555555555555555 /* llama.xcframework in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ +EOF + +# Continue with the project.pbxproj file, using the APP_NAME variable appropriately +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << EOF + CCCCCCCCCCCCCCCCCCCCCCCC /* Products */ = { + isa = PBXGroup; + children = ( + 99999999999999999999999 /* ${APP_NAME}.app */, + ); + name = Products; + sourceTree = ""; + }; +EOF + +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << 'EOF' + DDDDDDDDDDDDDDDDDDDDDDDD /* Frameworks */ = { + isa = PBXGroup; + children = ( + 66666666666666666666666 /* llama.xcframework */, + ); + name = Frameworks; + sourceTree = ""; + }; + EEEEEEEEEEEEEEEEEEEEEEEE = { + isa = PBXGroup; + children = ( + FFFFFFFFFFFFFFFFFFFFFFFF /* TVOSLlamaTest */, + CCCCCCCCCCCCCCCCCCCCCCCC /* Products */, + DDDDDDDDDDDDDDDDDDDDDDDD /* Frameworks */, + ); + sourceTree = ""; + }; + FFFFFFFFFFFFFFFFFFFFFFFF /* TVOSLlamaTest */ = { + isa = PBXGroup; + children 
= ( + 1111111111111111111111AA /* Sources */, + AAAAAAAAAAAAAAAAAAAAAAA /* Info.plist */, + ); + path = "TVOSLlamaTest"; + sourceTree = ""; + }; + 1111111111111111111111AA /* Sources */ = { + isa = PBXGroup; + children = ( + 22222222222222222222222 /* App.swift */, + 44444444444444444444444 /* ContentView.swift */, + ); + path = Sources; + sourceTree = ""; + }; +/* End PBXGroup section */ +EOF + +# Continue with the project.pbxproj file, using the APP_NAME variable appropriately +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << EOF +/* Begin PBXNativeTarget section */ + 3333333333333333333333AA /* ${APP_NAME} */ = { + isa = PBXNativeTarget; + buildConfigurationList = 4444444444444444444444AA /* Build configuration list for PBXNativeTarget "${APP_NAME}" */; + buildPhases = ( + 5555555555555555555555AA /* Sources */, + BBBBBBBBBBBBBBBBBBBBBBBB /* Frameworks */, + 6666666666666666666666AA /* Resources */, + 88888888888888888888888 /* Embed Frameworks */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = "${APP_NAME}"; + productName = "${APP_NAME}"; + productReference = 99999999999999999999999 /* ${APP_NAME}.app */; + productType = "com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + 7777777777777777777777AA /* Project object */ = { + isa = PBXProject; + attributes = { + LastSwiftUpdateCheck = 1240; + LastUpgradeCheck = 1240; + TargetAttributes = { + 3333333333333333333333AA = { + CreatedOnToolsVersion = 12.4; + }; + }; + }; + buildConfigurationList = 8888888888888888888888AA /* Build configuration list for PBXProject "${APP_NAME}" */; + compatibilityVersion = "Xcode 12.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = EEEEEEEEEEEEEEEEEEEEEEEE; + productRefGroup = CCCCCCCCCCCCCCCCCCCCCCCC /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + 3333333333333333333333AA /* ${APP_NAME} */, + ); + }; +/* End PBXProject section */ +EOF + +# Add the rest of the file with correct FRAMEWORK_SEARCH_PATHS and tvOS settings +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << 'EOF' +/* Begin PBXResourcesBuildPhase section */ + 6666666666666666666666AA /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + 5555555555555555555555AA /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 33333333333333333333333 /* ContentView.swift in Sources */, + 11111111111111111111111 /* App.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + 9999999999999999999999AA /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LIBRARY = "libc++"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = 
YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + TVOS_DEPLOYMENT_TARGET = 15.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = appletvos; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + }; + name = Debug; + }; + AAAAAAAAAAAAAAAAAAAAABBB /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LIBRARY = "libc++"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + TVOS_DEPLOYMENT_TARGET = 15.0; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SDKROOT = appletvos; + SWIFT_COMPILATION_MODE = wholemodule; + SWIFT_OPTIMIZATION_LEVEL = "-O"; + VALIDATE_PRODUCT = YES; + }; + name = Release; + }; + BBBBBBBBBBBBBBBBBBBBBBCCC /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + 
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Manual; + DEVELOPMENT_TEAM = ""; + ENABLE_PREVIEWS = YES; + FRAMEWORK_SEARCH_PATHS = "$(PROJECT_DIR)"; + INFOPLIST_FILE = "TVOSLlamaTest/Info.plist"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + PRODUCT_BUNDLE_IDENTIFIER = "org.ggml.TVOSLlamaTest"; + PRODUCT_NAME = "$(TARGET_NAME)"; + PROVISIONING_PROFILE_SPECIFIER = ""; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = 3; + }; + name = Debug; + }; + CCCCCCCCCCCCCCCCCCCCCCDDD /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Manual; + DEVELOPMENT_TEAM = ""; + ENABLE_PREVIEWS = YES; + FRAMEWORK_SEARCH_PATHS = ( + "$(inherited)", + "$(PROJECT_DIR)", + ); + INFOPLIST_FILE = "TVOSLlamaTest/Info.plist"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + PRODUCT_BUNDLE_IDENTIFIER = "org.ggml.TVOSLlamaTest"; + PRODUCT_NAME = "$(TARGET_NAME)"; + PROVISIONING_PROFILE_SPECIFIER = ""; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = 3; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ +EOF + +# Finish the project.pbxproj file +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << EOF +/* Begin XCConfigurationList section */ + 8888888888888888888888AA /* Build configuration list for PBXProject "${APP_NAME}" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 9999999999999999999999AA /* Debug */, + AAAAAAAAAAAAAAAAAAAAABBB /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 4444444444444444444444AA /* Build configuration list for PBXNativeTarget "${APP_NAME}" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + BBBBBBBBBBBBBBBBBBBBBBCCC /* Debug */, + CCCCCCCCCCCCCCCCCCCCCCDDD /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + }; + rootObject = 7777777777777777777777AA /* Project object */; +} +EOF + +# 2. Copy XCFramework to test project +echo "Copying XCFramework to test project..." +cp -R "${XCFRAMEWORK_PATH}" "${TEMP_DIR}/${APP_NAME}/" + +# 3. Build and archive the app +echo "Building and archiving test app..." +cd "${TEMP_DIR}/${APP_NAME}" + +# Create a simple xcscheme file to avoid xcodebuild scheme issues +mkdir -p "${APP_NAME}.xcodeproj/xcshareddata/xcschemes" +cat > "${APP_NAME}.xcodeproj/xcshareddata/xcschemes/${APP_NAME}.xcscheme" << EOF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +EOF + +# Now use xcodebuild with an explicitly defined product name for tvOS +xcodebuild -project "${APP_NAME}.xcodeproj" -scheme "${APP_NAME}" -sdk appletvos -configuration Release archive -archivePath "${ARCHIVE_PATH}" CODE_SIGN_IDENTITY="-" CODE_SIGNING_REQUIRED=NO CODE_SIGNING_ALLOWED=NO PRODUCT_NAME="${APP_NAME}" SWIFT_OPTIMIZATION_LEVEL="-Onone" -quiet + +# 4. Create IPA from archive +echo "Creating IPA from archive..." 
+mkdir -p "${TEMP_DIR}/Payload" +cp -R "${ARCHIVE_PATH}/Products/Applications/${APP_NAME}.app" "${TEMP_DIR}/Payload/" + +# Check and log app structure before zipping +echo "App structure:" +ls -la "${TEMP_DIR}/Payload/${APP_NAME}.app/" +echo "Frameworks:" +ls -la "${TEMP_DIR}/Payload/${APP_NAME}.app/Frameworks/" 2>/dev/null || echo "No Frameworks directory found" + +cd "${TEMP_DIR}" +zip -r "${IPA_PATH}" Payload + +# Check embedded provisioning profile +echo "Checking provisioning profile (if any)..." +PROVISIONING_PROFILE=$(find "${ARCHIVE_PATH}/Products/Applications/${APP_NAME}.app" -name "embedded.mobileprovision" 2>/dev/null) +if [ -n "$PROVISIONING_PROFILE" ]; then + echo "Found embedded provisioning profile:" + security cms -D -i "$PROVISIONING_PROFILE" || echo "Unable to decode provisioning profile" +else + echo "No embedded provisioning profile found (expected for ad-hoc builds)" +fi + +# 5. Validate the IPA +echo "Validating IPA..." +VALIDATION_OUTPUT="${VALIDATION_DIR}/validation_output.txt" + +# Check if authentication credentials are provided +AUTH_ARGS="" +if [ -n "$APPLE_ID" ] && [ -n "$APPLE_PASSWORD" ]; then + echo "Using Apple ID authentication for validation..." + AUTH_ARGS="--username \"$APPLE_ID\" --password \"$APPLE_PASSWORD\"" +else + echo "No authentication credentials provided. Will perform basic validation." + echo "To use your personal developer account, you can run the script with:" + echo " APPLE_ID='your.email@example.com' APPLE_PASSWORD='your-app-specific-password' ./validate-tvos.sh" + echo "Note: You need to create an app-specific password at https://appleid.apple.com/account/manage" +fi + +# Run validation with detailed output +echo "Running validation with altool..." +if [ -n "$AUTH_ARGS" ]; then + # Use eval to properly handle the quoted arguments + eval "xcrun altool --validate-app -f \"${IPA_PATH}\" --type tvos --output-format xml $AUTH_ARGS" 2>&1 | tee "${VALIDATION_OUTPUT}" +else + xcrun altool --validate-app -f "${IPA_PATH}" --type tvos --output-format xml 2>&1 | tee "${VALIDATION_OUTPUT}" +fi +VALIDATION_RESULT=$? + +# Final validation result +FINAL_VALIDATION_RESULT=0 + +# Check if validation failed because the app isn't in App Store Connect +if grep -q "No suitable application records were found" "${VALIDATION_OUTPUT}"; then + echo "⚠️ App Store Connect Warning: The app bundle identifier is not found in App Store Connect" + echo "This is expected for apps that haven't been registered in App Store Connect yet." + echo "This doesn't indicate a problem with the build or framework." + + # Perform alternative validation + echo "Performing alternative validation checks..." 
+ + # Check if IPA was created successfully + if [ -f "${IPA_PATH}" ] && [ -s "${IPA_PATH}" ]; then + echo "✅ IPA file created successfully" + else + echo "❌ IPA file not created or empty" + FINAL_VALIDATION_RESULT=1 + fi + + # Check if app binary exists and is executable + if [ -f "${TEMP_DIR}/Payload/${APP_NAME}.app/${APP_NAME}" ] && [ -x "${TEMP_DIR}/Payload/${APP_NAME}.app/${APP_NAME}" ]; then + echo "✅ App binary exists and is executable" + else + echo "❌ App binary missing or not executable" + FINAL_VALIDATION_RESULT=1 + fi + + # Check if framework was properly embedded + if [ -d "${TEMP_DIR}/Payload/${APP_NAME}.app/Frameworks/llama.framework" ]; then + echo "✅ llama.framework properly embedded" + else + echo "❌ llama.framework not properly embedded" + FINAL_VALIDATION_RESULT=1 + fi + + # Check if framework binary exists + if [ -f "${TEMP_DIR}/Payload/${APP_NAME}.app/Frameworks/llama.framework/llama" ]; then + echo "✅ Framework binary exists" + + # Further validate framework by checking architecture + ARCHS=$(lipo -info "${TEMP_DIR}/Payload/${APP_NAME}.app/Frameworks/llama.framework/llama" 2>/dev/null | grep -o "arm64\\|x86_64" | tr '\n' ' ') + if [ -n "$ARCHS" ]; then + echo "✅ Framework architecture(s): $ARCHS" + else + echo "⚠️ Could not determine framework architecture" + fi + else + echo "❌ Framework binary missing" + FINAL_VALIDATION_RESULT=1 + fi + + if [ $FINAL_VALIDATION_RESULT -eq 0 ]; then + echo "✅ Alternative validation PASSED: App built successfully with embedded framework" + else + echo "❌ Alternative validation FAILED: Issues found with the app or framework" + fi +elif grep -q "You must specify authentication credentials" "${VALIDATION_OUTPUT}" && [ -z "$AUTH_ARGS" ]; then + echo "✅ tvOS Validation PASSED: IPA successfully validated" + echo "Results saved to ${VALIDATION_OUTPUT}" +else + echo "❌ tvOS Validation FAILED: IPA validation found issues" + echo "See validation output at ${VALIDATION_OUTPUT}" + echo "" + echo "==== VALIDATION ERRORS ====" + + # Try to extract specific errors from the output + if grep -q "Error" "${VALIDATION_OUTPUT}"; then + grep -A 5 "Error" "${VALIDATION_OUTPUT}" + else + # If no specific error found, show the whole log + cat "${VALIDATION_OUTPUT}" + fi + + # Additional debugging: check IPA contents + echo "" + echo "==== IPA CONTENTS ====" + mkdir -p "${TEMP_DIR}/ipa_contents" + unzip -q "${IPA_PATH}" -d "${TEMP_DIR}/ipa_contents" + ls -la "${TEMP_DIR}/ipa_contents/Payload/${APP_NAME}.app/" + + # Check for code signing issues + echo "" + echo "==== CODE SIGNING INFO ====" + codesign -vv -d "${TEMP_DIR}/ipa_contents/Payload/${APP_NAME}.app" 2>&1 || echo "Code signing verification failed" + + # Check embedded frameworks + echo "" + echo "==== FRAMEWORK INFO ====" + ls -la "${TEMP_DIR}/ipa_contents/Payload/${APP_NAME}.app/Frameworks/" 2>/dev/null || echo "No Frameworks directory found" +fi + +# Don't clean up on error to allow inspection +if [ $FINAL_VALIDATION_RESULT -ne 0 ]; then + echo "" + echo "Temporary files kept for inspection at: ${TEMP_DIR}" + echo "===== tvOS Validation Process Failed =====" + exit 1 +fi + +# Clean up temporary files but keep build artifacts +if [ $FINAL_VALIDATION_RESULT -eq 0 ]; then + echo "Cleaning up temporary files..." 
+ #rm -rf "${TEMP_DIR}" +fi + +echo "===== tvOS Validation Process Completed =====" +exit $FINAL_VALIDATION_RESULT diff --git a/scripts/apple/validate-visionos.sh b/scripts/apple/validate-visionos.sh new file mode 100755 index 000000000..a18ddcce4 --- /dev/null +++ b/scripts/apple/validate-visionos.sh @@ -0,0 +1,811 @@ +#!/bin/bash +# validate-visionos.sh - Validate visionOS Application with embedded llama.xcframework using SwiftUI + +# Authentication options (optional) (can be set via environment variables) +# To use: export APPLE_ID=your.email@example.com +# export APPLE_PASSWORD=your-app-specific-password +# ./validate-visionos.sh +APPLE_ID=${APPLE_ID:-""} +APPLE_PASSWORD=${APPLE_PASSWORD:-""} + +# Ensure the script exits on error +set -e + +# Function to print usage instructions +print_usage() { + echo "Usage: ./validate-visionos.sh [OPTIONS]" + echo "" + echo "Options:" + echo " --help Show this help message" + echo " --apple-id EMAIL Apple ID email for validation" + echo " --apple-password PWD App-specific password for Apple ID" + echo "" + echo "Environment variables:" + echo " APPLE_ID Apple ID email for validation" + echo " APPLE_PASSWORD App-specific password for Apple ID" + echo "" + echo "Notes:" + echo " - Command line options take precedence over environment variables" + echo " - Authentication is optional. If not provided, alternative validation will be performed" + echo " - For APPLE_PASSWORD, use an app-specific password generated at https://appleid.apple.com/account/manage" +} + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + --help) + print_usage + exit 0 + ;; + --apple-id) + APPLE_ID="$2" + shift 2 + ;; + --apple-password) + APPLE_PASSWORD="$2" + shift 2 + ;; + *) + echo "Unknown option: $1" + print_usage + exit 1 + ;; + esac +done + +# Function to clean up in case of error +cleanup() { + # Don't clean up temp files on error to help with debugging + echo "===== visionOS Validation Process Failed =====" + exit 1 +} + +# Set up trap to call cleanup function on error +trap cleanup ERR + +set -e # Exit on any error + +ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../.." && pwd )" +BUILD_DIR="${ROOT_DIR}/validation-builds/visionos" + +# Configuration +APP_NAME="VisionOSLlamaTest" +BUNDLE_ID="org.ggml.VisionOSLlamaTest" +XCFRAMEWORK_PATH="${ROOT_DIR}/build-apple/llama.xcframework" +TEMP_DIR="${BUILD_DIR}/temp" +ARCHIVE_PATH="${BUILD_DIR}/${APP_NAME}.xcarchive" +IPA_PATH="${BUILD_DIR}/${APP_NAME}.ipa" +VALIDATION_DIR="${BUILD_DIR}/validation" + +# Create necessary directories +mkdir -p "${BUILD_DIR}" +mkdir -p "${TEMP_DIR}" +mkdir -p "${VALIDATION_DIR}" + +echo "===== visionOS Validation Process Started =====" + +# 1. Create a simple test app project +echo "Creating test visionOS app project..." 
+mkdir -p "${TEMP_DIR}/${APP_NAME}/${APP_NAME}"
+cat > "${TEMP_DIR}/${APP_NAME}/${APP_NAME}/Info.plist" << EOF
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <key>CFBundleDevelopmentRegion</key>
+    <string>en</string>
+    <key>CFBundleExecutable</key>
+    <string>${APP_NAME}</string>
+    <key>CFBundleIdentifier</key>
+    <string>${BUNDLE_ID}</string>
+    <key>CFBundleInfoDictionaryVersion</key>
+    <string>6.0</string>
+    <key>CFBundleName</key>
+    <string>${APP_NAME}</string>
+    <key>CFBundlePackageType</key>
+    <string>APPL</string>
+    <key>CFBundleShortVersionString</key>
+    <string>1.0</string>
+    <key>CFBundleVersion</key>
+    <string>1</string>
+</dict>
+</plist>
+EOF
+
+# Create SwiftUI app files
+mkdir -p "${TEMP_DIR}/${APP_NAME}/${APP_NAME}/Sources"
+
+# Create App.swift
+cat > "${TEMP_DIR}/${APP_NAME}/${APP_NAME}/Sources/App.swift" << EOF
+import SwiftUI
+import llama
+
+@main
+struct LlamaTestApp: App {
+    var body: some Scene {
+        WindowGroup {
+            ContentView()
+        }
+    }
+}
+EOF
+
+# Create ContentView.swift with visionOS specific elements
+cat > "${TEMP_DIR}/${APP_NAME}/${APP_NAME}/Sources/ContentView.swift" << EOF
+import SwiftUI
+import llama
+
+struct ContentView: View {
+    // Test that we can initialize a llama context params struct
+    let params = llama_context_default_params()
+
+    var body: some View {
+        VStack(spacing: 20) {
+            Text("Llama Framework Test on visionOS")
+                .font(.largeTitle)
+                .padding()
+
+            Text("llama_context_default_params() created successfully")
+                .font(.headline)
+                .multilineTextAlignment(.center)
+                .padding()
+
+            // Display some param values to confirm the framework is working
+            Text("n_ctx: \(params.n_ctx)")
+                .font(.body)
+
+            Text("n_batch: \(params.n_batch)")
+                .font(.body)
+
+            Spacer()
+        }
+        .padding()
+        .frame(width: 500, height: 400)
+    }
+}
+
+struct ContentView_Previews: PreviewProvider {
+    static var previews: some View {
+        ContentView()
+    }
+}
+EOF
+
+# Create project.pbxproj, fixing the framework search paths issues
+mkdir -p "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj"
+cat > "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << 'EOF'
+// !$*UTF8*$!
+{ + archiveVersion = 1; + classes = { + }; + objectVersion = 54; + objects = { + +/* Begin PBXBuildFile section */ + 11111111111111111111111 /* App.swift in Sources */ = {isa = PBXBuildFile; fileRef = 22222222222222222222222; }; + 33333333333333333333333 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 44444444444444444444444; }; + 55555555555555555555555 /* llama.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = 66666666666666666666666; }; + 77777777777777777777777 /* llama.xcframework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = 66666666666666666666666; }; +/* End PBXBuildFile section */ + +/* Begin PBXCopyFilesBuildPhase section */ + 88888888888888888888888 /* Embed Frameworks */ = { + isa = PBXCopyFilesBuildPhase; + buildActionMask = 2147483647; + dstPath = ""; + dstSubfolderSpec = 10; + files = ( + 77777777777777777777777 /* llama.xcframework in Embed Frameworks */, + ); + name = "Embed Frameworks"; + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXCopyFilesBuildPhase section */ + +/* Begin PBXFileReference section */ +EOF + +# Continue with the project.pbxproj file, using the APP_NAME variable appropriately +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << EOF + 99999999999999999999999 /* ${APP_NAME}.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = "${APP_NAME}.app"; sourceTree = BUILT_PRODUCTS_DIR; }; + 22222222222222222222222 /* App.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = App.swift; sourceTree = ""; }; + 44444444444444444444444 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = ""; }; + AAAAAAAAAAAAAAAAAAAAAAA /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; + 66666666666666666666666 /* llama.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; path = llama.xcframework; sourceTree = ""; }; +/* End PBXFileReference section */ +EOF + +# Add the rest of the project file with fixed framework search paths +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << 'EOF' +/* Begin PBXFrameworksBuildPhase section */ + BBBBBBBBBBBBBBBBBBBBBBBB /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + 55555555555555555555555 /* llama.xcframework in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ +EOF + +# Continue with the project.pbxproj file, using the APP_NAME variable appropriately +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << EOF + CCCCCCCCCCCCCCCCCCCCCCCC /* Products */ = { + isa = PBXGroup; + children = ( + 99999999999999999999999 /* ${APP_NAME}.app */, + ); + name = Products; + sourceTree = ""; + }; +EOF + +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << 'EOF' + DDDDDDDDDDDDDDDDDDDDDDDD /* Frameworks */ = { + isa = PBXGroup; + children = ( + 66666666666666666666666 /* llama.xcframework */, + ); + name = Frameworks; + sourceTree = ""; + }; + EEEEEEEEEEEEEEEEEEEEEEEE = { + isa = PBXGroup; + children = ( + FFFFFFFFFFFFFFFFFFFFFFFF /* VisionOSLlamaTest */, + CCCCCCCCCCCCCCCCCCCCCCCC /* Products */, + DDDDDDDDDDDDDDDDDDDDDDDD /* Frameworks */, + ); + sourceTree = ""; + }; + FFFFFFFFFFFFFFFFFFFFFFFF /* VisionOSLlamaTest */ = { + isa = PBXGroup; + 
children = ( + 1111111111111111111111AA /* Sources */, + AAAAAAAAAAAAAAAAAAAAAAA /* Info.plist */, + ); + path = "VisionOSLlamaTest"; + sourceTree = ""; + }; + 1111111111111111111111AA /* Sources */ = { + isa = PBXGroup; + children = ( + 22222222222222222222222 /* App.swift */, + 44444444444444444444444 /* ContentView.swift */, + ); + path = Sources; + sourceTree = ""; + }; +/* End PBXGroup section */ +EOF + +# Continue with the project.pbxproj file, using the APP_NAME variable appropriately +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << EOF +/* Begin PBXNativeTarget section */ + 3333333333333333333333AA /* ${APP_NAME} */ = { + isa = PBXNativeTarget; + buildConfigurationList = 4444444444444444444444AA /* Build configuration list for PBXNativeTarget "${APP_NAME}" */; + buildPhases = ( + 5555555555555555555555AA /* Sources */, + BBBBBBBBBBBBBBBBBBBBBBBB /* Frameworks */, + 6666666666666666666666AA /* Resources */, + 88888888888888888888888 /* Embed Frameworks */, + ); + buildRules = ( + ); + dependencies = ( + ); + name = "${APP_NAME}"; + productName = "${APP_NAME}"; + productReference = 99999999999999999999999 /* ${APP_NAME}.app */; + productType = "com.apple.product-type.application"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + 7777777777777777777777AA /* Project object */ = { + isa = PBXProject; + attributes = { + LastSwiftUpdateCheck = 1510; + LastUpgradeCheck = 1510; + TargetAttributes = { + 3333333333333333333333AA = { + CreatedOnToolsVersion = 15.1; + }; + }; + }; + buildConfigurationList = 8888888888888888888888AA /* Build configuration list for PBXProject "${APP_NAME}" */; + compatibilityVersion = "Xcode 15.0"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + Base, + ); + mainGroup = EEEEEEEEEEEEEEEEEEEEEEEE; + productRefGroup = CCCCCCCCCCCCCCCCCCCCCCCC /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + 3333333333333333333333AA /* ${APP_NAME} */, + ); + }; +/* End PBXProject section */ +EOF + +# Add the rest of the file with correct FRAMEWORK_SEARCH_PATHS +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << 'EOF' +/* Begin PBXResourcesBuildPhase section */ + 6666666666666666666666AA /* Resources */ = { + isa = PBXResourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXResourcesBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + 5555555555555555555555AA /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + 33333333333333333333333 /* ContentView.swift in Sources */, + 11111111111111111111111 /* App.swift in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + 9999999999999999999999AA /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LIBRARY = "libc++"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + 
CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = dwarf; + ENABLE_STRICT_OBJC_MSGSEND = YES; + ENABLE_TESTABILITY = YES; + GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_DYNAMIC_NO_PIC = NO; + GCC_NO_COMMON_BLOCKS = YES; + GCC_OPTIMIZATION_LEVEL = 0; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DEBUG=1", + "$(inherited)", + ); + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + ONLY_ACTIVE_ARCH = YES; + SDKROOT = xros; + SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; + SWIFT_OPTIMIZATION_LEVEL = "-Onone"; + XROS_DEPLOYMENT_TARGET = 1.0; + }; + name = Debug; + }; + AAAAAAAAAAAAAAAAAAAAABBB /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + CLANG_ANALYZER_NONNULL = YES; + CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; + CLANG_CXX_LANGUAGE_STANDARD = "gnu++14"; + CLANG_CXX_LIBRARY = "libc++"; + CLANG_ENABLE_MODULES = YES; + CLANG_ENABLE_OBJC_ARC = YES; + CLANG_ENABLE_OBJC_WEAK = YES; + CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; + CLANG_WARN_BOOL_CONVERSION = YES; + CLANG_WARN_COMMA = YES; + CLANG_WARN_CONSTANT_CONVERSION = YES; + CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; + CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; + CLANG_WARN_DOCUMENTATION_COMMENTS = YES; + CLANG_WARN_EMPTY_BODY = YES; + CLANG_WARN_ENUM_CONVERSION = YES; + CLANG_WARN_INFINITE_RECURSION = YES; + CLANG_WARN_INT_CONVERSION = YES; + CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; + CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; + CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; + CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; + CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; + CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; + CLANG_WARN_STRICT_PROTOTYPES = YES; + CLANG_WARN_SUSPICIOUS_MOVE = YES; + CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; + CLANG_WARN_UNREACHABLE_CODE = YES; + CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; + COPY_PHASE_STRIP = NO; + DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; + ENABLE_NS_ASSERTIONS = NO; + ENABLE_STRICT_OBJC_MSGSEND = YES; + GCC_C_LANGUAGE_STANDARD = gnu11; + GCC_NO_COMMON_BLOCKS = YES; + GCC_WARN_64_TO_32_BIT_CONVERSION = YES; + GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; + GCC_WARN_UNDECLARED_SELECTOR = YES; + GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; + GCC_WARN_UNUSED_FUNCTION = YES; + GCC_WARN_UNUSED_VARIABLE = YES; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + SDKROOT = xros; + SWIFT_COMPILATION_MODE = wholemodule; + SWIFT_OPTIMIZATION_LEVEL = "-O"; + VALIDATE_PRODUCT = YES; + XROS_DEPLOYMENT_TARGET = 1.0; + }; + name = Release; + }; + BBBBBBBBBBBBBBBBBBBBBBCCC /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; 
+ ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Manual; + DEVELOPMENT_TEAM = ""; + ENABLE_PREVIEWS = YES; + FRAMEWORK_SEARCH_PATHS = "$(PROJECT_DIR)"; + INFOPLIST_FILE = "VisionOSLlamaTest/Info.plist"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + PRODUCT_BUNDLE_IDENTIFIER = "org.ggml.VisionOSLlamaTest"; + PRODUCT_NAME = "$(TARGET_NAME)"; + PROVISIONING_PROFILE_SPECIFIER = ""; + SUPPORTED_PLATFORMS = "xros xrsimulator"; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2,7"; + }; + name = Debug; + }; + CCCCCCCCCCCCCCCCCCCCCCDDD /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; + ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; + CODE_SIGN_STYLE = Manual; + DEVELOPMENT_TEAM = ""; + ENABLE_PREVIEWS = YES; + FRAMEWORK_SEARCH_PATHS = ( + "$(inherited)", + "$(PROJECT_DIR)", + ); + INFOPLIST_FILE = "VisionOSLlamaTest/Info.plist"; + LD_RUNPATH_SEARCH_PATHS = ( + "$(inherited)", + "@executable_path/Frameworks", + ); + PRODUCT_BUNDLE_IDENTIFIER = "org.ggml.VisionOSLlamaTest"; + PRODUCT_NAME = "$(TARGET_NAME)"; + PROVISIONING_PROFILE_SPECIFIER = ""; + SUPPORTED_PLATFORMS = "xros xrsimulator"; + SWIFT_VERSION = 5.0; + TARGETED_DEVICE_FAMILY = "1,2,7"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ +EOF + +# Finish the project.pbxproj file +cat >> "${TEMP_DIR}/${APP_NAME}/${APP_NAME}.xcodeproj/project.pbxproj" << EOF +/* Begin XCConfigurationList section */ + 8888888888888888888888AA /* Build configuration list for PBXProject "${APP_NAME}" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + 9999999999999999999999AA /* Debug */, + AAAAAAAAAAAAAAAAAAAAABBB /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + 4444444444444444444444AA /* Build configuration list for PBXNativeTarget "${APP_NAME}" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + BBBBBBBBBBBBBBBBBBBBBBCCC /* Debug */, + CCCCCCCCCCCCCCCCCCCCCCDDD /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + }; + rootObject = 7777777777777777777777AA /* Project object */; +} +EOF + +# 2. Copy XCFramework to test project +echo "Copying XCFramework to test project..." +cp -R "${XCFRAMEWORK_PATH}" "${TEMP_DIR}/${APP_NAME}/" + +# 3. Build and archive the app +echo "Building and archiving test app..." +cd "${TEMP_DIR}/${APP_NAME}" + +# Create a simple xcscheme file to avoid xcodebuild scheme issues +mkdir -p "${APP_NAME}.xcodeproj/xcshareddata/xcschemes" +cat > "${APP_NAME}.xcodeproj/xcshareddata/xcschemes/${APP_NAME}.xcscheme" << EOF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +EOF + +# Now use xcodebuild with an explicitly defined product name for visionOS +xcodebuild -project "${APP_NAME}.xcodeproj" -scheme "${APP_NAME}" -sdk xros -configuration Release archive -archivePath "${ARCHIVE_PATH}" CODE_SIGN_IDENTITY="-" CODE_SIGNING_REQUIRED=NO CODE_SIGNING_ALLOWED=NO PRODUCT_NAME="${APP_NAME}" SWIFT_OPTIMIZATION_LEVEL="-Onone" -quiet + +# 4. Create IPA from archive +echo "Creating IPA from archive..." 
+mkdir -p "${TEMP_DIR}/Payload" +cp -R "${ARCHIVE_PATH}/Products/Applications/${APP_NAME}.app" "${TEMP_DIR}/Payload/" + +# Check and log app structure before zipping +echo "App structure:" +ls -la "${TEMP_DIR}/Payload/${APP_NAME}.app/" +echo "Frameworks:" +ls -la "${TEMP_DIR}/Payload/${APP_NAME}.app/Frameworks/" 2>/dev/null || echo "No Frameworks directory found" + +cd "${TEMP_DIR}" +zip -r "${IPA_PATH}" Payload + +# Check embedded provisioning profile +echo "Checking provisioning profile (if any)..." +PROVISIONING_PROFILE=$(find "${ARCHIVE_PATH}/Products/Applications/${APP_NAME}.app" -name "embedded.mobileprovision" 2>/dev/null) +if [ -n "$PROVISIONING_PROFILE" ]; then + echo "Found embedded provisioning profile:" + security cms -D -i "$PROVISIONING_PROFILE" || echo "Unable to decode provisioning profile" +else + echo "No embedded provisioning profile found (expected for ad-hoc builds)" +fi + +# 5. Validate the IPA +echo "Validating IPA..." +VALIDATION_OUTPUT="${VALIDATION_DIR}/validation_output.txt" + +# Check if authentication credentials are provided +AUTH_ARGS="" +if [ -n "$APPLE_ID" ] && [ -n "$APPLE_PASSWORD" ]; then + echo "Using Apple ID authentication for validation..." + AUTH_ARGS="--username \"$APPLE_ID\" --password \"$APPLE_PASSWORD\"" +else + echo "No authentication credentials provided. Will perform basic validation." + echo "To use your personal developer account, you can run the script with:" + echo " APPLE_ID='your.email@example.com' APPLE_PASSWORD='your-app-specific-password' ./validate-visionos.sh" + echo "Note: You need to create an app-specific password at https://appleid.apple.com/account/manage" +fi + +# Run validation with detailed output +echo "Running validation with altool..." +if [ -n "$AUTH_ARGS" ]; then + # Use eval to properly handle the quoted arguments + eval "xcrun altool --validate-app -f \"${IPA_PATH}\" --type visionos --output-format xml $AUTH_ARGS" 2>&1 | tee "${VALIDATION_OUTPUT}" +else + xcrun altool --validate-app -f "${IPA_PATH}" --type visionos --output-format xml 2>&1 | tee "${VALIDATION_OUTPUT}" +fi +VALIDATION_RESULT=$? + +# Final validation result +FINAL_VALIDATION_RESULT=0 + +# Check if validation failed because the app isn't in App Store Connect +if grep -q "No suitable application records were found" "${VALIDATION_OUTPUT}"; then + echo "⚠️ App Store Connect Warning: The app bundle identifier is not found in App Store Connect" + echo "This is expected for apps that haven't been registered in App Store Connect yet." + echo "This doesn't indicate a problem with the build or framework." + + # Perform alternative validation + echo "Performing alternative validation checks..." 
+ + # Check if IPA was created successfully + if [ -f "${IPA_PATH}" ] && [ -s "${IPA_PATH}" ]; then + echo "✅ IPA file created successfully" + else + echo "❌ IPA file not created or empty" + FINAL_VALIDATION_RESULT=1 + fi + + # Check if app binary exists and is executable + if [ -f "${TEMP_DIR}/Payload/${APP_NAME}.app/${APP_NAME}" ] && [ -x "${TEMP_DIR}/Payload/${APP_NAME}.app/${APP_NAME}" ]; then + echo "✅ App binary exists and is executable" + else + echo "❌ App binary missing or not executable" + FINAL_VALIDATION_RESULT=1 + fi + + # Check if framework was properly embedded + if [ -d "${TEMP_DIR}/Payload/${APP_NAME}.app/Frameworks/llama.framework" ]; then + echo "✅ llama.framework properly embedded" + else + echo "❌ llama.framework not properly embedded" + FINAL_VALIDATION_RESULT=1 + fi + + # Check if framework binary exists + if [ -f "${TEMP_DIR}/Payload/${APP_NAME}.app/Frameworks/llama.framework/llama" ]; then + echo "✅ Framework binary exists" + + # Further validate framework by checking architecture + ARCHS=$(lipo -info "${TEMP_DIR}/Payload/${APP_NAME}.app/Frameworks/llama.framework/llama" 2>/dev/null | grep -o "arm64\\|x86_64" | tr '\n' ' ') + if [ -n "$ARCHS" ]; then + echo "✅ Framework architecture(s): $ARCHS" + else + echo "⚠️ Could not determine framework architecture" + fi + else + echo "❌ Framework binary missing" + FINAL_VALIDATION_RESULT=1 + fi + + if [ $FINAL_VALIDATION_RESULT -eq 0 ]; then + echo "✅ Alternative validation PASSED: App built successfully with embedded framework" + else + echo "❌ Alternative validation FAILED: Issues found with the app or framework" + fi +elif grep -q "You must specify authentication credentials" "${VALIDATION_OUTPUT}" && [ -z "$AUTH_ARGS" ]; then + echo "✅ visionOS Validation PASSED: IPA successfully validated" + echo "Results saved to ${VALIDATION_OUTPUT}" +else + echo "❌ visionOS Validation FAILED: IPA validation found issues" + echo "See validation output at ${VALIDATION_OUTPUT}" + echo "" + echo "==== VALIDATION ERRORS ====" + + # Try to extract specific errors from the output + if grep -q "Error" "${VALIDATION_OUTPUT}"; then + grep -A 5 "Error" "${VALIDATION_OUTPUT}" + else + # If no specific error found, show the whole log + cat "${VALIDATION_OUTPUT}" + fi + + # Additional debugging: check IPA contents + echo "" + echo "==== IPA CONTENTS ====" + mkdir -p "${TEMP_DIR}/ipa_contents" + unzip -q "${IPA_PATH}" -d "${TEMP_DIR}/ipa_contents" + ls -la "${TEMP_DIR}/ipa_contents/Payload/${APP_NAME}.app/" + + # Check for code signing issues + echo "" + echo "==== CODE SIGNING INFO ====" + codesign -vv -d "${TEMP_DIR}/ipa_contents/Payload/${APP_NAME}.app" 2>&1 || echo "Code signing verification failed" + + # Check embedded frameworks + echo "" + echo "==== FRAMEWORK INFO ====" + ls -la "${TEMP_DIR}/ipa_contents/Payload/${APP_NAME}.app/Frameworks/" 2>/dev/null || echo "No Frameworks directory found" +fi + +# Don't clean up on error to allow inspection +if [ $FINAL_VALIDATION_RESULT -ne 0 ]; then + echo "" + echo "Temporary files kept for inspection at: ${TEMP_DIR}" + echo "===== visionOS Validation Process Failed =====" + exit 1 +fi + +# Clean up temporary files but keep build artifacts +if [ $FINAL_VALIDATION_RESULT -eq 0 ]; then + echo "Cleaning up temporary files..." 
+    #rm -rf "${TEMP_DIR}"
+fi
+
+echo "===== visionOS Validation Process Completed ====="
+exit $FINAL_VALIDATION_RESULT
diff --git a/spm-headers/ggml-alloc.h b/spm-headers/ggml-alloc.h
deleted file mode 120000
index 0361ffc38..000000000
--- a/spm-headers/ggml-alloc.h
+++ /dev/null
@@ -1 +0,0 @@
-../ggml/include/ggml-alloc.h
\ No newline at end of file
diff --git a/spm-headers/ggml-backend.h b/spm-headers/ggml-backend.h
deleted file mode 120000
index 7295f0f0d..000000000
--- a/spm-headers/ggml-backend.h
+++ /dev/null
@@ -1 +0,0 @@
-../ggml/include/ggml-backend.h
\ No newline at end of file
diff --git a/spm-headers/ggml-cpp.h b/spm-headers/ggml-cpp.h
deleted file mode 120000
index 8a8604cc2..000000000
--- a/spm-headers/ggml-cpp.h
+++ /dev/null
@@ -1 +0,0 @@
-../ggml/include/ggml-cpp.h
\ No newline at end of file
diff --git a/spm-headers/ggml-cpu.h b/spm-headers/ggml-cpu.h
deleted file mode 120000
index 66e629607..000000000
--- a/spm-headers/ggml-cpu.h
+++ /dev/null
@@ -1 +0,0 @@
-../ggml/include/ggml-cpu.h
\ No newline at end of file
diff --git a/spm-headers/ggml-metal.h b/spm-headers/ggml-metal.h
deleted file mode 120000
index aefad5fa0..000000000
--- a/spm-headers/ggml-metal.h
+++ /dev/null
@@ -1 +0,0 @@
-../ggml/include/ggml-metal.h
\ No newline at end of file
diff --git a/spm-headers/ggml.h b/spm-headers/ggml.h
deleted file mode 120000
index 0bdfeacbd..000000000
--- a/spm-headers/ggml.h
+++ /dev/null
@@ -1 +0,0 @@
-../ggml/include/ggml.h
\ No newline at end of file
diff --git a/spm-headers/llama.h b/spm-headers/llama.h
deleted file mode 120000
index b31388f0d..000000000
--- a/spm-headers/llama.h
+++ /dev/null
@@ -1 +0,0 @@
-../include/llama.h
\ No newline at end of file
diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp
index c13b3755a..3970b7485 100644
--- a/src/llama-mmap.cpp
+++ b/src/llama-mmap.cpp
@@ -35,6 +35,10 @@
 #include <io.h>
 #endif
 
+#if defined(__APPLE__)
+#include <TargetConditionals.h>
+#endif
+
 // TODO: consider moving to llama-impl.h if needed in more places
 #if defined(_WIN32)
 static std::string llama_format_win_err(DWORD err) {
@@ -472,7 +476,11 @@ struct llama_mlock::impl {
         char* errmsg = std::strerror(errno);
         bool suggest = (errno == ENOMEM);
-
+#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV)
+        // visionOS/tvOS don't support RLIMIT_MEMLOCK
+        // Skip resource limit checks on visionOS/tvOS
+        suggest = false;
+#else
         struct rlimit lock_limit;
         if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
             suggest = false;
@@ -480,6 +488,7 @@ struct llama_mlock::impl {
         if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
             suggest = false;
         }
+#endif
 
         LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s", size, this->size, errmsg,
                 suggest ? MLOCK_SUGGESTION : "");

From 06a92a193a07afe445929607be9d5e4d033956fb Mon Sep 17 00:00:00 2001
From: Clauszy
Date: Wed, 5 Mar 2025 15:25:45 +0800
Subject: [PATCH 05/19] server : fix cache reuse logic (#12161)

The first kv shift offsets the positions of all tokens after head_c.
When llama_kv_cache_seq_rm is then called with head_c, it removes
still-valid tokens, because their positions have already been offset.
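
To illustrate, here is a minimal, self-contained sketch (not part of the
patch): rm_range() and add_range() are simplified stand-ins for
llama_kv_cache_seq_rm() and llama_kv_cache_seq_add(), which operate on
half-open position ranges [p0, p1), with a negative p1 meaning "to the end".

    #include <cstdio>
    #include <vector>

    // -1 marks a removed cell; other cells hold the token's position in the sequence
    static void rm_range(std::vector<int> & pos, int p0, int p1) {
        for (auto & p : pos) if (p >= p0 && p < p1) p = -1;
    }

    static void add_range(std::vector<int> & pos, int p0, int p1, int delta) {
        if (p1 < 0) p1 = 1 << 30; // negative p1 means "to the end"
        for (auto & p : pos) if (p >= p0 && p < p1) p += delta;
    }

    int main() {
        std::vector<int> pos = {0, 1, 2, 3, 4, 5, 6, 7};
        // reuse one matched chunk of n_match = 2 tokens, moving it from head_c = 4 to head_p = 2
        const int head_p = 2, head_c = 4, n_match = 2;
        rm_range(pos, head_p, head_c);
        // buggy version: add_range(pos, head_c, -1, head_p - head_c) would also
        // shift positions 6 and 7, so the next iteration's rm_range, computed
        // from the original positions, would remove tokens that are still valid
        add_range(pos, head_c, head_c + n_match, head_p - head_c);
        for (int p : pos) printf("%d ", p); // prints: 0 1 -1 -1 2 3 6 7
        return 0;
    }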
---
 examples/server/server.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 2306dc26f..e4f7e43fd 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -3003,7 +3003,7 @@ struct server_context {
                     const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;

                     llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
-                    llama_kv_cache_seq_add(ctx, slot.id, head_c, -1, kv_shift);
+                    llama_kv_cache_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);

                     for (size_t i = 0; i < n_match; i++) {
                         slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];

From 3ccbfe5a71c74ac574b00607067d0aa0a49df04c Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Wed, 5 Mar 2025 08:34:02 +0100
Subject: [PATCH 06/19] ci : remove xcframework upload (#12190)

* ci : remove xcframework upload

This commit removes the upload of the xcframework zip file as an
artifact.

The motivation for this change is that the xcframework zip file is
currently being uploaded as part of a strategy matrix, so the upload is
attempted multiple times and fails the build. The upload should be
moved elsewhere in the workflow to avoid this.

* ci : add xcframework upload to macos-latest job
---
 .github/workflows/build.yml | 50 ++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index b0c180f26..ad6b33d9b 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -716,31 +716,6 @@ jobs:
         run: |
           ./build-xcframework.sh
 
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        run: |
-          zip -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
-
-      - name: Upload artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
-
   windows-msys2:
     runs-on: windows-latest
 
@@ -1369,6 +1344,31 @@ jobs:
       - name: Build Xcode project
         run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
 
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          zip -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
+
   android-build:
     runs-on: ubuntu-latest

From fa31c438e0e709242ab7334d26fc3be3dcda07a0 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Wed, 5 Mar 2025 10:22:29 +0100
Subject: [PATCH 07/19] ci : fix xcframework artifact tag (#12191)

This commit adds the name parameter to the upload-artifact action to
ensure that the artifact is uploaded with the correct name.

The motivation for this is that currently the uploaded xcframework is
named llama-b1-xcframework.zip. With this change, the name of this
artifact should contain the build number, like the other artifacts.
---
 .github/workflows/build.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index ad6b33d9b..65f068a9b 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -1368,6 +1368,7 @@ jobs:
         uses: actions/upload-artifact@v4
         with:
           path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
+          name: llama-${{ steps.tag.outputs.name }}-xcframework
 
   android-build:
     runs-on: ubuntu-latest

From 669912d9a5bf927312c553332ff997f0a99da8fb Mon Sep 17 00:00:00 2001
From: Olivier Chafik
Date: Wed, 5 Mar 2025 13:05:13 +0000
Subject: [PATCH 08/19] `tool-call`: fix Qwen 2.5 Coder support, add micro
 benchmarks, support trigger patterns for lazy grammars (#12034)

* sampler: turn lazy grammar trigger words to regexes

* add scripts/tool_bench.sh & .py

* constrain llama json output regardless of function name if matches at beginning

* update relaxed newline space rule in grammar tests

* support add_generation_prompt query parameter (useful for /apply_template)

* Update src/llama-grammar.cpp

Co-authored-by: Georgi Gerganov

---------

Co-authored-by: Georgi Gerganov
---
 README.md                                     |   2 +-
 common/chat.cpp                               | 437 ++++++++++++------
 common/common.cpp                             |  28 +-
 common/common.h                               |  21 +-
 common/json-schema-to-grammar.cpp             |   9 +-
 common/json-schema-to-grammar.h               |   1 -
 common/sampling.cpp                           |  51 +-
 examples/json_schema_to_grammar.py            |   2 +-
 .../public_legacy/json-schema-to-grammar.mjs  |   2 +-
 examples/server/server.cpp                    |  62 +--
 examples/server/tests/unit/test_tool_call.py  | 236 ++++++----
 examples/server/tests/utils.py                |  20 +-
 examples/server/utils.hpp                     |   6 +-
 include/llama.h                               |  22 +-
 models/templates/README.md                    |   2 +-
 requirements.txt                              |   1 +
 requirements/requirements-all.txt             |   1 +
 requirements/requirements-tool_bench.txt      |  12 +
 scripts/fetch_server_test_models.py           |   2 +-
 scripts/tool_bench.py                         | 368 +++++++++++++++
 scripts/tool_bench.sh                         |  66 +++
 src/llama-grammar.cpp                         |  44 +-
 src/llama-grammar.h                           |  15 +-
 src/llama-sampling.cpp                        |  53 ++-
 tests/test-chat.cpp                           | 133 +++++-
 tests/test-json-schema-to-grammar.cpp         | 126 ++---
 26 files changed, 1314 insertions(+), 408 deletions(-)
 mode change 100644 => 100755 examples/server/tests/unit/test_tool_call.py
 create mode 100644 requirements/requirements-tool_bench.txt
 create mode 100755 scripts/tool_bench.py
 create mode 100755 scripts/tool_bench.sh

diff --git a/README.md b/README.md
index ffb59a422..d73b0495d 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 
 - **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** 
https://github.com/ggml-org/llama.cpp/pull/11427 - **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode -- Universal tool call support in `llama-server`: https://github.com/ggml-org/llama.cpp/pull/9639 +- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim - Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123 - Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669 diff --git a/common/chat.cpp b/common/chat.cpp index 9ebe4c578..1b10219cc 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -449,12 +449,6 @@ std::string common_chat_format_name(common_chat_format format) { } } -const common_grammar_options grammar_options { - /* .dotall = */ false, - /* .compact_spaces = */ false, - // /* .compact_spaces = */ true, -}; - static bool parse_json(std::string::const_iterator & it, const std::string::const_iterator & end, json & out) { // // https://json.nlohmann.me/features/parsing/sax_interface/ struct json_error_locator : public nlohmann::json_sax { @@ -500,6 +494,34 @@ static bool parse_json(std::string::const_iterator & it, const std::string::cons } } +static bool parse_literal(std::string::const_iterator & it, const std::string::const_iterator & end, const std::string & expected) { + auto expected_it = expected.begin(); + auto tmp_it = it; + while (tmp_it != end && expected_it != expected.end() && *tmp_it == *expected_it) { + ++tmp_it; + ++expected_it; + } + if (expected_it == expected.end()) { + it = tmp_it; + return true; + } + return false; +} + +static std::optional parse_pattern(std::string::const_iterator & it, const std::string::const_iterator & end, const std::regex & expected) { + std::smatch match; + if (std::regex_match(it, end, match, expected)) { + it = match.suffix().first; + return match; + } + return std::nullopt; +} + +static void consume_spaces(std::string::const_iterator & it, const std::string::const_iterator & end) { + while (it != end && std::isspace(*it)) { + ++it; + } +} /** * Takes a prefix regex that must have 1 group to capture the function name, a closing suffix, and expects json parameters in between. @@ -509,7 +531,8 @@ static common_chat_msg parse_json_tool_calls( const std::string& input, const std::optional & trigger_opt, const std::regex & function_regex, - const std::regex & close_regex) { + const std::regex & close_regex, + bool allow_raw_python = false) { std::smatch match; common_chat_msg result; @@ -540,14 +563,19 @@ static common_chat_msg parse_json_tool_calls( it = rit->suffix().first; json arguments; - if (!parse_json(it, end, arguments)) { + if (parse_json(it, end, arguments)) { + if (!std::regex_search(it, end, match, close_regex)) { + throw std::runtime_error("Malformed input, missing closing pattern: " + input); + } + it = match.suffix().first; + result.tool_calls.push_back({name, arguments.is_string() ? 
arguments.get() : arguments.dump(), /* id= */ ""}); + } else { + if (allow_raw_python && name == "python") { + result.tool_calls.push_back({name, json({{"code", std::string(it, end)}}).dump(), /* id= */ ""}); + break; + } throw std::runtime_error("Failed to parse json tool call arguments: " + input); } - if (!std::regex_search(it, end, match, close_regex)) { - throw std::runtime_error("Malformed input, missing closing pattern: " + input); - } - it = match.suffix().first; - result.tool_calls.push_back({name, arguments.is_string() ? arguments.get() : arguments.dump(), /* id= */ ""}); } if (!result.tool_calls.empty()) { @@ -559,29 +587,29 @@ static common_chat_msg parse_json_tool_calls( return result; } +static common_chat_tool_call process_tool_call(const json & tool_call) { + const auto & arguments = tool_call.at("arguments"); + return { + /* .name = */ tool_call.at("name"), + /* .arguments = */ arguments.is_string() ? arguments.get() : arguments.dump(), + /* .id = */ tool_call.contains("id") ? tool_call.at("id") : "", + }; +} static common_chat_msg parse_prefixed_json_tool_call_array(const std::string& input, const std::string & prefix, size_t rstrip_prefix = 0) { auto content_end = input.find(prefix); size_t tc_start = std::string::npos; common_chat_msg result; result.role = "assistant"; - const auto process_tool_calls = [&](const json & tool_calls) { - for (const auto & tool_call : tool_calls) { - const auto & arguments = tool_call.at("arguments"); - result.tool_calls.push_back({ - tool_call.at("name"), - arguments.is_string() ? arguments.get() : arguments.dump(), - tool_call.contains("id") ? tool_call.at("id") : "", - }); - } - }; if (content_end == std::string::npos) { result.content = input; } else { tc_start = content_end + prefix.size() - rstrip_prefix; result.content = input.substr(0, content_end); auto tool_calls = json::parse(input.substr(tc_start)); - process_tool_calls(tool_calls); + for (const auto & tool_call : tool_calls) { + result.tool_calls.emplace_back(process_tool_call(tool_call)); + } } return result; } @@ -700,7 +728,7 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp data.grammar_lazy = false; data.grammar = build_grammar([&](const common_grammar_builder & builder) { builder.add_schema("root", schema); - }, grammar_options); + }); auto tweaked_messages = common_chat_template::add_system( inputs.messages, @@ -770,8 +798,11 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat schema["maxItems"] = 1; } builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema)); - }, grammar_options); - data.grammar_triggers.push_back({"[TOOL_CALLS]", /* .at_start = */ true}); + }); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"}); + data.preserved_tokens = { + "[TOOL_CALLS]", + }; data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? 
json() : inputs.tools, inputs.add_generation_prompt); data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO; return data; @@ -813,14 +844,18 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_ schema["maxItems"] = 1; } builder.add_rule("root", "\"<|START_ACTION|>\" " + builder.add_schema("tool_calls", schema) + " \"<|END_ACTION|>\""); - }, grammar_options); - data.grammar_triggers.push_back({"<|START_ACTION|>", /* .at_start = */ false}); + }); + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_WORD, + "<|START_ACTION|>", + }); data.preserved_tokens = { + "<|START_ACTION|>", + "<|END_ACTION|>", "<|START_RESPONSE|>", "<|END_RESPONSE|>", "<|START_THINKING|>", "<|END_THINKING|>", - "<|END_ACTION|>", }; auto adjusted_messages = json::array(); for (const auto & msg : inputs.messages) { @@ -840,9 +875,9 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_ return data; } static common_chat_msg common_chat_parse_command_r7b(const std::string & input, bool extract_reasoning) { - static std::regex thought_regex("(<\\|START_THINKING\\|>([\\s\\S\\n\\r]*?)<\\|END_THINKING\\|>)([\\s\\S\\n\\r]*)"); - static std::regex action_regex("<\\|START_ACTION\\|>([\\s\\S\\n\\r]*?)<\\|END_ACTION\\|>"); - static std::regex response_regex("(?:<\\|START_RESPONSE\\|>)?([\\s\\S\\n\\r]*?)<\\|END_RESPONSE\\|>"); + static std::regex thought_regex("(<\\|START_THINKING\\|>([\\s\\S]*?)<\\|END_THINKING\\|>)([\\s\\S]*)"); + static std::regex action_regex("<\\|START_ACTION\\|>([\\s\\S]*?)<\\|END_ACTION\\|>"); + static std::regex response_regex("(?:<\\|START_RESPONSE\\|>)?([\\s\\S]*?)<\\|END_RESPONSE\\|>"); std::smatch match; @@ -945,23 +980,23 @@ static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const com builder.add_rule( name + "-call", "\"{\" space " - "( \"\\\"type\\\":\" space \"\\\"function\\\",\" space )? " - "\"\\\"name\\\": \\\"" + name + "\\\", \\\"parameters\\\": \" " + - builder.add_schema(name + "-args", parameters) + - " \"}\"")); - data.grammar_triggers.push_back({"{\"name\": \"" + name + "\"", /* .at_start = */ true}); + "( \"\\\"type\\\"\" space \":\" space \"\\\"function\\\"\" space \",\" space )? " + " \"\\\"name\\\"\" space \":\" space \"\\\"" + name + "\\\"\" space \",\" space " + " \"\\\"parameters\\\"\" space \":\" space " + builder.add_schema(name + "-args", parameters) + " " + "\"}\" space")); + }); + // Small models may hallucinate function names so we match anything (*at the start*) that looks like the JSON of a function call, regardless of the name. 
+ data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START, + "\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"", // + name + "\"[\\s\\S]*", }); - data.grammar_triggers.push_back({"{\"name\":", /* .at_start = */ true}); - data.grammar_triggers.push_back({"{\n \"name\":", /* .at_start = */ true}); - data.grammar_triggers.push_back({"{\n \"name\":", /* .at_start = */ true}); - data.grammar_triggers.push_back({"{\"type\": \"function\"", /* .at_start = */ true}); - data.grammar_triggers.push_back({"{\n \"type\": \"function\"", /* .at_start = */ true}); - data.grammar_triggers.push_back({"{\n \"type\": \"function\"", /* .at_start = */ true}); if (!builtin_tools.empty()) { - data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false}); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"}); + data.preserved_tokens.push_back("<|python_tag|>"); } + // Allow a few empty lines on top of the usual constrained json schema space rule. builder.add_rule("root", string_join(tool_rules, " | ")); - }, grammar_options); + }); data.additional_stops.push_back("<|eom_id|>"); data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, { {"tools_in_user_message", false}, @@ -974,33 +1009,33 @@ static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const com } static common_chat_msg common_chat_parse_llama_3_1(const std::string & input, bool with_builtin_tools = false) { // TODO: tighten & simplify the parser, don't accept leading text context. - static std::regex function_regex("\\{[\\s\\n\\r]*(?:\"type\"[\\s\\n\\r]*:[\\s\\n\\r]*\"function\"[\\s\\n\\r]*,[\\s\\n\\r]*|[\\s\\n\\r]*)\"name\"[\\s\\n\\r]*:[\\s\\n\\r]*\"([^\"]+)\"[\\s\\n\\r]*,[\\s\\n\\r]*\"parameters\": "); - static std::regex close_regex("\\}"); - static std::regex builtin_call_regex("<\\|python_tag\\|>([^.(]+)\\.call\\((.*)\\)"); + static std::regex function_regex( + "\\s*\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"([^\"]+)\"\\s*,\\s*\"parameters\"\\s*: "); + static std::regex close_regex("\\}\\s*"); + static std::regex builtin_call_regex("<\\|python_tag\\|>\\s*([^.(]+)\\s*\\.\\s*call\\s*\\(\\s*([\\w]+)\\s*=\\s*([\\s\\S]*?)\\)"); if (with_builtin_tools) { std::smatch match; if (std::regex_match(input, match, builtin_call_regex)) { - auto name = match[1].str(); - auto raw_args = match[2].str(); + try { + auto name = match[1].str(); + auto arg_name = match[2].str(); + auto arg_value_str = match[3].str(); + auto arg_value = json::parse(arg_value_str); - // TODO: if/when builtin tools start accepting more than 1 argument, use parse_json for real parsing. 
- auto it_eq = raw_args.find('='); - auto arg_name = raw_args.substr(0, it_eq); - auto arg_value_str = raw_args.substr(it_eq + 1); - auto arg_value = json::parse(arg_value_str); - - common_chat_msg msg; - msg.role = "assistant"; - msg.content = match.prefix().str(); - msg.tool_calls.push_back({ - /* .name = */ name, - /* .arguments = */ (json { - {arg_name, arg_value}, - }).dump(), - /* .id = */ "", - }); - return msg; + common_chat_msg msg; + msg.role = "assistant"; + msg.tool_calls.push_back({ + /* .name = */ name, + /* .arguments = */ (json { + {arg_name, arg_value}, + }).dump(), + /* .id = */ "", + }); + return msg; + } catch (const std::exception & e) { + LOG_WRN("Failed to parse builtin tool call arguments (%s): %s", e.what(), input.c_str()); + } } } return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex); @@ -1017,10 +1052,10 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_ std::string name = function.at("name"); auto parameters = function.at("parameters"); builder.resolve_refs(parameters); - auto args_rule = builder.add_schema(name + "-args", parameters); tool_rules.push_back(builder.add_rule(name + "-call", "\"<|tool▁call▁begin|>function<|tool▁sep|>" + name + "\\n" - "```json\\n\" " + args_rule + " \"```<|tool▁call▁end|>\"")); + "```json\\n\" " + builder.add_schema(name + "-args", parameters) + " " + "\"```<|tool▁call▁end|>\"")); }); // Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag, // so we accept common variants (then it's all constrained) @@ -1029,18 +1064,20 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_ "(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " " "\"<|tool▁calls▁end|>\"" " space"); - data.grammar_triggers.push_back({"<|tool▁calls▁begin|>", /* .at_start = */ false}); - data.grammar_triggers.push_back({"<|tool_calls_begin|>", /* .at_start = */ false}); - data.grammar_triggers.push_back({"<|tool calls begin|>", /* .at_start = */ false}); - data.grammar_triggers.push_back({"<|tool\\_calls\\_begin|>", /* .at_start = */ false}); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool▁calls▁begin|>"}); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool_calls_begin|>"}); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool calls begin|>"}); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool\\_calls\\_begin|>"}); data.preserved_tokens = { "", "", + "<|tool▁calls▁begin|>", + "<|tool▁call▁begin|>", "<|tool▁sep|>", - "<|tool▁calls▁end|", "<|tool▁call▁end|>", + "<|tool▁calls▁end|", }; - }, grammar_options); + }); } auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); @@ -1129,8 +1166,11 @@ static common_chat_params common_chat_params_init_firefunction_v2(const common_c schema["maxItems"] = 1; } builder.add_rule("root", "\" functools\"? 
" + builder.add_schema("tool_calls", schema)); - }, grammar_options); - data.grammar_triggers.push_back({" functools[", /* .at_start = */ false}); + }); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, " functools["}); + data.preserved_tokens = { + " functools[", + }; data.format = COMMON_CHAT_FORMAT_FIREFUNCTION_V2; } else { data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY; @@ -1158,11 +1198,28 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_ auto parameters = function.at("parameters"); builder.resolve_refs(parameters); auto args_rule = builder.add_schema(name + "-args", parameters); - first_tool_rules.push_back(builder.add_rule(name + "-call", "\"" + name + "\\n\" " + args_rule)); + first_tool_rules.push_back(builder.add_rule(name + "-call", "( \"assistant<|end_header_id|>\\n\" )? \"" + name + "\\n\" " + args_rule)); subsequent_tool_rules.push_back(builder.add_rule(name + "-call2", "\">>>" + name + "\\n\" " + args_rule)); - data.grammar_triggers.push_back({name, /* .at_start = */ true}); - data.grammar_triggers.push_back({">>>" + name, /* .at_start = */ false}); + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START, + regex_escape(name + "\n"), + }); + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START, + regex_escape("assistant<|end_header_id|>\n" + name + "\n"), + }); + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_WORD, + regex_escape(">>>" + name + "\n"), + }); + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_WORD, + ">>>assistant<|end_header_id|>\n" + name, + }); }); + data.preserved_tokens = { + "<|end_header_id|>", + }; auto first_rule = first_tool_rules.empty() ? "" : builder.add_rule("first_tool_call", string_join(first_tool_rules, " | ")) + " space"; if (inputs.parallel_tool_calls) { auto subsequent_rule = builder.add_rule("subsequent_tool_call", string_join(subsequent_tool_rules, " | ")) + " space"; @@ -1171,34 +1228,20 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_ builder.add_rule("root", first_rule); } - }, grammar_options); + }); } return data; } -static bool consume(std::string::const_iterator & it, const std::string::const_iterator & end, const std::string & expected) { - auto expected_it = expected.begin(); - auto tmp_it = it; - while (tmp_it != end && expected_it != expected.end() && *tmp_it == *expected_it) { - ++tmp_it; - ++expected_it; - } - if (expected_it == expected.end()) { - it = tmp_it; - return true; - } - return false; -} - static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & input) { - static std::regex function_regex(R"((?:>>>)?(\w+)\n)"); + static std::regex function_regex(R"((?:>>>)?(?:assistant<|end_header_id|>\n)?(\w+)\n)"); static std::regex close_regex(R"($|(?=>>>))"); std::string content; auto it = input.begin(); const auto end = input.end(); - if (consume(it, end, "all\n")) { + if (parse_literal(it, end, "all\n")) { std::smatch match; if (std::regex_search(it, end, match, function_regex)) { auto fun_it = match.prefix().second; @@ -1213,7 +1256,7 @@ static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & in } // TODO: tighten & simplify. 
try { - auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex); + auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex, /* allow_raw_python= */ true); res.content = content + res.content; return res; } catch (const std::exception & e) { @@ -1266,12 +1309,13 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con }); if (has_raw_python) { tool_rules.push_back(builder.add_rule("python-call", "\"<|python_tag|>\" .*")); - data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false}); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"}); + data.preserved_tokens.push_back("<|python_tag|>"); } auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " space"; builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call); - data.grammar_triggers.push_back({"\" space " + + builder.add_schema(name + "-args", parameters) + " " + "\"\" space")); + + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_WORD, + "", + }); + auto escaped_name = regex_escape(name); + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, + "\" space " + builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " \"\" space"; + auto any_tool_call = builder.add_rule("any_tool_call", "( " + string_join(tool_rules, " | ") + " ) space"); + std::vector alt_tags { + any_tool_call, + "\"\" space " + any_tool_call + " \"\"", + // The rest is just to accommodate common "good bad" outputs. + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + }; + auto wrappable_tool_call = builder.add_rule("wrappable_tool_call", "( " + string_join(alt_tags, " | ") + " ) space"); + tool_call_alts.push_back(wrappable_tool_call); + tool_call_alts.push_back( + "( \"```\\n\" | \"```json\\n\" | \"```xml\\n\" ) space " + wrappable_tool_call + " space \"```\" space "); + auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | ")); builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call); - data.grammar_triggers.push_back({"", /* .at_start = */ false}); - data.preserved_tokens = { "" }; - }, grammar_options); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, ""}); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "|||)?\\s*\\{\\s*\"", //name\"\\s*:\\s*\"" + escaped_name + "\"", + }); + data.preserved_tokens = { + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "```", + "```json", + "```xml", + }; + }); data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO; return data; } -static common_chat_msg common_chat_parse_hermes_2_pro(const std::string & input) { +static common_chat_msg common_chat_parse_hermes_2_pro(const std::string& input) { + const static std::regex open_regex( + "(?:" + "(```(?:xml|json)?\\n\\s*)?" // match 1 (block_start) + "(" // match 2 (open_tag) + "|" + "|" + "|" + "|" + "|" + "|" + "|" + ")?" 
+ "(\\s*\\{\\s*\"name\"\\s*:[\\s\\S]*)" // match 3 (named tool call + rest) + ")" + "|" + "(?:]+)>" // match 4 (function name) + "|)" // match 5 (function name again) + "([\\s\\S]*)" // match 6 (function arguments + rest)})" + ); + try { - std::regex start_pattern(R"([\n\s]*)"); - std::regex middle_pattern(R"([\n\s]*[\n\s]*)"); - std::regex end_pattern(R"([\n\s]*[\n\s]*$)"); common_chat_msg msg; msg.role = "assistant"; - auto end = input.end(); - std::sregex_iterator rend; - std::sregex_iterator rit(input.begin(), end, start_pattern); - if (rit == rend) { - msg.content = input; - return msg; - } + std::string::const_iterator it = input.begin(); + const std::string::const_iterator end = input.end(); + std::smatch match; - msg.content = rit->prefix(); - - auto it = rit->suffix().first; while (it != end) { - json call; - if (!parse_json(it, end, call)) { - throw std::runtime_error("Failed to parse json tool call"); - } - const auto & arguments = call.at("arguments"); - msg.tool_calls.push_back({ - call.at("name"), - arguments.dump(), - // arguments.is_string() ? arguments.get() : arguments.dump(), - /* id= */ "", - }); - rit = {it, end, middle_pattern}; - if (rit != rend) { - it = rit->suffix().first; - } else { - rit = {it, end, end_pattern}; - if (rit == rend) { - throw std::runtime_error("Malformed input, missing "); + if (std::regex_search(it, end, match, open_regex)) { + // Add content before the match + msg.content += std::string(it, match[0].first); + + auto block_start = match[1].str(); + std::string block_end = block_start.empty() ? "" : "```"; + + auto open_tag = match[2].str(); + std::string close_tag; + + if (match[3].matched) { + close_tag = open_tag.empty() ? "" : ""; + // Start parsing from after the opening tags + auto json_it = match[6].first; + json arguments; + if (parse_json(json_it, end, arguments)) { + msg.tool_calls.emplace_back(process_tool_call({ + {"name", function_name}, + {"arguments", arguments}, + })); + it = json_it; // Move iterator past parsed JSON + + // Handle close tags + consume_spaces(it, end); + if (!close_tag.empty() && !parse_literal(it, end, close_tag)) { + throw std::runtime_error("Failed to parse closing tag"); + } + consume_spaces(it, end); + if (!block_end.empty() && !parse_literal(it, end, block_end)) { + throw std::runtime_error("Failed to parse block end"); + } + consume_spaces(it, end); + } else { + // Not a valid tool call, treat as content + msg.content += std::string(match[0].first, match[0].second); + it = match[0].second; + } } + } else { + // Add remaining content + msg.content += std::string(it, end); break; } } diff --git a/common/common.cpp b/common/common.cpp index d2b0d50e3..6448b7b03 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -10,7 +10,6 @@ // Change JSON_ASSERT from assert() to GGML_ASSERT: #define JSON_ASSERT GGML_ASSERT #include "json.hpp" -#include "json-schema-to-grammar.h" #include "llama.h" #include @@ -483,6 +482,11 @@ void string_replace_all(std::string & s, const std::string & search, const std:: s = std::move(builder); } +std::string regex_escape(const std::string & s) { + static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]"); + return std::regex_replace(s, special_chars, "\\$0"); +} + std::string string_join(const std::vector & values, const std::string & separator) { std::ostringstream result; for (size_t i = 0; i < values.size(); ++i) { @@ -2026,3 +2030,25 @@ common_control_vector_data common_control_vector_load(const std::vector +json common_grammar_trigger::to_json() const { + json out { + 
{"type", (int) type}, + {"value", value}, + }; + if (type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) { + out["token"] = (int) token; + } + return out; +} + +template <> +common_grammar_trigger common_grammar_trigger::from_json(const json & in) { + common_grammar_trigger out; + out.type = (common_grammar_trigger_type) in.at("type").get(); + out.value = in.at("value").get(); + if (out.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) { + out.token = (llama_token) in.at("token").get(); + } + return out; +} diff --git a/common/common.h b/common/common.h index f4b4a96fb..733f7f1c8 100644 --- a/common/common.h +++ b/common/common.h @@ -110,9 +110,21 @@ enum common_conversation_mode { COMMON_CONVERSATION_MODE_AUTO = 2, }; +enum common_grammar_trigger_type { + COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN, + COMMON_GRAMMAR_TRIGGER_TYPE_WORD, + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START, +}; + struct common_grammar_trigger { - std::string word; - bool at_start; + common_grammar_trigger_type type; + std::string value; + llama_token token = LLAMA_TOKEN_NULL; + + // T can only be nlohmann::ordered_json + template T to_json() const; + template static common_grammar_trigger from_json(const T & in); }; // sampling parameters @@ -163,8 +175,7 @@ struct common_params_sampling { std::string grammar; // optional BNF-like grammar to constrain sampling bool grammar_lazy = false; - std::vector grammar_trigger_words; // optional trigger words to trigger lazy grammar - std::vector grammar_trigger_tokens; // optional trigger tokens to trigger lazy grammar and print trigger special tokens. + std::vector grammar_triggers; // optional triggers (for lazy grammars) std::set preserved_tokens; std::vector logit_bias; // logit biases to apply @@ -458,6 +469,8 @@ std::string string_repeat(const std::string & str, size_t n); void string_replace_all(std::string & s, const std::string & search, const std::string & replace); +std::string regex_escape(const std::string & s); + template static std::vector string_split(const std::string & str, char delim) { static_assert(!std::is_same::value, "Please use the specialized version for std::string"); diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 3ebcc3d9f..906798225 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -264,7 +264,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream & throw std::runtime_error("At least one of min_value or max_value must be set"); } -const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}"; +const std::string SPACE_RULE = "| \" \" | \"\\n\"{1,2} [ \\t]{0,20}"; struct BuiltinRule { std::string content; @@ -764,11 +764,10 @@ private: public: SchemaConverter( const std::function & fetch_json, - bool dotall, - bool compact_spaces) + bool dotall) : _fetch_json(fetch_json), _dotall(dotall) { - _rules["space"] = compact_spaces ? "\" \"?" 
: SPACE_RULE; + _rules["space"] = SPACE_RULE; } void resolve_refs(json & schema, const std::string & url) { @@ -1007,7 +1006,7 @@ std::string json_schema_to_grammar(const json & schema, bool force_gbnf) { } std::string build_grammar(const std::function & cb, const common_grammar_options & options) { - SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall, options.compact_spaces); + SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall); common_grammar_builder builder { /* .add_rule = */ [&](const std::string & name, const std::string & rule) { return converter._add_rule(name, rule); diff --git a/common/json-schema-to-grammar.h b/common/json-schema-to-grammar.h index 62a3b0a44..4613f5d9f 100644 --- a/common/json-schema-to-grammar.h +++ b/common/json-schema-to-grammar.h @@ -16,7 +16,6 @@ struct common_grammar_builder { struct common_grammar_options { bool dotall = false; - bool compact_spaces = false; }; std::string build_grammar(const std::function & cb, const common_grammar_options & options = {}); diff --git a/common/sampling.cpp b/common/sampling.cpp index 972fbe9da..baf22066d 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -160,16 +160,53 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled"); #endif // LLAMA_USE_LLGUIDANCE } else { - std::vector trigger_words; - trigger_words.reserve(params.grammar_trigger_words.size()); - for (const auto & str : params.grammar_trigger_words) { - trigger_words.push_back(str.word.c_str()); + std::vector patterns_at_start; + std::vector patterns_anywhere; + std::vector trigger_tokens; + for (const auto & trigger : params.grammar_triggers) { + switch (trigger.type) { + case COMMON_GRAMMAR_TRIGGER_TYPE_WORD: + { + const auto & word = trigger.value; + patterns_anywhere.push_back(regex_escape(word)); + break; + } + case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN: + case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START: + { + const auto & pattern = trigger.value; + (trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START ? patterns_at_start : patterns_anywhere).push_back(pattern); + break; + } + case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN: + { + const auto token = trigger.token; + trigger_tokens.push_back(token); + break; + } + default: + GGML_ASSERT(false && "unknown trigger type"); + } + } + + std::vector trigger_patterns; + if (!patterns_at_start.empty()) { + trigger_patterns.push_back("^(" + string_join(patterns_at_start, "|") + ")[\\s\\S]*"); + } + if (!patterns_anywhere.empty()) { + trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*"); + } + + std::vector trigger_patterns_c; + trigger_patterns_c.reserve(trigger_patterns.size()); + for (const auto & regex : trigger_patterns) { + trigger_patterns_c.push_back(regex.c_str()); } grmr = params.grammar_lazy - ? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root", - trigger_words.data(), trigger_words.size(), - params.grammar_trigger_tokens.data(), params.grammar_trigger_tokens.size()) + ? 
llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root", + trigger_patterns_c.data(), trigger_patterns_c.size(), + trigger_tokens.data(), trigger_tokens.size()) : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"); } diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index fc9f0097f..55f94c0b0 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -195,7 +195,7 @@ class BuiltinRule: self.deps = deps or [] # Constraining spaces to prevent model "running away". -SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}' +SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}' PRIMITIVE_RULES = { 'boolean' : BuiltinRule('("true" | "false") space', []), diff --git a/examples/server/public_legacy/json-schema-to-grammar.mjs b/examples/server/public_legacy/json-schema-to-grammar.mjs index e67bb15c1..f767ce7b7 100644 --- a/examples/server/public_legacy/json-schema-to-grammar.mjs +++ b/examples/server/public_legacy/json-schema-to-grammar.mjs @@ -1,5 +1,5 @@ // WARNING: This file was ported from json_schema_to_grammar.py, please fix bugs / add features there first. -const SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}'; +const SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}'; function _buildRepetition(itemRule, minItems, maxItems, opts={}) { if (minItems === 0 && maxItems === 1) { diff --git a/examples/server/server.cpp b/examples/server/server.cpp index e4f7e43fd..2a526b0e7 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -131,9 +131,9 @@ struct slot_params { lora.push_back({{"id", i}, {"scale", this->lora[i].scale}}); } - std::vector grammar_trigger_words; - for (const auto & trigger : sampling.grammar_trigger_words) { - grammar_trigger_words.push_back(trigger.word); + auto grammar_triggers = json::array(); + for (const auto & trigger : sampling.grammar_triggers) { + grammar_triggers.push_back(trigger.to_json()); } return json { @@ -170,8 +170,8 @@ struct slot_params { {"n_probs", sampling.n_probs}, {"min_keep", sampling.min_keep}, {"grammar", sampling.grammar}, - {"grammar_trigger_words", grammar_trigger_words}, - {"grammar_trigger_tokens", sampling.grammar_trigger_tokens}, + {"grammar_lazy", sampling.grammar_lazy}, + {"grammar_triggers", grammar_triggers}, {"preserved_tokens", sampling.preserved_tokens}, {"chat_format", common_chat_format_name(oaicompat_chat_format)}, {"samplers", samplers}, @@ -356,24 +356,6 @@ struct server_task { } { - const auto grammar_triggers = data.find("grammar_triggers"); - if (grammar_triggers != data.end()) { - for (const auto & t : *grammar_triggers) { - common_grammar_trigger trigger; - trigger.word = t.at("word"); - trigger.at_start = t.at("at_start"); - - auto ids = common_tokenize(vocab, trigger.word, /* add_special= */ false, /* parse_special= */ true); - if (ids.size() == 1) { - SRV_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str()); - params.sampling.grammar_trigger_tokens.push_back(ids[0]); - params.sampling.preserved_tokens.insert(ids[0]); - continue; - } - SRV_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str()); - params.sampling.grammar_trigger_words.push_back(trigger); - } - } const auto preserved_tokens = data.find("preserved_tokens"); if (preserved_tokens != data.end()) { for (const auto & t : *preserved_tokens) { @@ -383,12 +365,38 @@ struct server_task { params.sampling.preserved_tokens.insert(ids[0]); } else { // This may happen when using a tool call style meant for a model with special tokens to preserve on a model 
without said tokens. - SRV_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n", t.get().c_str()); + SRV_DBG("Not preserved because more than 1 token: %s\n", t.get().c_str()); } } } - if (params.sampling.grammar_lazy) { - GGML_ASSERT(params.sampling.grammar_trigger_tokens.size() > 0 || params.sampling.grammar_trigger_words.size() > 0); + const auto grammar_triggers = data.find("grammar_triggers"); + if (grammar_triggers != data.end()) { + for (const auto & t : *grammar_triggers) { + auto ct = common_grammar_trigger::from_json(t); + if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) { + const auto & word = ct.value; + auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true); + if (ids.size() == 1) { + auto token = ids[0]; + if (std::find(params.sampling.preserved_tokens.begin(), params.sampling.preserved_tokens.end(), (llama_token) token) == params.sampling.preserved_tokens.end()) { + throw std::runtime_error("Grammar trigger word should be marked as preserved token: " + word); + } + SRV_DBG("Grammar trigger token: %d (`%s`)\n", token, word.c_str()); + common_grammar_trigger trigger; + trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN; + trigger.value = (llama_token) token; + params.sampling.grammar_triggers.push_back(trigger); + } else { + SRV_DBG("Grammar trigger word: `%s`\n", word.c_str()); + params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word}); + } + } else { + params.sampling.grammar_triggers.push_back(ct); + } + } + } + if (params.sampling.grammar_lazy && params.sampling.grammar_triggers.empty()) { + throw std::runtime_error("Error: no triggers set for lazy grammar!"); } } @@ -2045,7 +2053,7 @@ struct server_context { if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) { // Might be better to reject the request with a 400 ? 
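
For reference, with the enum ordering declared in common.h (TOKEN = 0, WORD = 1, PATTERN = 2, PATTERN_START = 3), a lazy-grammar request body under the new scheme could look like this sketch (the exact trigger set depends on the chat format):

    {
      "grammar_lazy": true,
      "grammar_triggers": [
        {"type": 1, "value": "<|python_tag|>"}
      ],
      "preserved_tokens": ["<|python_tag|>"]
    }

On models where "<|python_tag|>" tokenizes to a single preserved token, the handler above promotes that WORD trigger to a TOKEN trigger; a WORD trigger whose token is not preserved now raises an error instead of being silently mis-tokenized.
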
- SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.params.n_predict, slot.n_predict); + SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d\n", slot.params.n_predict, slot.n_predict); slot.params.n_predict = slot.n_predict; } diff --git a/examples/server/tests/unit/test_tool_call.py b/examples/server/tests/unit/test_tool_call.py old mode 100644 new mode 100755 index a91a2f333..25bddbaee --- a/examples/server/tests/unit/test_tool_call.py +++ b/examples/server/tests/unit/test_tool_call.py @@ -1,4 +1,12 @@ +#!/usr/bin/env python import pytest + +# ensure grandparent path is in sys.path +from pathlib import Path +import sys +path = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(path)) + from utils import * server: ServerProcess @@ -66,15 +74,8 @@ WEATHER_TOOL = { } -def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, argument_key: str | None): - global server - n_predict = 512 - # server = ServerPreset.stories15m_moe() - server.jinja = True - server.n_predict = n_predict - server.chat_template_file = f'../../../models/templates/{template_name}.jinja' - server.start(timeout_seconds=TIMEOUT_SERVER_START) - res = server.make_request("POST", "/chat/completions", data={ +def do_test_completion_with_required_tool_tiny(server: ServerProcess, tool: dict, argument_key: str | None, n_predict, **kwargs): + res = server.make_request("POST", "/v1/chat/completions", data={ "max_tokens": n_predict, "messages": [ {"role": "system", "content": "You are a coding assistant."}, @@ -83,16 +84,14 @@ def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, a "tool_choice": "required", "tools": [tool], "parallel_tool_calls": False, - "temperature": 0.0, - "top_k": 1, - "top_p": 1.0, + **kwargs, }) assert res.status_code == 200, f"Expected status code 200, got {res.status_code}" choice = res.body["choices"][0] tool_calls = choice["message"].get("tool_calls") assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}' tool_call = tool_calls[0] - assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}' + assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}' expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"] assert expected_function_name == tool_call["function"]["name"] actual_arguments = tool_call["function"]["arguments"] @@ -108,7 +107,14 @@ def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, a ("meta-llama-Llama-3.3-70B-Instruct", PYTHON_TOOL, "code"), ]) def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, argument_key: str | None): - do_test_completion_with_required_tool_tiny(template_name, tool, argument_key) + global server + n_predict = 512 + # server = ServerPreset.stories15m_moe() + server.jinja = True + server.n_predict = n_predict + server.chat_template_file = f'../../../models/templates/{template_name}.jinja' + server.start(timeout_seconds=TIMEOUT_SERVER_START) + do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, temperature=0.0, top_k=1, top_p=1.0) @pytest.mark.slow @@ -130,10 +136,17 @@ def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", TEST_TOOL, "success"), ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", PYTHON_TOOL, "code"), ("fireworks-ai-llama-3-firefunction-v2", 
TEST_TOOL, "success"), - ("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "code"), + # ("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "code"), ]) def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, argument_key: str | None): - do_test_completion_with_required_tool_tiny(template_name, tool, argument_key) + global server + n_predict = 512 + # server = ServerPreset.stories15m_moe() + server.jinja = True + server.n_predict = n_predict + server.chat_template_file = f'../../../models/templates/{template_name}.jinja' + server.start(timeout_seconds=TIMEOUT_SERVER_START) + do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict) @pytest.mark.slow @@ -142,25 +155,33 @@ def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, (PYTHON_TOOL, "code", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"), - # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it. (TEST_TOOL, "success", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), + (PYTHON_TOOL, "code", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", "chatml"), (TEST_TOOL, "success", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), + (TEST_TOOL, "success", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None), + (PYTHON_TOOL, "code", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None), + (PYTHON_TOOL, "code", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", "chatml"), + + (TEST_TOOL, "success", "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None), + (PYTHON_TOOL, "code", "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None), + (PYTHON_TOOL, "code", "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", "chatml"), + (TEST_TOOL, "success", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), (TEST_TOOL, "success", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), (PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), - # (PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"), + (PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"), (TEST_TOOL, "success", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")), (PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")), - # (PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"), + (PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"), (TEST_TOOL, "success", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), @@ -176,10 +197,10 @@ def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, (TEST_TOOL, "success", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), (PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", 
("meta-llama/Llama-3.2-3B-Instruct", None)), - # (PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"), - # TODO: fix these - # (TEST_TOOL, "success", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), - # (PYTHON_TOOL, "code", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), + (PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"), + + (TEST_TOOL, "success", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), + (PYTHON_TOOL, "code", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), ]) def test_completion_with_required_tool_real_model(tool: dict, argument_key: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None): global server @@ -197,7 +218,7 @@ def test_completion_with_required_tool_real_model(tool: dict, argument_key: str elif isinstance(template_override, str): server.chat_template = template_override server.start(timeout_seconds=TIMEOUT_SERVER_START) - res = server.make_request("POST", "/chat/completions", data={ + res = server.make_request("POST", "/v1/chat/completions", data={ "max_tokens": n_predict, "messages": [ {"role": "system", "content": "You are a coding assistant."}, @@ -215,7 +236,7 @@ def test_completion_with_required_tool_real_model(tool: dict, argument_key: str tool_calls = choice["message"].get("tool_calls") assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}' tool_call = tool_calls[0] - assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}' + # assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}' expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"] assert expected_function_name == tool_call["function"]["name"] actual_arguments = tool_call["function"]["arguments"] @@ -225,13 +246,8 @@ def test_completion_with_required_tool_real_model(tool: dict, argument_key: str assert argument_key in actual_arguments, f"tool arguments: {json.dumps(actual_arguments)}, expected: {argument_key}" -def do_test_completion_without_tool_call(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None): - global server - server.jinja = True - server.n_predict = n_predict - server.chat_template_file = f'../../../models/templates/{template_name}.jinja' - server.start(timeout_seconds=TIMEOUT_SERVER_START) - res = server.make_request("POST", "/chat/completions", data={ +def do_test_completion_without_tool_call(server: ServerProcess, n_predict: int, tools: list[dict], tool_choice: str | None, **kwargs): + res = server.make_request("POST", "/v1/chat/completions", data={ "max_tokens": n_predict, "messages": [ {"role": "system", "content": "You are a coding assistant."}, @@ -239,9 +255,7 @@ def do_test_completion_without_tool_call(template_name: str, n_predict: int, too ], "tools": tools if tools else None, "tool_choice": tool_choice, - "temperature": 0.0, - "top_k": 1, - "top_p": 1.0, + **kwargs, }, timeout=TIMEOUT_HTTP_REQUEST) assert res.status_code == 200, f"Expected status code 200, got {res.status_code}" choice = res.body["choices"][0] @@ -254,7 +268,12 @@ def do_test_completion_without_tool_call(template_name: str, n_predict: int, too ("meta-llama-Llama-3.3-70B-Instruct", 128, [PYTHON_TOOL], 'none'), ]) def test_completion_without_tool_call_fast(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None): - do_test_completion_without_tool_call(template_name, n_predict, 
tools, tool_choice) + global server + server.jinja = True + server.n_predict = n_predict + server.chat_template_file = f'../../../models/templates/{template_name}.jinja' + server.start(timeout_seconds=TIMEOUT_SERVER_START) + do_test_completion_without_tool_call(server, n_predict, tools, tool_choice) @pytest.mark.slow @@ -270,7 +289,12 @@ def test_completion_without_tool_call_fast(template_name: str, n_predict: int, t ("meta-llama-Llama-3.2-3B-Instruct", 256, [PYTHON_TOOL], 'none'), ]) def test_completion_without_tool_call_slow(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None): - do_test_completion_without_tool_call(template_name, n_predict, tools, tool_choice) + global server + server.jinja = True + server.n_predict = n_predict + server.chat_template_file = f'../../../models/templates/{template_name}.jinja' + server.start(timeout_seconds=TIMEOUT_SERVER_START) + do_test_completion_without_tool_call(server, n_predict, tools, tool_choice) @pytest.mark.slow @@ -281,6 +305,12 @@ def test_completion_without_tool_call_slow(template_name: str, n_predict: int, t ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), + ("bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None), + ("bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", "chatml"), + + ("bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None), + ("bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", "chatml"), + ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), @@ -324,48 +354,52 @@ def test_weather(hf_repo: str, template_override: str | Tuple[str, str | None] | elif isinstance(template_override, str): server.chat_template = template_override server.start(timeout_seconds=TIMEOUT_SERVER_START) - res = server.make_request("POST", "/chat/completions", data={ - "max_tokens": n_predict, + do_test_weather(server, max_tokens=n_predict) + + +def do_test_weather(server: ServerProcess, **kwargs): + res = server.make_request("POST", "/v1/chat/completions", data={ "messages": [ {"role": "system", "content": "You are a chatbot that uses tools/functions. 
Dont overthink things."}, {"role": "user", "content": "What is the weather in Istanbul?"}, ], "tools": [WEATHER_TOOL], + **kwargs, }, timeout=TIMEOUT_HTTP_REQUEST) assert res.status_code == 200, f"Expected status code 200, got {res.status_code}" choice = res.body["choices"][0] tool_calls = choice["message"].get("tool_calls") assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}' tool_call = tool_calls[0] - assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}' - assert tool_call["function"]["name"] == WEATHER_TOOL["function"]["name"] + # assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}' + assert tool_call["function"]["name"] == WEATHER_TOOL["function"]["name"], f'Expected weather tool call, got {tool_call["function"]["name"]}' actual_arguments = json.loads(tool_call["function"]["arguments"]) assert 'location' in actual_arguments, f"location not found in {json.dumps(actual_arguments)}" location = actual_arguments["location"] assert isinstance(location, str), f"Expected location to be a string, got {type(location)}: {json.dumps(location)}" - assert re.match('^Istanbul(, (TR|Turkey|Türkiye))?$', location), f'Expected Istanbul for location, got {location}' + assert re.match('^Istanbul(( |, ?)(TR|Turkey|Türkiye))?$', location), f'Expected Istanbul for location, got {location}' @pytest.mark.slow @pytest.mark.parametrize("result_override,n_predict,hf_repo,template_override", [ (None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), - (None, 128, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), + (None, 128, "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None), + (None, 128, "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", "chatml"), (None, 128, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), (None, 128, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), (None, 128, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")), (None, 128, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)), - (None, 128, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), (None, 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), (None, 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"), (None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), + ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)), # TODO: fix these (wrong results, either didn't respect decimal instruction or got wrong value) - ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), - # ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)), + # (None, 128, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), + # ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), ]) def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str, template_override: str | Tuple[str, str | None] | None): global server - # n_predict = 512 server.n_slots = 1 server.jinja = True server.n_ctx = 8192 * 2 @@ -379,10 +413,14 @@ def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str, elif isinstance(template_override, str): server.chat_template = 
template_override server.start(timeout_seconds=TIMEOUT_SERVER_START) - res = server.make_request("POST", "/chat/completions", data={ + do_test_calc_result(server, result_override, n_predict) + + +def do_test_calc_result(server: ServerProcess, result_override: str | None, n_predict: int, **kwargs): + res = server.make_request("POST", "/v1/chat/completions", data={ "max_tokens": n_predict, "messages": [ - {"role": "system", "content": "You are a chatbot that uses tools/functions. Dont overthink things, and provide very concise answers. Do not explain your reasoning to the user. Provide any numerical values back to the user with at most two decimals."}, + {"role": "system", "content": "You are a tools-calling assistant. You express numerical values with at most two decimals."}, {"role": "user", "content": "What's the y coordinate of a point on the unit sphere at angle 30 degrees?"}, { "role": "assistant", @@ -423,7 +461,8 @@ def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str, } } } - ] + ], + **kwargs, }, timeout=TIMEOUT_HTTP_REQUEST) assert res.status_code == 200, f"Expected status code 200, got {res.status_code}" choice = res.body["choices"][0] @@ -434,19 +473,19 @@ def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str, if result_override is not None: assert re.match(result_override, content), f'Expected {result_override}, got {content}' else: - assert re.match('^[\\s\\S]*?The (y[ -])?coordinate [\\s\\S]*?is (approximately )?0\\.56\\b|^0\\.56$', content), \ + assert re.match('^[\\s\\S]*?((That\'s|\\bis) (approximately )?)?\\b0\\.(5\\b|56\\b|556)', content), \ f'Expected something like "The y coordinate is 0.56.", got {content}' @pytest.mark.slow @pytest.mark.parametrize("n_predict,reasoning_format,expect_content,expect_reasoning_content,hf_repo,template_override", [ - (128, 'deepseek', "^The sum of 102 and 7 is 109.*", None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), - (128, None, "^The sum of 102 and 7 is 109.*", None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), + (128, 'deepseek', "^The sum of 102 and 7 is 109[\\s\\S]*", None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), + (128, None, "^The sum of 102 and 7 is 109[\\s\\S]*", None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), - (1024, 'deepseek', "To find the sum of.*", "I need to calculate the sum of 102 and 7.*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), - (1024, 'none', "^I need[\\s\\S]*?\n?To find.*", None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), + (1024, 'deepseek', "To find the sum of[\\s\\S]*", "I need to calculate the sum of 102 and 7[\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), + (1024, 'none', "^(\\s*)?I need[\\s\\S]*?\\s*To find[\\s\\S]*", None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), - (1024, 'deepseek', "To find the sum of.*", "First, I [\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)), + (1024, 'deepseek', "To find the sum of[\\s\\S]*", "First, I [\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)), ]) def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none'] | None, expect_content: str | None, expect_reasoning_content: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None): global server @@ -464,7 +503,7 @@ def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none'] elif 
isinstance(template_override, str): server.chat_template = template_override server.start(timeout_seconds=TIMEOUT_SERVER_START) - res = server.make_request("POST", "/chat/completions", data={ + res = server.make_request("POST", "/v1/chat/completions", data={ "max_tokens": n_predict, "messages": [ {"role": "user", "content": "What's the sum of 102 and 7?"}, @@ -476,7 +515,7 @@ def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none'] content = choice["message"].get("content") if expect_content is None: - assert content is None, f'Expected no content in {choice["message"]}' + assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}' else: assert re.match(expect_content, content), f'Expected {expect_content}, got {content}' @@ -488,46 +527,46 @@ def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none'] @pytest.mark.slow -@pytest.mark.parametrize("expected_arguments_override,hf_repo,template_override", [ - (None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), - # (None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", "chatml"), +@pytest.mark.parametrize("hf_repo,template_override", [ + ("bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), - (None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), - (None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), + ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), + ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), - (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)), - (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"), + ("bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)), + ("bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"), - ('{"code":"print("}', "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), - (None, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"), + # ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), + ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"), - (None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)), - (None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"), + ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)), + ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", None), - ('{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)), - (None, "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"), + ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)), + ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", None), - (None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), - (None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), + ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), + ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), - (None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), - (None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"), + ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), + ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"), - (None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")), - (None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"), + 
("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")), + ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"), - (None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), - (None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"), + ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), + ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"), - # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it. - (None, "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), + ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), + ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M", "chatml"), ]) -def test_hello_world(expected_arguments_override: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None): +def test_hello_world(hf_repo: str, template_override: str | Tuple[str, str | None] | None): global server + n_predict = 512 # High because of DeepSeek R1 server.n_slots = 1 server.jinja = True server.n_ctx = 8192 - server.n_predict = 512 # High because of DeepSeek R1 + server.n_predict = n_predict server.model_hf_repo = hf_repo server.model_hf_file = None if isinstance(template_override, tuple): @@ -537,31 +576,28 @@ def test_hello_world(expected_arguments_override: str | None, hf_repo: str, temp elif isinstance(template_override, str): server.chat_template = template_override server.start(timeout_seconds=TIMEOUT_SERVER_START) - res = server.make_request("POST", "/chat/completions", data={ - "max_tokens": 256, + + do_test_hello_world(server, max_tokens=n_predict) + + +def do_test_hello_world(server: ServerProcess, **kwargs): + res = server.make_request("POST", "/v1/chat/completions", data={ "messages": [ - {"role": "system", "content": "You are a coding assistant."}, + {"role": "system", "content": "You are a tool-calling agent."}, {"role": "user", "content": "say hello world with python"}, ], "tools": [PYTHON_TOOL], - # Note: without these greedy params, Functionary v3.2 writes `def hello_world():\n print("Hello, World!")\nhello_world()` which is correct but a pain to test. - "temperature": 0.0, - "top_k": 1, - "top_p": 1.0, + **kwargs, }, timeout=TIMEOUT_HTTP_REQUEST) assert res.status_code == 200, f"Expected status code 200, got {res.status_code}" choice = res.body["choices"][0] tool_calls = choice["message"].get("tool_calls") assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}' tool_call = tool_calls[0] - assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}' + # assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}' assert tool_call["function"]["name"] == PYTHON_TOOL["function"]["name"] - actual_arguments = tool_call["function"]["arguments"] - if expected_arguments_override is not None: - assert actual_arguments == expected_arguments_override - else: - actual_arguments = json.loads(actual_arguments) - assert 'code' in actual_arguments, f"code not found in {json.dumps(actual_arguments)}" - code = actual_arguments["code"] - assert isinstance(code, str), f"Expected code to be a string, got {type(code)}: {json.dumps(code)}" - assert re.match(r'''print\(("[Hh]ello,? [Ww]orld!?"|'[Hh]ello,? 
[Ww]orld!?')\)''', code), f'Expected hello world, got {code}' + actual_arguments = json.loads(tool_call["function"]["arguments"]) + assert 'code' in actual_arguments, f"code not found in {json.dumps(actual_arguments)}" + code = actual_arguments["code"] + assert isinstance(code, str), f"Expected code to be a string, got {type(code)}: {json.dumps(code)}" + assert re.match(r'''print\(("[Hh]ello,? [Ww]orld!?"|'[Hh]ello,? [Ww]orld!?')\)''', code), f'Expected hello world, got {code}' diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py index f32a439f6..ec2d8ec55 100644 --- a/examples/server/tests/utils.py +++ b/examples/server/tests/utils.py @@ -67,6 +67,9 @@ class ServerProcess: id_slot: int | None = None cache_prompt: bool | None = None n_slots: int | None = None + ctk: str | None = None + ctv: str | None = None + fa: bool | None = None server_continuous_batching: bool | None = False server_embeddings: bool | None = False server_reranking: bool | None = False @@ -84,6 +87,7 @@ class ServerProcess: reasoning_format: Literal['deepseek', 'none'] | None = None chat_template: str | None = None chat_template_file: str | None = None + server_path: str | None = None # session variables process: subprocess.Popen | None = None @@ -97,7 +101,9 @@ class ServerProcess: self.server_port = int(os.environ["PORT"]) def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None: - if "LLAMA_SERVER_BIN_PATH" in os.environ: + if self.server_path is not None: + server_path = self.server_path + elif "LLAMA_SERVER_BIN_PATH" in os.environ: server_path = os.environ["LLAMA_SERVER_BIN_PATH"] elif os.name == "nt": server_path = "../../../build/bin/Release/llama-server.exe" @@ -151,6 +157,12 @@ class ServerProcess: server_args.extend(["--ctx-size", self.n_ctx]) if self.n_slots: server_args.extend(["--parallel", self.n_slots]) + if self.ctk: + server_args.extend(["-ctk", self.ctk]) + if self.ctv: + server_args.extend(["-ctv", self.ctv]) + if self.fa is not None: + server_args.append("-fa") if self.n_predict: server_args.extend(["--n-predict", self.n_predict]) if self.slot_save_path: @@ -184,7 +196,7 @@ class ServerProcess: server_args.extend(["--chat-template-file", self.chat_template_file]) args = [str(arg) for arg in [server_path, *server_args]] - print(f"bench: starting server with: {' '.join(args)}") + print(f"tests: starting server with: {' '.join(args)}") flags = 0 if "nt" == os.name: @@ -215,6 +227,10 @@ class ServerProcess: return # server is ready except Exception as e: pass + # Check if process died + if self.process.poll() is not None: + raise RuntimeError(f"Server process died with return code {self.process.returncode}") + print(f"Waiting for server to start...") time.sleep(0.5) raise TimeoutError(f"Server did not start within {timeout_seconds} seconds") diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 144d914c2..393e3927c 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -607,6 +607,7 @@ static json oaicompat_completion_params_parse( inputs.use_jinja = use_jinja; inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false); inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE; + inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true); if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) { throw std::runtime_error("Cannot use custom grammar constraints with tools."); } @@ -620,10 +621,7 @@ static json 
oaicompat_completion_params_parse( llama_params["grammar_lazy"] = chat_params.grammar_lazy; auto grammar_triggers = json::array(); for (const auto & trigger : chat_params.grammar_triggers) { - grammar_triggers.push_back({ - {"word", trigger.word}, - {"at_start", trigger.at_start}, - }); + grammar_triggers.push_back(trigger.to_json()); } llama_params["grammar_triggers"] = grammar_triggers; llama_params["preserved_tokens"] = chat_params.preserved_tokens; diff --git a/include/llama.h b/include/llama.h index ee6e73915..d62792c0a 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1205,17 +1205,29 @@ extern "C" { const char * grammar_str, const char * grammar_root); - /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639 - /// @param trigger_words A list of words that will trigger the grammar sampler. This may be updated to a loose regex syntax (w/ ^) in a near future. - /// @param trigger_tokens A list of tokens that will trigger the grammar sampler. - LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy( + DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy( const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root, const char ** trigger_words, size_t num_trigger_words, const llama_token * trigger_tokens, - size_t num_trigger_tokens); + size_t num_trigger_tokens), + "use llama_sampler_init_grammar_lazy_patterns instead"); + + + /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639 + /// @param trigger_patterns A list of patterns that will trigger the grammar sampler. Pattern will be matched from the start of the generation output, and grammar sampler will be fed content starting from its first match group. + /// @param trigger_tokens A list of tokens that will trigger the grammar sampler. Grammar sampler will be fed content starting from the trigger token included. + LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy_patterns( + const struct llama_vocab * vocab, + const char * grammar_str, + const char * grammar_root, + const char ** trigger_patterns, + size_t num_trigger_patterns, + const llama_token * trigger_tokens, + size_t num_trigger_tokens); + /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first. LLAMA_API struct llama_sampler * llama_sampler_init_penalties( diff --git a/models/templates/README.md b/models/templates/README.md index 72c30d1e1..e4fd104fc 100644 --- a/models/templates/README.md +++ b/models/templates/README.md @@ -9,7 +9,7 @@ These templates can be updated with the following commands: ./scripts/get_chat_template.py deepseek-ai/DeepSeek-R1-Distill-Qwen-32B > models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja ./scripts/get_chat_template.py fireworks-ai/llama-3-firefunction-v2 > models/templates/fireworks-ai-llama-3-firefunction-v2.jinja ./scripts/get_chat_template.py google/gemma-2-2b-it > models/templates/google-gemma-2-2b-it.jinja -./scripts/get_chat_template.py meetkai/functionary-medium-v3. 
> models/templates/meetkai-functionary-medium-v3.jinja +./scripts/get_chat_template.py meetkai/functionary-medium-v3.1 > models/templates/meetkai-functionary-medium-v3.1.jinja ./scripts/get_chat_template.py meetkai/functionary-medium-v3.2 > models/templates/meetkai-functionary-medium-v3.2.jinja ./scripts/get_chat_template.py meta-llama/Llama-3.1-8B-Instruct > models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja ./scripts/get_chat_template.py meta-llama/Llama-3.2-3B-Instruct > models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja diff --git a/requirements.txt b/requirements.txt index 9e190ae27..f2a18d628 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,4 @@ -r ./requirements/requirements-convert_hf_to_gguf_update.txt -r ./requirements/requirements-convert_llama_ggml_to_gguf.txt -r ./requirements/requirements-convert_lora_to_gguf.txt +-r ./requirements/requirements-tool_bench.txt diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 94de59d7e..439db8886 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -10,3 +10,4 @@ -r ./requirements-convert_hf_to_gguf_update.txt -r ./requirements-convert_legacy_llama.txt -r ./requirements-convert_llama_ggml_to_gguf.txt +-r ./requirements-tool_bench.txt diff --git a/requirements/requirements-tool_bench.txt b/requirements/requirements-tool_bench.txt new file mode 100644 index 000000000..b94521fc7 --- /dev/null +++ b/requirements/requirements-tool_bench.txt @@ -0,0 +1,12 @@ +aiohttp~=3.9.3 +pytest~=8.3.3 +huggingface_hub~=0.23.2 +matplotlib~=3.10.0 +numpy~=1.26.4 +openai~=1.55.3 +pandas~=2.2.3 +prometheus-client~=0.20.0 +requests~=2.32.3 +wget~=3.2 +typer~=0.15.1 +seaborn~=0.13.2 diff --git a/scripts/fetch_server_test_models.py b/scripts/fetch_server_test_models.py index 05690b138..e6775bfc5 100755 --- a/scripts/fetch_server_test_models.py +++ b/scripts/fetch_server_test_models.py @@ -75,7 +75,7 @@ if __name__ == '__main__': logging.info(f' - {m.hf_repo} / {m.hf_file}') cli_path = os.environ.get( - 'LLAMA_SERVER_BIN_PATH', + 'LLAMA_CLI_BIN_PATH', os.path.join( os.path.dirname(__file__), '../build/bin/Release/llama-cli.exe' if os.name == 'nt' else '../build/bin/llama-cli')) diff --git a/scripts/tool_bench.py b/scripts/tool_bench.py new file mode 100755 index 000000000..0f406bc42 --- /dev/null +++ b/scripts/tool_bench.py @@ -0,0 +1,368 @@ +#!/usr/bin/env uv run +''' + Simplistic tool call benchmarks for llama-server and ollama. + + Essentially runs the tests at server/examples/server/tests/unit/test_tool_call.py N times, at different temperatures and on different backends (current llama-server, baseline llama-server and ollama), + and plots the results of multiple runs (from same .jsonl file or multiple ones) as a success rate heatmap. 
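
(Judging by the plot() options further below, the plot subcommand should also accept --test-regex and --server-regex filters to narrow which tests and servers appear in the heatmap; the flag spellings assume typer's default option naming.)
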
+ + Simple usage example: + + cmake -B build -DLLAMA_CURL=1 && cmake --build build --config Release -j -t llama-server + + export LLAMA_SERVER_BIN_PATH=$PWD/build/bin/llama-server + export LLAMA_CACHE=${LLAMA_CACHE:-$HOME/Library/Caches/llama.cpp} + + ./scripts/tool_bench.py run --n 30 --temp -1 --temp 0 --temp 1 --model "Qwen 2.5 1.5B Q4_K_M" --output qwen1.5b.jsonl --hf bartowski/Qwen2.5-1.5B-Instruct-GGUF --ollama qwen2.5:1.5b-instruct-q4_K_M + ./scripts/tool_bench.py run --n 30 --temp -1 --temp 0 --temp 1 --model "Qwen 2.5 Coder 7B Q4_K_M" --output qwenc7b.jsonl --hf bartowski/Qwen2.5-Coder-7B-Instruct-GGUF --ollama qwen2.5-coder:7b + + ./scripts/tool_bench.py plot *.jsonl # Opens window w/ heatmap + ./scripts/tool_bench.py plot qwen*.jsonl --output qwen.png # Saves heatmap to qwen.png + + (please see ./scripts/tool_bench.sh for a more complete example) +''' +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "pytest", +# "pandas", +# "matplotlib", +# "seaborn", +# "requests", +# "wget", +# "typer", +# ] +# /// +from contextlib import contextmanager +from pathlib import Path +import re +from statistics import mean, median +from typing import Annotated, Dict, List, Optional, Tuple +import atexit +import json +import logging +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns +import subprocess +import sys +import time +import typer + +sys.path.insert(0, Path(__file__).parent.parent.as_posix()) +if True: + from examples.server.tests.utils import ServerProcess + from examples.server.tests.unit.test_tool_call import TIMEOUT_SERVER_START, do_test_calc_result, do_test_hello_world, do_test_weather + + +@contextmanager +def scoped_server(sp: ServerProcess): + def stop(): + nonlocal sp + if sp is not None: + sp.stop() + sp = None # type: ignore + atexit.register(stop) + yield sp + stop() + + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + +app = typer.Typer() + + +@app.command() +def plot(files: List[Path], output: Optional[Path] = None, test_regex: Optional[str] = None, server_regex: Optional[str] = None): + + lines: List[Dict] = [] + for file in files: + if not file.exists(): + logger.error(f"File not found: {file}") + continue + + try: + with file.open() as f: + raw_data = f.read() + logger.info(f"Reading {file} ({len(raw_data)} bytes)") + + for line_num, line in enumerate(raw_data.split('\n'), 1): + line = line.strip() + if not line: + continue + try: + record = json.loads(line) + lines.append(record) + except json.JSONDecodeError as e: + logger.warning(f"Invalid JSON at {file}:{line_num} - {e}") + except Exception as e: + logger.error(f"Error processing {file}: {e}") + + if not lines: + raise Exception("No valid data was loaded") + + data_dict: Dict[Tuple, float] = {} + models: List[str] = [] + temps = set() + tests = set() + server_names = set() + total_counts = set() + for rec in lines: + try: + model = rec["model"] + temp = rec["temp"] + server_name = rec["server_name"] + test = rec["test"] + success = rec["success_ratio"] + success_count = rec["success_count"] + failure_count = rec["failure_count"] + total_count = success_count + failure_count + total_counts.add(total_count) + + if test_regex and not re.search(test_regex, test): + continue + + if server_regex and not re.search(server_regex, server_name): + continue + + data_dict[(model, temp, server_name, test)] = success + + if model not in models: + models.append(model) + 
temps.add(temp) + tests.add(test) + server_names.add(server_name) + + except KeyError as e: + logger.warning(f"Missing required field in record: {e}") + + if len(total_counts) > 1: + logger.warning(f"Total counts are not consistent: {total_counts}") + + # Sort the collected values + temps = list(sorted(temps, key=lambda x: x if x is not None else -1)) + tests = list(sorted(tests)) + server_names = list(sorted(server_names)) + + logger.info(f"Processed {len(lines)} lines") + logger.info(f"Found {len(data_dict)} valid data points") + logger.info(f"Models: {models}") + logger.info(f"Temperatures: {temps}") + logger.info(f"Tests: {tests}") + logger.info(f"Servers: {server_names}") + + matrix: list[list[float]] = [] + index: list[str] = [] + + all_cols = [ + (server_name, test) + for server_name in server_names + for test in tests + ] + for model in models: + for temp in temps: + index.append(f"{model} @ {temp}") + row_vals = [ + data_dict.get((model, temp, server_name, test), np.nan) + for server_name, test in all_cols + ] + matrix.append(row_vals) + + columns: list[str] = [f"{server_name}\n{test}" for server_name, test in all_cols] + + df = pd.DataFrame(matrix, index=np.array(index), columns=np.array(columns)) + + plt.figure(figsize=(12, 6)) + + sns.heatmap( + df, annot=True, cmap="RdYlGn", vmin=0.0, vmax=1.0, cbar=True, fmt=".2f", center=0.5, square=True, linewidths=0.5, + cbar_kws={"label": "Success Ratio"}, + ) + + plt.title(f"Tool Call Bench (n = {str(min(total_counts)) if len(total_counts) == 1 else f'{min(total_counts)}-{max(total_counts)}'})\nSuccess Ratios by Server & Test", pad=20) + plt.xlabel("Server & Test", labelpad=10) + plt.ylabel("Model @ Temperature", labelpad=10) + + plt.xticks(rotation=45, ha='right') + plt.yticks(rotation=0) + + plt.tight_layout() + + if output: + plt.savefig(output, dpi=300, bbox_inches='tight') + logger.info(f"Plot saved to {output}") + else: + plt.show() + + +@app.command() +def run( + output: Annotated[Path, typer.Option(help="Output JSON file")], + model: Annotated[Optional[str], typer.Option(help="Name of the model to test (server agnostic)")] = None, + hf: Annotated[Optional[str], typer.Option(help="GGUF huggingface model repo id (+ optional quant) to test w/ llama-server")] = None, + chat_template: Annotated[Optional[str], typer.Option(help="Chat template override for llama-server")] = None, + ollama: Annotated[Optional[str], typer.Option(help="Ollama model tag to test")] = None, + llama_baseline: Annotated[Optional[str], typer.Option(help="llama-server baseline binary path to use as baseline")] = None, + n: Annotated[int, typer.Option(help="Number of times to run each test")] = 10, + temp: Annotated[Optional[List[float]], typer.Option(help="Set of temperatures to test")] = None, + top_p: Annotated[Optional[float], typer.Option(help="top_p")] = None, + top_k: Annotated[Optional[int], typer.Option(help="top_k")] = None, + ctk: Annotated[Optional[str], typer.Option(help="ctk")] = None, + ctv: Annotated[Optional[str], typer.Option(help="ctv")] = None, + fa: Annotated[Optional[bool], typer.Option(help="fa")] = None, + seed: Annotated[Optional[int], typer.Option(help="Random seed")] = None, + port: Annotated[int, typer.Option(help="llama-server port")] = 8084, + force: Annotated[bool, typer.Option(help="Force overwrite of output file")] = False, + append: Annotated[bool, typer.Option(help="Append to output file")] = False, + + test_hello_world: Annotated[bool, typer.Option(help="Whether to run the hello world test")] = True, + test_weather: 
Annotated[bool, typer.Option(help="Whether to run the weather test")] = True, + test_calc_result: Annotated[bool, typer.Option(help="Whether to run the calc result test")] = False, +): + # Check only one of output and append + + n_predict = 512 # High because of DeepSeek R1 + # n_ctx = 8192 + n_ctx = 2048 + + assert force or append or not output.exists(), f"Output file already exists: {output}; use --force to overwrite" + + with output.open('a' if append else 'w') as output_file: + + def run(server: ServerProcess, *, server_name: str, model_id: str, temp: Optional[float] = None, output_kwargs={}, request_kwargs={}): + request_kwargs = {**request_kwargs} + if temp is not None: + request_kwargs['temperature'] = temp + if top_p is not None: + request_kwargs['top_p'] = top_p + if top_k is not None: + request_kwargs['top_k'] = top_k + if seed is not None: + request_kwargs['seed'] = seed + + request_kwargs['cache_prompt'] = False + + tests = {} + if test_hello_world: + tests["hello world"] = lambda server: do_test_hello_world(server, **request_kwargs) + if test_weather: + tests["weather"] = lambda server: do_test_weather(server, **request_kwargs) + if test_calc_result: + tests["calc result"] = lambda server: do_test_calc_result(server, None, 512, **request_kwargs) + + for test_name, test in tests.items(): + success_count = 0 + failure_count = 0 + failures = [] + success_times = [] + failure_times = [] + logger.info(f"Running {test_name} ({server_name}, {model}): ") + for i in range(n): + start_time = time.time() + + def elapsed(): + return time.time() - start_time + + try: + test(server) + success_times.append(elapsed()) + success_count += 1 + logger.info('success') + except Exception as e: + logger.error(f'failure: {e}') + failure_count += 1 + failure_times.append(elapsed()) + failures.append(str(e)) + # import traceback + # traceback.print_exc() + output_file.write(json.dumps({**output_kwargs, **dict( + model=model, + server_name=server_name, + model_id=model_id, + test=test_name, + temp=t, + top_p=top_p, + top_k=top_k, + ctk=ctk, + ctv=ctv, + seed=seed, + success_ratio=float(success_count) / n, + avg_time=mean(success_times + failure_times), + median_time=median(success_times + failure_times), + success_count=success_count, + success_times=success_times, + failure_count=failure_count, + failure_times=failure_times, + failures=list(set(failures)), + )}) + '\n') + output_file.flush() + + for t in [None] if temp is None else [t if t >= 0 else None for t in temp]: + if hf is not None: + + servers: list[Tuple[str, Optional[str]]] = [('llama-server', None)] + if llama_baseline is not None: + servers.append(('llama-server (baseline)', llama_baseline)) + + for server_name, server_path in servers: + server = ServerProcess() + server.n_ctx = n_ctx + server.n_slots = 1 + server.jinja = True + server.ctk = ctk + server.ctv = ctv + server.fa = fa + server.n_predict = n_predict + server.model_hf_repo = hf + server.model_hf_file = None + server.chat_template = chat_template + server.server_path = server_path + if port is not None: + server.server_port = port + # server.debug = True + + with scoped_server(server): + server.start(timeout_seconds=TIMEOUT_SERVER_START) + for ignore_chat_grammar in [False]: + run( + server, + server_name=server_name, + model_id=hf, + temp=t, + output_kwargs=dict( + chat_template=chat_template, + ), + request_kwargs=dict( + ignore_chat_grammar=ignore_chat_grammar, + ), + ) + + if ollama is not None: + server = ServerProcess() + server.server_port = 11434 + server.server_host = 
"localhost" + subprocess.check_call(["ollama", "pull", ollama]) + + with scoped_server(server): + run( + server, + server_name="ollama", + model_id=ollama, + temp=t, + output_kwargs=dict( + chat_template=None, + ), + request_kwargs=dict( + model=ollama, + max_tokens=n_predict, + num_ctx = n_ctx, + ), + ) + + +if __name__ == "__main__": + app() diff --git a/scripts/tool_bench.sh b/scripts/tool_bench.sh new file mode 100755 index 000000000..6c7616a88 --- /dev/null +++ b/scripts/tool_bench.sh @@ -0,0 +1,66 @@ +#!/bin/bash +set -euo pipefail + +cmake --build build -j + +export LLAMA_CACHE=${LLAMA_CACHE:-$HOME/Library/Caches/llama.cpp} +export LLAMA_SERVER_BIN_PATH=$PWD/build/bin/llama-server + +if [ ! -x "$LLAMA_SERVER_BIN_PATH" ]; then + echo "Could not find llama-server binary at $LLAMA_SERVER_BIN_PATH" + exit 1 +fi +if [ ! -d "$LLAMA_CACHE" ]; then + echo "Could not find llama cache at $LLAMA_CACHE, please set LLAMA_CACHE explicitly." + exit 1 +fi + +export ARGS=( + --llama-baseline="$(which llama-server)" + --n 30 + --temp -1 # Leaves temperature parameter unset (use the server's default, e.g. 0.6 for ollama) + --temp 0 + --temp 0.5 + --temp 0.75 + --temp 1 + --temp 1.5 + --temp 2 + --temp 5 + "$@" +) + +./scripts/tool_bench.py run ${ARGS[@]} --model "Qwen 2.5 Coder 0.5B Q4_K_M" --output ../qwenc0.5b.jsonl --hf bartowski/Qwen2.5-Coder-0.5B-Instruct-GGUF:Q4_K_M --ollama qwen2.5-coder:0.5b-instruct-q4_K_M +./scripts/tool_bench.py run ${ARGS[@]} --model "Qwen 2.5 Coder 1.5B Q4_K_M" --output ../qwenc1.5b.jsonl --hf bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF:Q4_K_M --ollama qwen2.5-coder:1.5b-instruct-q4_K_M +./scripts/tool_bench.py run ${ARGS[@]} --model "Qwen 2.5 Coder 3B Q4_K_M" --output ../qwenc3b.jsonl --hf bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M --ollama qwen2.5-coder:3b-instruct-q4_K_M +./scripts/tool_bench.py run ${ARGS[@]} --model "Qwen 2.5 Coder 7B Q4_K_M" --output ../qwenc7b.jsonl --hf bartowski/Qwen2.5-Coder-7B-Instruct-GGUF:Q4_K_M --ollama qwen2.5-coder:7b-instruct-q4_K_M +./scripts/tool_bench.py run ${ARGS[@]} --model "Qwen 2.5 Coder 32B Q4_K_M" --output ../qwenc32b.jsonl --hf bartowski/Qwen2.5-Coder-32B-Instruct-GGUF:Q4_K_M --ollama qwen2.5-coder:32B-instruct-q4_K_M +./scripts/tool_bench.py run ${ARGS[@]} --model "Qwen 2.5 1.5B Q4_K_M" --output ../qwen1.5b.jsonl --hf bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M --ollama qwen2.5:1.5b-instruct-q4_K_M +./scripts/tool_bench.py run ${ARGS[@]} --model "Qwen 2.5 3B Q4_K_M" --output ../qwen3b.jsonl --hf bartowski/Qwen2.5-3B-Instruct-GGUF:Q4_K_M --ollama qwen2.5:3b-instruct-q4_K_M +./scripts/tool_bench.py run ${ARGS[@]} --model "Qwen 2.5 7B Q4_K_M" --output ../qwen7b.jsonl --hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M --ollama qwen2.5:7b-instruct-q4_K_M + +./scripts/tool_bench.py run ${ARGS[@]} --model "Llama 3.2 Instruct 1B Q4_K_M" --output ../llama1b.jsonl --hf bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M --ollama llama3.2:1b-instruct-q4_K_M +./scripts/tool_bench.py run ${ARGS[@]} --model "Llama 3.2 Instruct 3B Q4_K_M" --output ../llama3b.jsonl --hf bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M --ollama llama3.2:3b-instruct-q4_K_M +./scripts/tool_bench.py run ${ARGS[@]} --model "Llama 3.1 Instruct 8B Q4_K_M" --output ../llama8b.jsonl --hf bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M --ollama llama3.1:8b-instruct-q4_K_M +./scripts/tool_bench.py run ${ARGS[@]} --model "Llama 3.3 70B Q4_K_M" --output ../llama70b.jsonl --hf bartowski/Llama-3.3-70B-Instruct-GGUF:Q4_K_M + +./scripts/tool_bench.py run ${ARGS[@]} --model "Mistral Nemo 
Q4_K_M" --output ../nemo.jsonl --hf bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M --ollama mistral-nemo:12b-instruct-2407-q4_K_M + +./scripts/tool_bench.py run ${ARGS[@]} --model "Hermes 3 Llama 3.1 8B Q4_K_M" --output ../hermes3.jsonl --hf bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M --ollama hermes3:8b-llama3.1-q4_K_M --chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use ) +./scripts/tool_bench.py run ${ARGS[@]} --model "Hermes 2 Pro Llama 3 8B Q4_K_M" --output ../hermes2.jsonl --hf bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M --ollama hermes2:8b-llama3-q4_K_M --chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-2-Pro-Llama-3-8B tool_use ) + +./scripts/tool_bench.py run ${ARGS[@]} --model "Functionary Small V3.2 Q4_K_M" --output ../funct3.2.jsonl --hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M +./scripts/tool_bench.py run ${ARGS[@]} --model "FireFunction V2 IQ1_M" --output ../firef2.jsonl --hf bartowski/firefunction-v2-GGUF:IQ1_M --chat-template-file <( python scripts/get_chat_template.py fireworks-ai/llama-3-firefunction-v2 tool_use ) + +./scripts/tool_bench.py run ${ARGS[@]} --model "Command R7B 12-2024 Q6_K_L" --output ../c4ai.jsonl --hf bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L --chat-template-file <( python scripts/get_chat_template.py CohereForAI/c4ai-command-r7b-12-2024 tool_use ) + +./scripts/tool_bench.py run ${ARGS[@]} --model "Gemma 2 2B Q8_0" --output ../gemma2.jsonl --hf bartowski/gemma-2-2b-it-GGUF:Q8_0 +./scripts/tool_bench.py run ${ARGS[@]} --model "Phi 4 Instruct Q4_K_M" --output ../phi4.jsonl --hf bartowski/phi-4-GGUF:Q4_K_M # --ollama phi4 +./scripts/tool_bench.py run ${ARGS[@]} --model "Phi 3.5 Mini Instruct Q4_K_M" --output ../phi3.5.jsonl --hf bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M # --ollama phi3.5:3.8b-mini-instruct-q4_K_M + +# ./scripts/tool_bench.py run ${ARGS[@]} --model "DeepSeek R1 Distill Qwen 7B Q6_K_L" --output ../dsqw7.jsonl --hf bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q6_K_L --chat-template-file <( python scripts/get_chat_template.py NousResearch/DeepSeek-R1-Distill-Qwen-7B tool_use ) +# ./scripts/tool_bench.py run ${ARGS[@]} --model "DeepSeek R1 Distill Qwen 32B Q4_K_M" --output ../dsqw32.jsonl --hf bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M --chat-template-file <( python scripts/get_chat_template.py NousResearch/DeepSeek-R1-Distill-Qwen-32B tool_use ) + + +for f in ../*.jsonl; do + ./scripts/tool_bench.py plot "$f" --output ${f%.jsonl}.png || true +done diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp index 98af1ba39..973b47ae0 100644 --- a/src/llama-grammar.cpp +++ b/src/llama-grammar.cpp @@ -969,7 +969,7 @@ struct llama_grammar * llama_grammar_init_impl( /* .awaiting_trigger = */ false, /* .trigger_buffer = */ "", /* .trigger_tokens = */ {}, - /* .trigger_words = */ {}, + /* .trigger_patterns = */ {}, }; } @@ -978,19 +978,15 @@ struct llama_grammar * llama_grammar_init_impl( const char * grammar_str, const char * grammar_root, bool lazy, - const char ** trigger_words, - size_t num_trigger_words, + const char ** trigger_patterns, + size_t num_trigger_patterns, const llama_token * trigger_tokens, size_t num_trigger_tokens) { llama_grammar_parser parser; // if there is a grammar, parse it - if (!parser.parse(grammar_str)) { - return nullptr; - } - - // will be empty (default) if there are parse errors - if (parser.rules.empty()) { + // rules will be empty (default) if there are parse errors + if (!parser.parse(grammar_str) || 
parser.rules.empty()) { fprintf(stderr, "%s: failed to parse grammar\n", __func__); return nullptr; } @@ -1054,14 +1050,16 @@ struct llama_grammar * llama_grammar_init_impl( } while (true); std::vector vec_trigger_tokens; - std::vector vec_trigger_words; + std::vector vec_trigger_patterns; for (size_t i = 0; i < num_trigger_tokens; i++) { GGML_ASSERT(trigger_tokens != nullptr); vec_trigger_tokens.push_back(trigger_tokens[i]); } - for (size_t i = 0; i < num_trigger_words; i++) { - GGML_ASSERT(trigger_words != nullptr); - vec_trigger_words.push_back(trigger_words[i]); + for (size_t i = 0; i < num_trigger_patterns; i++) { + GGML_ASSERT(trigger_patterns != nullptr); + auto & trigger = vec_trigger_patterns.emplace_back(); + trigger.pattern = trigger_patterns[i]; + trigger.regex = std::regex(trigger.pattern); } // Important: vec_rules has to be moved here, not copied, because stacks contains @@ -1076,7 +1074,7 @@ struct llama_grammar * llama_grammar_init_impl( /* .awaiting_trigger = */ lazy, /* .trigger_buffer = */ "", std::move(vec_trigger_tokens), - std::move(vec_trigger_words), + std::move(vec_trigger_patterns), }; } @@ -1089,7 +1087,7 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) { } struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) { - llama_grammar * result = new llama_grammar { + auto * result = new llama_grammar { grammar.vocab, grammar.rules, grammar.stacks, @@ -1098,7 +1096,7 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra grammar.awaiting_trigger, grammar.trigger_buffer, grammar.trigger_tokens, - grammar.trigger_words, + grammar.trigger_patterns, }; // redirect elements in stacks to point to new rules @@ -1173,16 +1171,18 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token LLAMA_LOG_DEBUG("Grammar triggered on token %u (`%s`)", token, piece.c_str()); return; } else { - // TODO: consider a smarter incremental substring search algorithm (store last position to search from). 
grammar.trigger_buffer += piece; - for (const auto & word : grammar.trigger_words) { - auto pos = grammar.trigger_buffer.find(word); - if (pos != std::string::npos) { + + std::smatch match; + for (const auto & trigger_pattern : grammar.trigger_patterns) { + if (std::regex_match(grammar.trigger_buffer, match, trigger_pattern.regex)) { grammar.awaiting_trigger = false; - auto constrained_str = grammar.trigger_buffer.substr(pos); + // get from the first match to the end of the string + auto constrained_str = grammar.trigger_buffer.substr(match.position(1)); + // std::string constrained_str(match[1].first, grammar.trigger_buffer.end()); grammar.trigger_buffer.clear(); llama_grammar_accept_str(grammar, constrained_str); - LLAMA_LOG_DEBUG("Grammar triggered on word `%s`", word.c_str()); + LLAMA_LOG_DEBUG("Grammar triggered on regex: '%s'\n", constrained_str.c_str()); return; } } diff --git a/src/llama-grammar.h b/src/llama-grammar.h index b143d834c..f8c291de9 100644 --- a/src/llama-grammar.h +++ b/src/llama-grammar.h @@ -3,6 +3,7 @@ #include "llama.h" #include +#include #include #include @@ -105,6 +106,11 @@ struct llama_grammar_parser { void print(FILE * file); }; +struct llama_grammar_trigger_pattern { + std::string pattern; + std::regex regex; +}; + struct llama_grammar { // note: allow null vocab for testing (not great) const llama_vocab * vocab; @@ -122,7 +128,10 @@ struct llama_grammar { bool awaiting_trigger = false; // Initialized to true for lazy grammars only std::string trigger_buffer; // Output buffered by lazy grammar. Will be cleared once trigger is found. std::vector trigger_tokens; // Tokens that trigger a lazy grammar, or tokens to force printing of (even if special). - std::vector trigger_words; + std::vector + trigger_patterns; // Regular expressions that trigger a lazy grammar. Must be a full match of the entire generated + // string, and the grammar will be given the string from the first match group onwards. 
+ }; // @@ -141,8 +150,8 @@ struct llama_grammar * llama_grammar_init_impl( const char * grammar_str, const char * grammar_root, bool lazy, - const char ** trigger_words, - size_t num_trigger_words, + const char ** trigger_patterns, + size_t num_trigger_patterns, const llama_token * trigger_tokens, size_t num_trigger_tokens); diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index f40bf2db8..c25977ca3 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1449,7 +1449,9 @@ static struct llama_sampler * llama_sampler_init_grammar_impl( const char ** trigger_words, size_t num_trigger_words, const llama_token * trigger_tokens, - size_t num_trigger_tokens); + size_t num_trigger_tokens, + const char ** trigger_patterns, + size_t num_trigger_patterns); static void llama_sampler_grammar_reset(struct llama_sampler * smpl) { auto * ctx = (llama_sampler_grammar *) smpl->ctx; @@ -1457,12 +1459,14 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) { return; } - std::vector trigger_words; - for (auto & word : ctx->grammar->trigger_words) { - trigger_words.push_back(word.c_str()); + std::vector trigger_patterns_c; + trigger_patterns_c.reserve(ctx->grammar->trigger_patterns.size()); + for (auto & trigger_pattern : ctx->grammar->trigger_patterns) { + trigger_patterns_c.push_back(trigger_pattern.pattern.c_str()); } + auto * grammar_new = llama_grammar_init_impl(ctx->grammar->vocab, ctx->grammar_str.c_str(), ctx->grammar_root.c_str(), - ctx->grammar->lazy, trigger_words.data(), trigger_words.size(), + ctx->grammar->lazy, trigger_patterns_c.data(), trigger_patterns_c.size(), ctx->grammar->trigger_tokens.data(), ctx->grammar->trigger_tokens.size()); llama_grammar_free_impl(ctx->grammar); @@ -1472,7 +1476,7 @@ static void llama_sampler_grammar_reset(struct llama_sampler * smpl) { static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sampler * smpl) { const auto * ctx = (const llama_sampler_grammar *) smpl->ctx; - auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0); + auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0, nullptr, 0); // copy the state { @@ -1516,15 +1520,33 @@ static struct llama_sampler * llama_sampler_init_grammar_impl( const char ** trigger_words, size_t num_trigger_words, const llama_token * trigger_tokens, - size_t num_trigger_tokens) { + size_t num_trigger_tokens, + const char ** trigger_patterns, + size_t num_trigger_patterns) { auto * ctx = new llama_sampler_grammar; if (grammar_str != nullptr && grammar_str[0] != '\0') { + // TODO: remove trigger_words support. 
+ if (trigger_words != nullptr && num_trigger_words > 0) { + GGML_ASSERT(trigger_patterns == nullptr && num_trigger_patterns == 0); + std::string trigger_pattern("[\\s\\S]*?("); + for (size_t i = 0; i < num_trigger_words; ++i) { + static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]"); + if (i > 0) { + trigger_pattern += "|"; + } + trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0"); + } + trigger_pattern += ")[\\s\\S]*"; + auto trigger_pattern_c = trigger_pattern.c_str(); + trigger_patterns = &trigger_pattern_c; + num_trigger_patterns = 1; + } *ctx = { /* .vocab = */ vocab, /* .grammar_str = */ grammar_str, /* .grammar_root = */ grammar_root, - /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens), + /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens), }; } else { *ctx = { @@ -1545,7 +1567,7 @@ struct llama_sampler * llama_sampler_init_grammar( const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root) { - return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0); + return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ false, nullptr, 0, nullptr, 0, nullptr, 0); } struct llama_sampler * llama_sampler_init_grammar_lazy( @@ -1556,7 +1578,18 @@ struct llama_sampler * llama_sampler_init_grammar_lazy( size_t num_trigger_words, const llama_token * trigger_tokens, size_t num_trigger_tokens) { - return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens); + return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, trigger_words, num_trigger_words, trigger_tokens, num_trigger_tokens, nullptr, 0); +} + +struct llama_sampler * llama_sampler_init_grammar_lazy_patterns( + const struct llama_vocab * vocab, + const char * grammar_str, + const char * grammar_root, + const char ** trigger_patterns, + size_t num_trigger_patterns, + const llama_token * trigger_tokens, + size_t num_trigger_tokens) { + return llama_sampler_init_grammar_impl(vocab, grammar_str, grammar_root, /* lazy= */ true, nullptr, 0, trigger_tokens, num_trigger_tokens, trigger_patterns, num_trigger_patterns); } // penalties diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index 643592305..35a307c63 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -237,12 +237,35 @@ static void test_templates(const struct common_chat_templates * tmpls, const std auto earliest_trigger_pos = std::string::npos; auto constrained = data.delta; for (const auto & trigger : data.params.grammar_triggers) { - auto pos = constrained.find(trigger.word); - if (pos == std::string::npos) { - continue; + size_t pos = std::string::npos; + std::smatch match; + switch (trigger.type) { + case COMMON_GRAMMAR_TRIGGER_TYPE_WORD: + { + const auto & word = trigger.value; + pos = constrained.find(word); + break; + } + case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN: + { + const auto & pattern = trigger.value; + if (std::regex_search(constrained, match, std::regex(pattern))) { + pos = match.position(); + } + break; + } + case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START: + { + const auto & pattern = trigger.value; + if (std::regex_search(constrained, match, std::regex(pattern)) && 
match.position() == 0) { + pos = 0; + } + break; + } + default: + throw std::runtime_error("Unknown trigger type"); } - if (pos > 0 && trigger.at_start) { - fprintf(stderr, "Trigger %s not at start of message, skipping:\n\n%s\n\n", trigger.word.c_str(), constrained.c_str()); + if (pos == std::string::npos) { continue; } if (earliest_trigger_pos == std::string::npos || pos < earliest_trigger_pos) { @@ -260,7 +283,8 @@ static void test_templates(const struct common_chat_templates * tmpls, const std if (grammar_triggered && test_grammar_if_triggered && !match_string(constrained, grammar.get())) { throw std::runtime_error("Failed to match delta against grammar:\n\n" + data.delta + - "\n\nGrammar: " + data.params.grammar); + "\n\nConstrained: " + constrained + + "\n\nGrammar: " + data.params.grammar); } } } @@ -640,6 +664,93 @@ static void test_template_output_parsers() { inputs_tools) .format); + // Test parsing + assert_msg_equals(message_assist_call, common_chat_parse( + "\n" + "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "", + COMMON_CHAT_FORMAT_HERMES_2_PRO)); + assert_msg_equals(message_assist_call, common_chat_parse( + "{\"arg1\": 1}", + COMMON_CHAT_FORMAT_HERMES_2_PRO)); + assert_msg_equals(message_assist_call, common_chat_parse( + "\n" + "{\"arg1\": 1}\n" + "", + COMMON_CHAT_FORMAT_HERMES_2_PRO)); + assert_msg_equals(message_assist_call, common_chat_parse( + "\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "", + COMMON_CHAT_FORMAT_HERMES_2_PRO)); + assert_msg_equals(message_assist_call, common_chat_parse( + "\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "", + COMMON_CHAT_FORMAT_HERMES_2_PRO)); + assert_msg_equals(message_assist_call, common_chat_parse( + "\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "", + COMMON_CHAT_FORMAT_HERMES_2_PRO)); + assert_msg_equals(message_assist_call, common_chat_parse( + "```xml\n" + "\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "\n" + "```", + COMMON_CHAT_FORMAT_HERMES_2_PRO)); + assert_msg_equals(message_assist_call, common_chat_parse( + "```xml\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "```", + COMMON_CHAT_FORMAT_HERMES_2_PRO)); + assert_msg_equals(message_assist_call, common_chat_parse( + "```\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "```", + COMMON_CHAT_FORMAT_HERMES_2_PRO)); + assert_msg_equals(message_assist_call, common_chat_parse( + "```\n" + "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "```", + COMMON_CHAT_FORMAT_HERMES_2_PRO)); + assert_msg_equals(message_assist_call, common_chat_parse( + "```json\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "```", + COMMON_CHAT_FORMAT_HERMES_2_PRO)); + assert_msg_equals(message_assist_call, common_chat_parse( + "```json\n" + "\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}} \n" + " \n" + "``` ", + COMMON_CHAT_FORMAT_HERMES_2_PRO)); + assert_msg_equals(message_assist_call, common_chat_parse( + "\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" + "", + COMMON_CHAT_FORMAT_HERMES_2_PRO)); + assert_msg_equals(message_assist_call, common_chat_parse( + "\n" + " {\n" + " \"name\": \"special_function\", \"arguments\": {\"arg1\": 1}\n" + " }\n" + "", + COMMON_CHAT_FORMAT_HERMES_2_PRO)); + assert_msg_equals(message_assist_call, common_chat_parse( + "\n" + " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 
1}}\n" + "", + COMMON_CHAT_FORMAT_HERMES_2_PRO)); + assert_msg_equals(message_assist_call, common_chat_parse( + "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", + COMMON_CHAT_FORMAT_HERMES_2_PRO)); + assert_msg_equals(message_assist_call, common_chat_parse( + "{\n \"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", + COMMON_CHAT_FORMAT_HERMES_2_PRO)); + test_templates(tmpls.get(), end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); test_templates(tmpls.get(), end_tokens, message_assist_call, tools, "\n" @@ -789,7 +900,7 @@ static void test_template_output_parsers() { } int main(int argc, char ** argv) { - try { + // try { #ifndef _WIN32 if (argc > 1) { common_chat_templates_inputs inputs; @@ -827,8 +938,8 @@ int main(int argc, char ** argv) { std::cout << "\n[chat] All tests passed!" << '\n'; } return 0; - } catch (const std::exception & e) { - std::cerr << "Error: " << e.what() << '\n'; - return 1; - } + // } catch (const std::exception & e) { + // std::cerr << "Error: " << e.what() << '\n'; + // return 1; + // } } diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index f38994c92..4d78e9142 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -91,7 +91,7 @@ static void test_all(const std::string & lang, std::function Date: Wed, 5 Mar 2025 14:16:40 +0100 Subject: [PATCH 09/19] ci : add fetch-depth to xcframework upload (#12195) This commit adds the fetch-depth: 0 option to the checkout action in the build.yml workflow file (0 meaning that it fetches the complete history). The default value is 1 when not specified which only fetches the latest commit. This is necessary to ensure that `git rev-list --count HEAD` counts the total number of commits in the history. Currently because the default is being used the name of the xcframework artifact is always llama-b1-xcframework. --- .github/workflows/build.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 65f068a9b..b653b1f82 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1320,6 +1320,8 @@ jobs: steps: - name: Checkout code uses: actions/checkout@v4 + with: + fetch-depth: 0 - name: Build id: cmake_build From 16e4b22c5e58ee200ede913fd7b7b8ba92132ce8 Mon Sep 17 00:00:00 2001 From: Plamen Minev Date: Wed, 5 Mar 2025 17:16:01 +0200 Subject: [PATCH 10/19] ggml : fix GGMLMetalClass ODR (#12200) -- it might happen if ggml is loaded from 2 separate libraries since each one of them will expose the class. This is more of a guard since we want to use only Metal as embedded library and don't care about the other case. 
--- ggml/src/ggml-metal/ggml-metal.m | 49 ++++++++++++++++---------------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 9eb9a0db6..1f45ebad1 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -467,11 +467,13 @@ struct ggml_backend_metal_context { // for now it is easier to work in a separate file // static NSString * const msl_library_source = @"see metal.metal"; +#if !GGML_METAL_EMBED_LIBRARY // Here to assist with NSBundle Path Hack @interface GGMLMetalClass : NSObject @end @implementation GGMLMetalClass @end +#endif static void * ggml_metal_host_malloc(size_t n) { void * data = NULL; @@ -520,7 +522,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); - id metal_library; + id metal_library = nil; // load library // @@ -529,19 +531,23 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de // - if not found, load the source and compile it // - if that fails, return NULL { - NSBundle * bundle = nil; -#ifdef SWIFT_PACKAGE - bundle = SWIFTPM_MODULE_BUNDLE; -#else - bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; -#endif - NSError * error = nil; + NSString * src = nil; #if GGML_METAL_EMBED_LIBRARY - const bool try_metallib = false; + GGML_LOG_INFO("%s: using embedded metal library\n", __func__); + + extern const char ggml_metallib_start[]; + extern const char ggml_metallib_end[]; + + src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding]; + #else - const bool try_metallib = true; + +#ifdef SWIFT_PACKAGE + NSBundle * bundle = SWIFTPM_MODULE_BUNDLE; +#else + NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]]; #endif NSString * path_lib = [bundle pathForResource:@"default" ofType:@"metallib"]; @@ -574,7 +580,7 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de path_lib = default_metallib_path; } - if (try_metallib && path_lib != nil) { + if (path_lib != nil) { // pre-compiled library found NSURL * libURL = [NSURL fileURLWithPath:path_lib]; GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_lib UTF8String]); @@ -585,14 +591,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de return NULL; } } else { -#if GGML_METAL_EMBED_LIBRARY - GGML_LOG_INFO("%s: using embedded metal library\n", __func__); - - extern const char ggml_metallib_start[]; - extern const char ggml_metallib_end[]; - - NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding]; -#else GGML_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__); NSString * path_source; @@ -613,13 +611,15 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_LOG_INFO("%s: loading '%s'\n", __func__, [path_source UTF8String]); - NSString * src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error]; + src = [NSString stringWithContentsOfFile:path_source encoding:NSUTF8StringEncoding error:&error]; if (error) { GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); return NULL; } -#endif // GGML_METAL_EMBED_LIBRARY + } +#endif + if (!metal_library) { @autoreleasepool { // dictionary of preprocessor macros 
NSMutableDictionary * prep = [NSMutableDictionary dictionary]; @@ -647,10 +647,11 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de [options release]; #endif } -#if GGML_METAL_EMBED_LIBRARY - [src release]; -#endif // GGML_METAL_EMBED_LIBRARY } + +#if GGML_METAL_EMBED_LIBRARY + [src release]; +#endif // GGML_METAL_EMBED_LIBRARY } // print MTL GPU family: From 5e43f104cca1a14874e980326a506b44fde022b8 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Wed, 5 Mar 2025 21:28:23 +0530 Subject: [PATCH 11/19] SYCL: Disable f16 Unary OPs as not supported by the kernels (#12201) --- ggml/src/ggml-sycl/ggml-sycl.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index e7304947f..6977b705e 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -41,6 +41,7 @@ #include "ggml-sycl/gemm.hpp" #include "ggml-sycl/sycl_hw.hpp" #include "ggml-sycl/getrows.hpp" +#include "ggml.h" static bool g_sycl_loaded = false; int g_ggml_sycl_debug = 0; @@ -3864,7 +3865,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_TANH: case GGML_UNARY_OP_EXP: - return ggml_is_contiguous(op->src[0]); + return ggml_is_contiguous(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32); default: return false; } @@ -3981,23 +3982,24 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_VIEW: case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: + return true; case GGML_OP_ADD: case GGML_OP_ADD1: - case GGML_OP_LOG: case GGML_OP_SUB: case GGML_OP_MUL: case GGML_OP_DIV: - return true; - case GGML_OP_NORM: - case GGML_OP_RMS_NORM: - case GGML_OP_GROUP_NORM: - return ggml_is_contiguous(op->src[0]); - case GGML_OP_SCALE: case GGML_OP_SQR: case GGML_OP_SQRT: case GGML_OP_SIN: case GGML_OP_COS: case GGML_OP_CLAMP: + case GGML_OP_LOG: + return (op->src[0]->type == GGML_TYPE_F32); + case GGML_OP_NORM: + case GGML_OP_RMS_NORM: + case GGML_OP_GROUP_NORM: + return ggml_is_contiguous(op->src[0]); + case GGML_OP_SCALE: return true; case GGML_OP_CONT: return op->src[0]->type != GGML_TYPE_BF16; From 07d15723470a0a5b15d8ccad1aff5b20354ffbe1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9my=20O?= Date: Thu, 6 Mar 2025 02:26:10 +0100 Subject: [PATCH 12/19] ggml-cpu: Faster IQ1 mul_mat_vec on AVX2 using BMI2 instructions (#12154) * ggml-cpu: Faster IQ1 mul_mat_vec on AVX2 using BMI2 instructions * cmake: Add GGML_BMI2 build option * ggml: enable BMI2 on relevant CPU variants * ggml-cpu: include BMI2 in backend score * ggml-cpu: register BMI2 in ggml_backend_cpu_get_features * ggml-cpu: add __BMI2__ define when using MSVC --- ggml/CMakeLists.txt | 1 + ggml/include/ggml-cpu.h | 1 + ggml/src/CMakeLists.txt | 12 +++---- ggml/src/ggml-cpu/CMakeLists.txt | 8 +++++ ggml/src/ggml-cpu/cpu-feats-x86.cpp | 4 +++ ggml/src/ggml-cpu/ggml-cpu-quants.c | 49 ++++++++++++++++++++++------- ggml/src/ggml-cpu/ggml-cpu.c | 8 +++++ ggml/src/ggml-cpu/ggml-cpu.cpp | 3 ++ 8 files changed, 68 insertions(+), 18 deletions(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 92a62ea6f..412d294dc 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -106,6 +106,7 @@ option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable option(GGML_AVX "ggml: enable AVX" ${INS_ENB}) option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF) option(GGML_AVX2 "ggml: enable AVX2" 
${INS_ENB}) +option(GGML_BMI2 "ggml: enable BMI2" ${INS_ENB}) option(GGML_AVX512 "ggml: enable AVX512F" OFF) option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF) option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF) diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index b48cc560e..f5e11f1e1 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -80,6 +80,7 @@ extern "C" { GGML_BACKEND_API int ggml_cpu_has_avx (void); GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void); GGML_BACKEND_API int ggml_cpu_has_avx2 (void); + GGML_BACKEND_API int ggml_cpu_has_bmi2 (void); GGML_BACKEND_API int ggml_cpu_has_f16c (void); GGML_BACKEND_API int ggml_cpu_has_fma (void); GGML_BACKEND_API int ggml_cpu_has_avx512 (void); diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index fcb354e16..cfd4ac54c 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -289,7 +289,7 @@ function(ggml_add_cpu_backend_variant tag_name) set(GGML_CPU_TAG_NAME ${tag_name}) # other: OPENMP LLAMAFILE CPU_HBM foreach (feat NATIVE - AVX AVX2 AVX_VNNI FMA F16C + AVX AVX2 BMI2 AVX_VNNI FMA F16C AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8 AMX_BF16) set(GGML_${feat} OFF) @@ -309,13 +309,13 @@ if (GGML_CPU_ALL_VARIANTS) message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL") endif() ggml_add_cpu_backend_variant(sandybridge AVX) - ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 FMA) - ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512) - ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI) - ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI) + ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA) + ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512) + ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI) + ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI) if (NOT MSVC) # MSVC doesn't support AMX - ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8) + ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8) endif() elseif (GGML_CPU) ggml_add_cpu_backend_variant_impl("") diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index f8836ed61..d6c4a9c29 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -219,6 +219,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name) if (GGML_AVX_VNNI) list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI) endif() + if (GGML_BMI2) + # MSVC does not define macro __BMI2__ + list(APPEND ARCH_DEFINITIONS __BMI2__ GGML_BMI2) + endif() else () if (GGML_NATIVE) list(APPEND ARCH_FLAGS -march=native) @@ -233,6 +237,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name) list(APPEND ARCH_FLAGS -mfma) list(APPEND ARCH_DEFINITIONS GGML_FMA) endif() + if (GGML_BMI2) + list(APPEND ARCH_FLAGS -mbmi2) + list(APPEND ARCH_DEFINITIONS GGML_BMI2) + endif() if (GGML_AVX) list(APPEND ARCH_FLAGS -mavx) list(APPEND ARCH_DEFINITIONS GGML_AVX) diff --git a/ggml/src/ggml-cpu/cpu-feats-x86.cpp b/ggml/src/ggml-cpu/cpu-feats-x86.cpp index e8133d411..902ee4346 100644 --- a/ggml/src/ggml-cpu/cpu-feats-x86.cpp +++ b/ggml/src/ggml-cpu/cpu-feats-x86.cpp @@ -278,6 +278,10 @@ static int ggml_backend_cpu_x86_score() { if (!is.SSE42()) { return 0; } score += 1<<2; #endif +#ifdef GGML_BMI2 + if 
(!is.BMI2()) { return 0; } + score += 1<<3; +#endif #ifdef GGML_AVX if (!is.AVX()) { return 0; } score += 1<<4; diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c index c30ac0318..2ae66591d 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-quants.c +++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c @@ -11362,10 +11362,19 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const __m256i sumi = _mm256_setzero_si256(); int sumi1 = 0; for (int ib = 0; ib < QK_K/32; ib += 2) { +#ifdef __BMI2__ + const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib], 0x700070007000700ULL); + const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib + 1], 0x700070007000700ULL); + const uint16_t *idx1 = (const uint16_t *)(&packed_idx1); + const uint16_t *idx2 = (const uint16_t *)(&packed_idx2); + const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]); + const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]); +#else const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)], iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]); const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)], iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]); +#endif qs += 8; const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; @@ -11709,8 +11718,9 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const #elif defined __AVX2__ - const __m256i mask = _mm256_set1_epi16(0x7); + const __m256i mask = _mm256_set1_epi16(2 * 0x7); const __m256i mone = _mm256_set1_epi16(1); + const __m256i mone8 = _mm256_set1_epi8(1); __m256 accum1 = _mm256_setzero_ps(); __m256 accum2 = _mm256_setzero_ps(); @@ -11726,6 +11736,21 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const __m256i sumi1 = _mm256_setzero_si256(); __m256i sumi2 = _mm256_setzero_si256(); for (int ib = 0; ib < QK_K/32; ib += 2) { +#ifdef __BMI2__ + const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL) + | _pdep_u64(*(const uint16_t*)(qh) & 0x7777, 0xf000f000f000f00ULL); + const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL) + | _pdep_u64(*(const uint16_t*)(qh + 2) & 0x7777, 0xf000f000f000f00ULL); + const uint16_t *idx1 = (const uint16_t *)(&packed_idx1); + const uint16_t *idx2 = (const uint16_t *)(&packed_idx2); + const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]); + const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]); + + // Convert signs to bytes 0x81 (negative) or 0x01 (positive) + const uint64_t delta_sign = _pdep_u64(*(const uint32_t*)(qh) & 0x88888888, 0xf0f0f0f0f0f0f0f0ULL); + const __m256i delta1 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign))); + const __m256i delta2 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign >> 32))); +#else const __m256i q1b_1 = _mm256_set_epi64x( iq1s_grid[qs[3] | 
(((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)], iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)] @@ -11734,11 +11759,6 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)], iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)] ); - const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - - const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1); - const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2); const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101, @@ -11748,15 +11768,20 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101, qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); +#endif + const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - const __m256i dot3 = mul_add_epi8(delta1, q8b_1); - const __m256i dot4 = mul_add_epi8(delta2, q8b_2); + const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1); + const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2); + const __m256i dot3 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_1, delta1)); + const __m256i dot4 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_2, delta2)); - __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 3), _mm_set1_epi16(sc[ib/2] >> 0)); - __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 9), _mm_set1_epi16(sc[ib/2] >> 6)); + __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 2), _mm_set1_epi16(sc[ib/2] << 1)); + __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 8), _mm_set1_epi16(sc[ib/2] >> 5)); - scale1 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale1, mask), 1), mone); - scale2 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale2, mask), 1), mone); + scale1 = _mm256_add_epi16(_mm256_and_si256(scale1, mask), mone); + scale2 = _mm256_add_epi16(_mm256_and_si256(scale2, mask), mone); const __m256i p1 = _mm256_madd_epi16(dot1, scale1); const __m256i p2 = _mm256_madd_epi16(dot2, scale2); const __m256i p3 = _mm256_madd_epi16(dot3, scale1); diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 2a5463fdf..c67fdd045 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -15440,6 +15440,14 @@ int ggml_cpu_has_amx_int8(void) { #endif } +int ggml_cpu_has_bmi2(void) { +#if defined(__BMI2__) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_fma(void) { #if defined(__FMA__) return 1; diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index a84203f29..09f8382b9 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -511,6 +511,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_fma()) { features.push_back({ "FMA", "1" }); } + if (ggml_cpu_has_bmi2()) { + features.push_back({ "BMI2", "1" }); + } if (ggml_cpu_has_avx512()) { features.push_back({ "AVX512", "1" }); } From ed4ce0dda24986c80d58e2ce112049af00f632f4 Mon Sep 17 00:00:00 2001 From: 
simon886212 <37953122+simon886212@users.noreply.github.com> Date: Thu, 6 Mar 2025 09:30:05 +0800 Subject: [PATCH 13/19] opencl : fix profile-related errors (#12095) Co-authored-by: ubuntu --- ggml/src/ggml-opencl/ggml-opencl.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index dc9a718f7..9a76da43b 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -278,7 +278,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { cl_int err; -#ifdef GGML_PROFILE_OPENCL +#ifdef GGML_OPENCL_PROFILING GGML_LOG_INFO("ggml_opencl: OpenCL profiling enabled\n"); #endif @@ -3023,6 +3023,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co // enqueue kernel with profiling // <--------------------------------------------> // #ifdef GGML_OPENCL_PROFILING + cl_event evt; CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); g_profiling_info.emplace_back(); From f79243992cd31e4e4844d5158d2f3325b1d2d811 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henry=20Linjam=C3=A4ki?= Date: Thu, 6 Mar 2025 03:31:14 +0200 Subject: [PATCH 14/19] opencl : fix `ulong` kernel args were set from `int` variables (#12174) ... which left garbage bits in the upper half of the kernel args. This caused segmentation faults when running PoCL. --- ggml/src/ggml-opencl/ggml-opencl.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 9a76da43b..340284086 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -3765,10 +3765,10 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const const int ne02 = src0 ? src0->ne[2] : 0; const int ne03 = src0 ? src0->ne[3] : 0; - const int nb00 = src0 ? src0->nb[0] : 0; - const int nb01 = src0 ? src0->nb[1] : 0; - const int nb02 = src0 ? src0->nb[2] : 0; - const int nb03 = src0 ? src0->nb[3] : 0; + const cl_ulong nb00 = src0 ? src0->nb[0] : 0; + const cl_ulong nb01 = src0 ? src0->nb[1] : 0; + const cl_ulong nb02 = src0 ? src0->nb[2] : 0; + const cl_ulong nb03 = src0 ? src0->nb[3] : 0; const int ne10 = src1 ? src1->ne[0] : 0; const int ne11 = src1 ? src1->ne[1] : 0; UNUSED(ne11); @@ -3780,10 +3780,10 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const const int ne2 = dst ? dst->ne[2] : 0; const int ne3 = dst ? dst->ne[3] : 0; - const int nb0 = dst ? dst->nb[0] : 0; - const int nb1 = dst ? dst->nb[1] : 0; - const int nb2 = dst ? dst->nb[2] : 0; - const int nb3 = dst ? dst->nb[3] : 0; + const cl_ulong nb0 = dst ? dst->nb[0] : 0; + const cl_ulong nb1 = dst ? dst->nb[1] : 0; + const cl_ulong nb2 = dst ? dst->nb[2] : 0; + const cl_ulong nb3 = dst ? dst->nb[3] : 0; GGML_ASSERT(ne10 % ne02 == 0); GGML_ASSERT(ne10 >= ne02); From 94bb63e4f0f6135579b88e5700ed40cefa374e09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henry=20Linjam=C3=A4ki?= Date: Thu, 6 Mar 2025 03:33:40 +0200 Subject: [PATCH 15/19] opencl : fix buffer alignment (#12197) Fix the following error: ``` ggml-alloc.c:99: not enough space in the buffer ggml_tallocr_alloc: not enough space in the buffer to allocate blk.17.ffn_down.weight (needed 27525120, available 27521024) ``` which occurs when `ggml_backend_opencl_context::alignment` is larger than `cl_ptr_base` (hard-coded to `0x1000`). 
Also, fix `ggml_backend_opencl_context::alignment` being set from `CL_DEVICE_MEM_BASE_ADDR_ALIGN`, which was treated as bytes even though the value is reported in bits. --- ggml/src/ggml-opencl/ggml-opencl.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 340284086..bc2ea06b5 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -524,7 +524,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { return backend_ctx; } - CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &backend_ctx->alignment, NULL)); + cl_uint base_align_in_bits; + CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &base_align_in_bits, NULL)); + GGML_ASSERT(base_align_in_bits % 8u == 0); + backend_ctx->alignment = base_align_in_bits / 8u; GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment); clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL); @@ -1198,17 +1201,14 @@ struct ggml_backend_opencl_buffer_context { std::string name; }; -static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000; - static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; delete ctx; } static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) { - return cl_ptr_base; - - GGML_UNUSED(buffer); + ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer->buft->device); + return (void *) (uintptr_t) backend_ctx->alignment; } static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { @@ -1241,7 +1241,7 @@ static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buff tensor->extra = view_extra; } else { { - size_t offset = (char *)tensor->data - (char *)cl_ptr_base; + size_t offset = (char *) tensor->data - (char *) ggml_backend_opencl_buffer_get_base(buffer); ggml_tensor_extra_cl * extra = ctx->ggml_opencl_alloc_temp_tensor_extra(); extra->offset = offset; From 57b6abf85acb1f697913a44e5e105e333d894b78 Mon Sep 17 00:00:00 2001 From: Han Yin Date: Wed, 5 Mar 2025 22:22:49 -0800 Subject: [PATCH 16/19] android : fix KV cache log message condition (#12212) --- examples/llama.android/llama/src/main/cpp/llama-android.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index 2a73983a9..0de61ce77 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -361,7 +361,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init( const auto tokens_list = common_tokenize(context, text, true, parse_special); auto n_ctx = llama_n_ctx(context); - auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size()); + auto n_kv_req = tokens_list.size() + n_len; LOGi("n_len = %d, n_ctx = %d, n_kv_req = %d", n_len, n_ctx, n_kv_req); From e721c05c9336a72fbb59d5c75967360bc67036c6 Mon Sep 17 00:00:00 2001 From: uvos Date: Thu, 6 Mar 2025 08:20:52 +0100 Subject: [PATCH 17/19] HIP/CUDA: set the parameter value in maintain_cuda_graph instead of replacing it.
(#12209) This avoids conflicts with the internal CUDA/HIP runtimes' memory management behavior. --- ggml/src/ggml-cuda/ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index b5d2c8411..497de37be 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2571,7 +2571,7 @@ static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vecto for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) { if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) { char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++); - cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr; + *(void**)cuda_ctx->cuda_graph->params[i].kernelParams[1] = *(void**)updated_kernel_arg_ptr; CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i])); } } From e9b2f84f145fbd458dcb98f227bd09370918be6e Mon Sep 17 00:00:00 2001 From: Aaron Teo <57927438+taronaeo@users.noreply.github.com> Date: Thu, 6 Mar 2025 16:33:21 +0800 Subject: [PATCH 18/19] llava: add big-endian conversion for image encoder (#12218) Signed-off-by: Aaron Teo --- examples/llava/convert_image_encoder_to_gguf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/llava/convert_image_encoder_to_gguf.py b/examples/llava/convert_image_encoder_to_gguf.py index de29687ec..2949faec4 100644 --- a/examples/llava/convert_image_encoder_to_gguf.py +++ b/examples/llava/convert_image_encoder_to_gguf.py @@ -89,6 +89,7 @@ def bytes_to_unicode(): ap = argparse.ArgumentParser() ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") +ap.add_argument('--bigendian', action="store_true", default=False, help="Model is executed on big-endian machine") ap.add_argument("--text-only", action="store_true", required=False, help="Save a text-only model.
It can't be used to encode images") ap.add_argument("--vision-only", action="store_true", required=False, @@ -191,7 +192,7 @@ output_dir = args.output_dir if args.output_dir is not None else dir_model os.makedirs(output_dir, exist_ok=True) output_prefix = os.path.basename(output_dir).replace("ggml_", "") fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf") -fout = GGUFWriter(path=fname_out, arch="clip") +fout = GGUFWriter(path=fname_out, arch="clip", endianess=GGUFEndian.LITTLE if not args.bigendian else GGUFEndian.BIG) fout.add_bool("clip.has_text_encoder", has_text_encoder) fout.add_bool("clip.has_vision_encoder", has_vision_encoder) From 42994048a34b0bddd72b57c26f8ae2c7d417946d Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Thu, 6 Mar 2025 09:03:31 +0000 Subject: [PATCH 19/19] update function-calling.md w/ template override for functionary-small-v3.2 (#12214) --- docs/function-calling.md | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/docs/function-calling.md b/docs/function-calling.md index 92cb6531a..c3873c3fa 100644 --- a/docs/function-calling.md +++ b/docs/function-calling.md @@ -287,30 +287,32 @@ Here are some models known to work (w/ chat template override when needed): llama-server --jinja -fa -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M llama-server --jinja -fa -hf bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q6_K_L -llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M llama-server --jinja -fa -hf bartowski/Llama-3.3-70B-Instruct-GGUF:Q4_K_M -# Native support for DeepSeek R1 works best w/ our own template (official template buggy) +# Native support for DeepSeek R1 works best w/ our template override (official template is buggy, although we do work around it) llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q6_K_L \ ---chat-template-file models/templates/llama-cpp-deepseek-r1.jinja + --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M \ ---chat-template-file models/templates/llama-cpp-deepseek-r1.jinja + --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja # Native support requires the right template for these GGUFs: +llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M + --chat-template-file models/templates/meetkai-functionary-medium-v3.2.jinja + llama-server --jinja -fa -hf bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M \ ---chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-2-Pro-Llama-3-8B tool_use ) + --chat-template-file models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja llama-server --jinja -fa -hf bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M \ ---chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use ) + --chat-template-file models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja llama-server --jinja -fa -hf bartowski/firefunction-v2-GGUF -hff firefunction-v2-IQ1_M.gguf \ ---chat-template-file <( python scripts/get_chat_template.py fireworks-ai/llama-3-firefunction-v2 tool_use ) + --chat-template-file models/templates/fireworks-ai-llama-3-firefunction-v2.jinja llama-server --jinja -fa -hf bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L \ ---chat-template-file <( python scripts/get_chat_template.py CohereForAI/c4ai-command-r7b-12-2024 tool_use ) + --chat-template-file 
models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja # Generic format support llama-server --jinja -fa -hf bartowski/phi-4-GGUF:Q4_0 @@ -318,6 +320,8 @@ llama-server --jinja -fa -hf bartowski/gemma-2-2b-it-GGUF:Q8_0 llama-server --jinja -fa -hf bartowski/c4ai-command-r-v01-GGUF:Q2_K ``` +To get the official template from the original HuggingFace repos, you can use [scripts/get_chat_template.py](../scripts/get_chat_template.py) (see example invocations in [models/templates/README.md](../models/templates/README.md)) + > [!TIP] > If there is no official `tool_use` Jinja template, you may want to set `--chat-template chatml` to use a default that works with many models (YMMV!), or write your own (e.g. we provide a custom [llama-cpp-deepseek-r1.jinja](../models/templates/llama-cpp-deepseek-r1.jinja) for DeepSeek R1 distills)
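For reference, the template-override flow shown above can be scripted end to end. This is a sketch only: it assumes `scripts/get_chat_template.py` writes the template to stdout (as the replaced process-substitution invocations suggest), and the repo, quant, and output path are copied from the Hermes-3 entry above; other models need their own repo name and template file.

```sh
# Fetch the official tool_use template once and save it under models/templates/.
python scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use \
    > models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja

# Start the server with the saved template override.
llama-server --jinja -fa -hf bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M \
    --chat-template-file models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja
```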