From b2f87cb64db47d799b6f3656855c9caf9792ab2a Mon Sep 17 00:00:00 2001
From: Michael Podvitskiy <podvitskiymichael@gmail.com>
Date: Fri, 9 Feb 2024 10:56:43 +0100
Subject: [PATCH 01/10] ggml : fix `error C2078: too many initializers` for
 MSVC ARM64 (#5404)

---
 ggml-quants.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/ggml-quants.c b/ggml-quants.c
index 101d3e783..1031e3761 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -268,6 +268,17 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
 
 #if defined(__ARM_NEON)
+
+#ifdef _MSC_VER
+
+#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
+
+#else
+
+#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
+
+#endif
+
 #if !defined(__aarch64__)
 
 // 64-bit compatibility
@@ -8698,10 +8709,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * res
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
             q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
             memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
-            const uint32x4_t aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]};
-            const uint32x4_t aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]};
-            const uint32x4_t aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]};
-            const uint32x4_t aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]};
+            const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]);
+            const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]);
+            const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]);
+            const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]);
             q3 += 16;
             q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >>  0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >>  7) & 127))));
             q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));

From e4124c24775f2cb5b3d7acc93bf9dc5471c172ef Mon Sep 17 00:00:00 2001
From: Marko Tasic <mtasic85@gmail.com>
Date: Fri, 9 Feb 2024 11:17:00 +0100
Subject: [PATCH 02/10] readme : add JavaScript/Wasm repo (#5415)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 66166c01b..0b4efdd33 100644
--- a/README.md
+++ b/README.md
@@ -124,6 +124,7 @@ Typically finetunes of the base models below are supported as well.
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
 - JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
+- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
 - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)

From e5ca3937c685d6e012ac4db40555d6ec100ff03c Mon Sep 17 00:00:00 2001
From: Paul Tsochantaris <ptsochantaris@icloud.com>
Date: Fri, 9 Feb 2024 10:48:06 +0000
Subject: [PATCH 03/10] llama : do not cap thread count when MoE on CPU (#5419)

* Not capping thread count when MoE inference is running on CPU

* Whitespace
---
 llama.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index db7d1c1cd..0566b087b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -7285,7 +7285,9 @@ static int llama_decode_internal(
     // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
     //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
     //       with the BLAS calls. need a better solution
-    if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+    // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
+    //                   being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
+    if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
         n_threads = std::min(4, n_threads);
     }
 

From 7c777fcd5dd4af7079e33390cf6a19c328a2666f Mon Sep 17 00:00:00 2001
From: Riley Stewart <ristew@users.noreply.github.com>
Date: Fri, 9 Feb 2024 02:49:49 -0800
Subject: [PATCH 04/10] server : fix prompt caching for repeated prompts
 (#5420)

---
 examples/server/server.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index eceda30d0..8d668f798 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1592,10 +1592,6 @@ struct llama_server_context
                         LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
                     }
 
-                    LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
-
-                    llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
-
                     slot.cache_tokens = prompt_tokens;
 
                     if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
@@ -1609,6 +1605,10 @@ struct llama_server_context
                         }
                     }
 
+                    LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
+
+                    llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
+
                     LOG_VERBOSE("prompt ingested", {
                                                     {"n_past",  slot.n_past},
                                                     {"cached",  tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},

From e00d2a62dd1441e3b089570ec06d05c18800d368 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Fri, 9 Feb 2024 14:00:59 +0100
Subject: [PATCH 05/10] llava : add requirements.txt and update README.md
 (#5428)

* llava: add requirements.txt and update README.md

This commit adds a `requirements.txt` file to the `examples/llava`
directory. This file contains the required Python packages to run the
scripts in the `examples/llava` directory.

The motivation for this is to make it easier for users to run the scripts in
`examples/llava`. This will help users avoid running into missing package
issues if the packages are not installed on their system.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>

* llava: fix typo in llava-surgery.py output

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>

---------

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
---
 examples/llava/README.md        | 12 +++++++++---
 examples/llava/llava-surgery.py |  2 +-
 examples/llava/requirements.txt |  3 +++
 3 files changed, 13 insertions(+), 4 deletions(-)
 create mode 100644 examples/llava/requirements.txt

diff --git a/examples/llava/README.md b/examples/llava/README.md
index 721d5e613..19f1a50a2 100644
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -29,19 +29,25 @@ git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
 git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 ```
 
-2. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
+2. Install the required Python packages:
+
+```sh
+pip install -r examples/llava/requirements.txt
+```
+
+3. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodal projector constituents:
 
 ```sh
 python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
 ```
 
-3. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF:
+4. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF:
 
 ```sh
 python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
 ```
 
-4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
 
 ```sh
 python ./convert.py ../llava-v1.5-7b
diff --git a/examples/llava/llava-surgery.py b/examples/llava/llava-surgery.py
index 515f6b58d..0a61efdfe 100644
--- a/examples/llava/llava-surgery.py
+++ b/examples/llava/llava-surgery.py
@@ -42,5 +42,5 @@ if len(clip_tensors) > 0:
 torch.save(checkpoint, path)
 
 print("Done!")
-print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.")
+print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
 print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
diff --git a/examples/llava/requirements.txt b/examples/llava/requirements.txt
new file mode 100644
index 000000000..f80f727a7
--- /dev/null
+++ b/examples/llava/requirements.txt
@@ -0,0 +1,3 @@
+-r ../../requirements/requirements-convert.txt
+pillow~=10.2.0
+torch~=2.1.1

From 4b7b38bef5addbd31f453871d79647fbae6bec8a Mon Sep 17 00:00:00 2001
From: Neuman Vong <neuman.vong@gmail.com>
Date: Sat, 10 Feb 2024 05:30:19 +1100
Subject: [PATCH 06/10] vulkan: Set limit for task concurrency (#5427)

A common default for the maximum number of open files is 256, which can
lead to `asyncio.gather(*tasks)` failing with `Too many open files`.

    $ python ggml_vk_generate_shaders.py --glslc=$ANDROID_NDK_PATH/shader-tools/darwin-x86_64/glslc
    ggml_vulkan: Generating and compiling shaders to SPIR-V
    Traceback (most recent call last):
      File "/Users/neuman/Code.noindex/github/llama.cpp/ggml_vk_generate_shaders.py", line 2326, in <module>
        asyncio.run(main())
      File "/Users/neuman/Code.noindex/miniforge3/lib/python3.10/asyncio/runners.py", line 44, in run
        return loop.run_until_complete(main)
      File "/Users/neuman/Code.noindex/miniforge3/lib/python3.10/asyncio/base_events.py", line 649, in run_until_complete
        return future.result()
      File "/Users/neuman/Code.noindex/github/llama.cpp/ggml_vk_generate_shaders.py", line 2294, in main
        await asyncio.gather(*tasks)
    [...snip...]
    OSError: [Errno 24] Too many open files

This change sets a reasonable concurrency limit for tasks (and therefore
open files), without significant impact on run time.
---
 ggml_vk_generate_shaders.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/ggml_vk_generate_shaders.py b/ggml_vk_generate_shaders.py
index 4abb0383f..b2e86e182 100644
--- a/ggml_vk_generate_shaders.py
+++ b/ggml_vk_generate_shaders.py
@@ -2067,6 +2067,8 @@ type_names = {
 
 K_QUANTS_PER_ITERATION = 2
 
+ASYNCIO_CONCURRENCY = 64
+
 output_dir = gettempdir()
 
 lock = asyncio.Lock()
@@ -2291,7 +2293,14 @@ async def main():
     tasks.append(string_to_spv("rope_neox_f32", rope_neox_src, {"A_TYPE": "float", "D_TYPE": "float"}))
     tasks.append(string_to_spv("rope_neox_f16", rope_neox_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
 
-    await asyncio.gather(*tasks)
+    # Helper to decorate tasks with semaphore acquisition.
+    async def withSemaphore(sem, task):
+        async with sem:
+            return await task
+
+    # Run tasks concurrently guarded by a concurrency limit.
+    sem = asyncio.Semaphore(ASYNCIO_CONCURRENCY)
+    await asyncio.gather(*(withSemaphore(sem, task) for task in tasks))
 
     with open("ggml-vulkan-shaders.hpp", "w") as f:
         f.write("#include <cstdint>\n\n")

From 4633d93af08d890ecd00fa6e4f61d76f21cded4c Mon Sep 17 00:00:00 2001
From: Michael Podvitskiy <podvitskiymichael@gmail.com>
Date: Fri, 9 Feb 2024 10:42:27 +0100
Subject: [PATCH 07/10] ggml : add abort_callback for cpu backend (ggml/725)

* a way to use abort_callback with the cpu backend

* whisper update
---
 ggml-backend.c | 26 ++++++++++++++++++++++----
 ggml-backend.h |  5 +++--
 ggml.c         |  2 +-
 ggml.h         |  9 +++++++--
 4 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index 0764dfebc..532da8eda 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -653,6 +653,9 @@ struct ggml_backend_cpu_context {
     int n_threads;
     void * work_data;
     size_t work_size;
+
+    ggml_abort_callback abort_callback;
+    void *              abort_callback_data;
 };
 
 GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
@@ -691,6 +694,9 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
         cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
     }
 
+    cpu_plan->cplan.abort_callback      = cpu_ctx->abort_callback;
+    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
     return cpu_plan;
 }
 
@@ -721,9 +727,11 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
         cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
         cpu_ctx->work_size = cplan.work_size;
     }
-
     cplan.work_data = cpu_ctx->work_data;
 
+    cplan.abort_callback      = cpu_ctx->abort_callback;
+    cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
     ggml_graph_compute(cgraph, &cplan);
     return true;
 }
@@ -759,9 +767,11 @@ static struct ggml_backend_i cpu_backend_i = {
 ggml_backend_t ggml_backend_cpu_init(void) {
     struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
 
-    ctx->n_threads = GGML_DEFAULT_N_THREADS;
-    ctx->work_data = NULL;
-    ctx->work_size = 0;
+    ctx->n_threads           = GGML_DEFAULT_N_THREADS;
+    ctx->work_data           = NULL;
+    ctx->work_size           = 0;
+    ctx->abort_callback      = NULL;
+    ctx->abort_callback_data = NULL;
 
     ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
 
@@ -783,6 +793,14 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
     ctx->n_threads = n_threads;
 }
 
+void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+    ctx->abort_callback = abort_callback;
+    ctx->abort_callback_data = abort_callback_data;
+}
+
 GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
     return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
 }
diff --git a/ggml-backend.h b/ggml-backend.h
index 8b8160fcf..282b3a9b7 100644
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -83,8 +83,9 @@ extern "C" {
 
     GGML_API ggml_backend_t ggml_backend_cpu_init(void);
 
-    GGML_API GGML_CALL bool ggml_backend_is_cpu           (ggml_backend_t backend);
-    GGML_API           void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
+    GGML_API GGML_CALL bool ggml_backend_is_cpu                (ggml_backend_t backend);
+    GGML_API           void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
+    GGML_API           void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
 
     // Create a backend buffer from an existing pointer
     GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
diff --git a/ggml.c b/ggml.c
index f783a6fd3..86cd65862 100644
--- a/ggml.c
+++ b/ggml.c
@@ -16649,7 +16649,7 @@ struct ggml_compute_state_shared {
     atomic_int node_n;    // active graph node
     atomic_int node_task; // active graph node task phase
 
-    bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
+    ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
     void * abort_callback_data;
 };
 
diff --git a/ggml.h b/ggml.h
index e0a4799f3..1360cd8ee 100644
--- a/ggml.h
+++ b/ggml.h
@@ -567,6 +567,11 @@ extern "C" {
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
+    // Abort callback
+    // If not NULL, called before ggml computation
+    // If it returns true, the computation is aborted
+    typedef bool (*ggml_abort_callback)(void * data);
+
     // the compute plan that needs to be prepared for ggml_graph_compute()
     // since https://github.com/ggerganov/ggml/issues/287
     struct ggml_cplan {
@@ -576,8 +581,8 @@ extern "C" {
         int n_threads;
 
         // abort ggml_graph_compute when true
-        bool (*abort_callback)(void * data);
-        void * abort_callback_data;
+        ggml_abort_callback abort_callback;
+        void *              abort_callback_data;
     };
 
     enum ggml_cgraph_eval_order {

From 43b65f5eb85e8741aba573a8f65bb8efad245d31 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 10 Feb 2024 09:30:36 +0200
Subject: [PATCH 08/10] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 7b6c17915..6ae75bc31 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-475cbad5c1c834e31e26a2283bc1413181644360
+2c7cf49810d523b9632da393a9e8270b60bf3b24

From cd9aea63b577a83def84dbd6dcd90a6fa02af745 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 10 Feb 2024 09:53:05 +0200
Subject: [PATCH 09/10] scripts : update sync scripts with new backends

---
 scripts/sync-ggml-am.sh | 12 ++++++++++++
 scripts/sync-ggml.sh    |  6 ++++++
 2 files changed, 18 insertions(+)

diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh
index 6b2514a11..2c391e641 100755
--- a/scripts/sync-ggml-am.sh
+++ b/scripts/sync-ggml-am.sh
@@ -97,6 +97,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
     # src/ggml-cuda.cu            -> ggml-cuda.cu
     # src/ggml-cuda.h             -> ggml-cuda.h
     # src/ggml-impl.h             -> ggml-impl.h
+    # src/ggml-kompute.cpp        -> ggml-kompute.cpp
+    # src/ggml-kompute.h          -> ggml-kompute.h
     # src/ggml-metal.h            -> ggml-metal.h
     # src/ggml-metal.m            -> ggml-metal.m
     # src/ggml-mpi.h              -> ggml-mpi.h
@@ -105,6 +107,10 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
     # src/ggml-opencl.h           -> ggml-opencl.h
     # src/ggml-quants.c           -> ggml-quants.c
     # src/ggml-quants.h           -> ggml-quants.h
+    # src/ggml-sycl.cpp           -> ggml-sycl.cpp
+    # src/ggml-sycl.h             -> ggml-sycl.h
+    # src/ggml-vulkan.cpp         -> ggml-vulkan.cpp
+    # src/ggml-vulkan.h           -> ggml-vulkan.h
     # include/ggml/ggml.h         -> ggml.h
     # include/ggml/ggml-alloc.h   -> ggml-alloc.h
     # include/ggml/ggml-backend.h -> ggml-backend.h
@@ -123,6 +129,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
         -e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \
         -e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \
         -e 's/src\/ggml-impl\.h/ggml-impl.h/g' \
+        -e 's/src\/ggml-kompute\.cpp/ggml-kompute.cpp/g' \
+        -e 's/src\/ggml-kompute\.h/ggml-kompute.h/g' \
         -e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
         -e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
         -e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \
@@ -131,6 +139,10 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
         -e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
         -e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
         -e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
+        -e 's/src\/ggml-sycl\.cpp/ggml-sycl.cpp/g' \
+        -e 's/src\/ggml-sycl\.h/ggml-sycl.h/g' \
+        -e 's/src\/ggml-vulkan\.cpp/ggml-vulkan.cpp/g' \
+        -e 's/src\/ggml-vulkan\.h/ggml-vulkan.h/g' \
         -e 's/include\/ggml\/ggml\.h/ggml.h/g' \
         -e 's/include\/ggml\/ggml-alloc\.h/ggml-alloc.h/g' \
         -e 's/include\/ggml\/ggml-backend\.h/ggml-backend.h/g' \
diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh
index 0097db435..feb34bbc8 100755
--- a/scripts/sync-ggml.sh
+++ b/scripts/sync-ggml.sh
@@ -7,6 +7,8 @@ cp -rpv ../ggml/src/ggml-backend.c          ./ggml-backend.c
 cp -rpv ../ggml/src/ggml-cuda.cu            ./ggml-cuda.cu
 cp -rpv ../ggml/src/ggml-cuda.h             ./ggml-cuda.h
 cp -rpv ../ggml/src/ggml-impl.h             ./ggml-impl.h
+cp -rpv ../ggml/src/ggml-kompute.cpp        ./ggml-kompute.cpp
+cp -rpv ../ggml/src/ggml-kompute.h          ./ggml-kompute.h
 cp -rpv ../ggml/src/ggml-metal.h            ./ggml-metal.h
 cp -rpv ../ggml/src/ggml-metal.m            ./ggml-metal.m
 cp -rpv ../ggml/src/ggml-metal.metal        ./ggml-metal.metal
@@ -16,6 +18,10 @@ cp -rpv ../ggml/src/ggml-opencl.cpp         ./ggml-opencl.cpp
 cp -rpv ../ggml/src/ggml-opencl.h           ./ggml-opencl.h
 cp -rpv ../ggml/src/ggml-quants.c           ./ggml-quants.c
 cp -rpv ../ggml/src/ggml-quants.h           ./ggml-quants.h
+cp -rpv ../ggml/src/ggml-sycl.cpp           ./ggml-sycl.cpp
+cp -rpv ../ggml/src/ggml-sycl.h             ./ggml-sycl.h
+cp -rpv ../ggml/src/ggml-vulkan.cpp         ./ggml-vulkan.cpp
+cp -rpv ../ggml/src/ggml-vulkan.h           ./ggml-vulkan.h
 cp -rpv ../ggml/include/ggml/ggml.h         ./ggml.h
 cp -rpv ../ggml/include/ggml/ggml-alloc.h   ./ggml-alloc.h
 cp -rpv ../ggml/include/ggml/ggml-backend.h ./ggml-backend.h

From f026f8120f97090d34a52b3dc023c82e0ede3f7d Mon Sep 17 00:00:00 2001
From: Ian Bull <irbull@eclipsesource.com>
Date: Sat, 10 Feb 2024 02:53:28 -0800
Subject: [PATCH 10/10] metal : use autoreleasepool to avoid memory leaks
 (#5437)

There appears to be a known memory leak when using
`MTLCommandBuffer`. Using `@autoreleasepool` is suggested in
[1,2].

[1] https://developer.apple.com/forums/thread/662721
[2] https://forums.developer.apple.com/forums/thread/120931

This change-set wraps the body of `ggml_metal_graph_compute` in an
`@autoreleasepool` block.

This commit addresses https://github.com/ggerganov/llama.cpp/issues/5436
---
 ggml-metal.m | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ggml-metal.m b/ggml-metal.m
index 5260ed827..c1d8e2de8 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -687,6 +687,7 @@ static bool ggml_metal_graph_compute(
         struct ggml_metal_context * ctx,
                struct ggml_cgraph * gf) {
 
+    @autoreleasepool {
     MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
     edesc.dispatchType = MTLDispatchTypeSerial;
 
@@ -2272,6 +2273,7 @@ static bool ggml_metal_graph_compute(
         [[MTLCaptureManager sharedCaptureManager] stopCapture];
     }
 
+    }
     return true;
 }