diff --git a/examples/llava/README.md b/examples/llava/README.md
index 721d5e613..19f1a50a2 100644
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -29,19 +29,25 @@ git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
 git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 ```

-2. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
+2. Install the required Python packages:
+
+```sh
+pip install -r examples/llava/requirements.txt
+```
+
+3. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:

 ```sh
 python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
 ```

-3. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF:
+4. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF:

 ```sh
 python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
 ```

-4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
 python ./convert.py ../llava-v1.5-7b
diff --git a/examples/llava/llava-surgery.py b/examples/llava/llava-surgery.py
index 515f6b58d..0a61efdfe 100644
--- a/examples/llava/llava-surgery.py
+++ b/examples/llava/llava-surgery.py
@@ -42,5 +42,5 @@ if len(clip_tensors) > 0:
     torch.save(checkpoint, path)

 print("Done!")
-print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.")
+print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
 print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
diff --git a/examples/llava/requirements.txt b/examples/llava/requirements.txt
new file mode 100644
index 000000000..f80f727a7
--- /dev/null
+++ b/examples/llava/requirements.txt
@@ -0,0 +1,3 @@
+-r ../../requirements/requirements-convert.txt
+pillow~=10.2.0
+torch~=2.1.1
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 48a2909dc..0e7c44648 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1593,10 +1593,6 @@ struct llama_server_context
                         LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
                     }

-                    LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
-
-                    llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
-
                     slot.cache_tokens = prompt_tokens;

                     if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
@@ -1610,6 +1606,10 @@ struct llama_server_context
                         }
                     }

+                    LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
+
+                    llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
+
                     LOG_VERBOSE("prompt ingested", {
                                                        {"n_past", slot.n_past},
                                                        {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
diff --git a/ggml-backend.c b/ggml-backend.c
index 0764dfebc..532da8eda 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -653,6 +653,9 @@ struct ggml_backend_cpu_context {
     int n_threads;
     void * work_data;
     size_t work_size;
+
+    ggml_abort_callback abort_callback;
+    void *              abort_callback_data;
 };

 GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
@@ -691,6 +694,9 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
         cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
     }

+    cpu_plan->cplan.abort_callback      = cpu_ctx->abort_callback;
+    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
     return cpu_plan;
 }

@@ -721,9 +727,11 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
         cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
         cpu_ctx->work_size = cplan.work_size;
     }
-
     cplan.work_data = cpu_ctx->work_data;

+    cplan.abort_callback      = cpu_ctx->abort_callback;
+    cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
     ggml_graph_compute(cgraph, &cplan);
     return true;
 }
@@ -759,9 +767,11 @@ static struct ggml_backend_i cpu_backend_i = {
 ggml_backend_t ggml_backend_cpu_init(void) {
     struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));

-    ctx->n_threads = GGML_DEFAULT_N_THREADS;
-    ctx->work_data = NULL;
-    ctx->work_size = 0;
+    ctx->n_threads           = GGML_DEFAULT_N_THREADS;
+    ctx->work_data           = NULL;
+    ctx->work_size           = 0;
+    ctx->abort_callback      = NULL;
+    ctx->abort_callback_data = NULL;

     ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));

@@ -783,6 +793,14 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
     ctx->n_threads = n_threads;
 }

+void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+    ctx->abort_callback      = abort_callback;
+    ctx->abort_callback_data = abort_callback_data;
+}
+
 GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
     return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
 }
diff --git a/ggml-backend.h b/ggml-backend.h
index 8b8160fcf..282b3a9b7 100644
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -83,8 +83,9 @@ extern "C" {

     GGML_API ggml_backend_t ggml_backend_cpu_init(void);

-    GGML_API GGML_CALL bool ggml_backend_is_cpu      (ggml_backend_t backend);
-    GGML_API           void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
+    GGML_API GGML_CALL bool ggml_backend_is_cpu                (ggml_backend_t backend);
+    GGML_API           void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
+    GGML_API           void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);

     // Create a backend buffer from an existing pointer
     GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
diff --git a/ggml-metal.m b/ggml-metal.m
index 82dd8af36..50e8661f8 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -687,6 +687,7 @@ static bool ggml_metal_graph_compute(
         struct ggml_metal_context * ctx,
                struct ggml_cgraph * gf) {

+    @autoreleasepool {
     MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
     edesc.dispatchType = MTLDispatchTypeSerial;

@@ -2272,6 +2273,7 @@ static bool ggml_metal_graph_compute(
         [[MTLCaptureManager sharedCaptureManager] stopCapture];
     }

+    }
     return true;
 }

diff --git a/ggml-quants.c b/ggml-quants.c
index 62fc366ff..a91593c5e 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -270,6 +270,17 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)

 #if defined(__ARM_NEON)
+
+#ifdef _MSC_VER
+
+#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
+
+#else
+
+#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
+
+#endif
+
 #if !defined(__aarch64__)

 // 64-bit compatibility
@@ -8700,10 +8711,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * res
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
             q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
             memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
-            const uint32x4_t aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]};
-            const uint32x4_t aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]};
-            const uint32x4_t aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]};
-            const uint32x4_t aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]};
+            const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]);
+            const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]);
+            const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]);
+            const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]);
             q3 += 16;
             q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127))));
             q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
diff --git a/ggml.c b/ggml.c
index 2afdd9df4..4b1f2d63b 100644
--- a/ggml.c
+++ b/ggml.c
@@ -16649,7 +16649,7 @@ struct ggml_compute_state_shared {
     atomic_int node_n;    // active graph node
     atomic_int node_task; // active graph node task phase

-    bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
+    ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
     void * abort_callback_data;
 };

diff --git a/ggml.h b/ggml.h
index 030515e16..d59668f75 100644
--- a/ggml.h
+++ b/ggml.h
@@ -574,6 +574,11 @@ extern "C" {

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

+    // Abort callback
+    // If not NULL, called before ggml computation
+    // If it returns true, the computation is aborted
+    typedef bool (*ggml_abort_callback)(void * data);
+
     // the compute plan that needs to be prepared for ggml_graph_compute()
     // since https://github.com/ggerganov/ggml/issues/287
     struct ggml_cplan {
@@ -583,8 +588,8 @@ extern "C" {
         int n_threads;

         // abort ggml_graph_compute when true
-        bool (*abort_callback)(void * data);
-        void * abort_callback_data;
+        ggml_abort_callback abort_callback;
+        void *              abort_callback_data;
     };

     enum ggml_cgraph_eval_order {
diff --git a/ggml_vk_generate_shaders.py b/ggml_vk_generate_shaders.py
index 4abb0383f..b2e86e182 100644
--- a/ggml_vk_generate_shaders.py
+++ b/ggml_vk_generate_shaders.py
@@ -2067,6 +2067,8 @@ type_names = {

 K_QUANTS_PER_ITERATION = 2

+ASYNCIO_CONCURRENCY = 64
+
 output_dir = gettempdir()

 lock = asyncio.Lock()
@@ -2291,7 +2293,14 @@ async def main():
     tasks.append(string_to_spv("rope_neox_f32", rope_neox_src, {"A_TYPE": "float", "D_TYPE": "float"}))
     tasks.append(string_to_spv("rope_neox_f16", rope_neox_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))

-    await asyncio.gather(*tasks)
+    # Helper to decorate tasks with semaphore acquisition.
+    async def withSemaphore(sem, task):
+        async with sem:
+            return await task
+
+    # Run tasks concurrently guarded by a concurrency limit.
+    sem = asyncio.Semaphore(ASYNCIO_CONCURRENCY)
+    await asyncio.gather(*(withSemaphore(sem, task) for task in tasks))

     with open("ggml-vulkan-shaders.hpp", "w") as f:
         f.write("#include \n\n")
diff --git a/llama.cpp b/llama.cpp
index c99e091e2..e500129fa 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -7360,7 +7360,9 @@ static int llama_decode_internal(
     // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
     // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
     // with the BLAS calls. need a better solution
-    if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+    // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
+    // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
+    if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
         n_threads = std::min(4, n_threads);
     }

diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh
index 6b2514a11..2c391e641 100755
--- a/scripts/sync-ggml-am.sh
+++ b/scripts/sync-ggml-am.sh
@@ -97,6 +97,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
     # src/ggml-cuda.cu -> ggml-cuda.cu
     # src/ggml-cuda.h -> ggml-cuda.h
     # src/ggml-impl.h -> ggml-impl.h
+    # src/ggml-kompute.cpp -> ggml-kompute.cpp
+    # src/ggml-kompute.h -> ggml-kompute.h
     # src/ggml-metal.h -> ggml-metal.h
     # src/ggml-metal.m -> ggml-metal.m
     # src/ggml-mpi.h -> ggml-mpi.h
@@ -105,6 +107,10 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
     # src/ggml-opencl.h -> ggml-opencl.h
     # src/ggml-quants.c -> ggml-quants.c
     # src/ggml-quants.h -> ggml-quants.h
+    # src/ggml-sycl.cpp -> ggml-sycl.cpp
+    # src/ggml-sycl.h -> ggml-sycl.h
+    # src/ggml-vulkan.cpp -> ggml-vulkan.cpp
+    # src/ggml-vulkan.h -> ggml-vulkan.h
     # include/ggml/ggml.h -> ggml.h
     # include/ggml/ggml-alloc.h -> ggml-alloc.h
     # include/ggml/ggml-backend.h -> ggml-backend.h
@@ -123,6 +129,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
         -e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \
         -e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \
         -e 's/src\/ggml-impl\.h/ggml-impl.h/g' \
+        -e 's/src\/ggml-kompute\.cpp/ggml-kompute.cpp/g' \
+        -e 's/src\/ggml-kompute\.h/ggml-kompute.h/g' \
         -e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
         -e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
         -e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \
@@ -131,6 +139,10 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
         -e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
         -e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
         -e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
+        -e 's/src\/ggml-sycl\.cpp/ggml-sycl.cpp/g' \
+        -e 's/src\/ggml-sycl\.h/ggml-sycl.h/g' \
+        -e 's/src\/ggml-vulkan\.cpp/ggml-vulkan.cpp/g' \
+        -e 's/src\/ggml-vulkan\.h/ggml-vulkan.h/g' \
         -e 's/include\/ggml\/ggml\.h/ggml.h/g' \
         -e 's/include\/ggml\/ggml-alloc\.h/ggml-alloc.h/g' \
         -e 's/include\/ggml\/ggml-backend\.h/ggml-backend.h/g' \
diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 7b6c17915..6ae75bc31 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-475cbad5c1c834e31e26a2283bc1413181644360
+2c7cf49810d523b9632da393a9e8270b60bf3b24
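
For reference, a minimal usage sketch of the abort-callback API introduced above (the `ggml_abort_callback` typedef in ggml.h and `ggml_backend_cpu_set_abort_callback` in ggml-backend.h / ggml-backend.c). This snippet is not part of the patch; the flag and callback names are illustrative only, and only the declarations visible in the diff are assumed:

```c
// Sketch: register an abort callback on the CPU backend so a long-running
// graph computation can be interrupted. Names like my_abort_callback and
// stop_requested are hypothetical.
#include <stdbool.h>
#include "ggml-backend.h"

static bool my_abort_callback(void * data) {
    // Per the comment added to ggml.h: called before ggml computation;
    // returning true aborts the computation.
    const volatile bool * stop = (const volatile bool *) data;
    return *stop;
}

int main(void) {
    static volatile bool stop_requested = false; // would be flipped from another thread

    ggml_backend_t backend = ggml_backend_cpu_init();
    ggml_backend_cpu_set_abort_callback(backend, my_abort_callback, (void *) &stop_requested);

    // ... build a ggml graph and run it with ggml_backend_graph_compute(backend, graph);
    //     the callback is forwarded into the cplan, so setting stop_requested aborts it ...

    ggml_backend_free(backend);
    return 0;
}
```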