Merge branch 'master' into concedo_experimental

# Conflicts:
#	README.md
#	scripts/sync-ggml.sh
commit ea3fd87f68 by Concedo, 2024-02-11 15:18:46 +08:00
14 changed files with 93 additions and 24 deletions


@ -29,19 +29,25 @@ git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
git clone https://huggingface.co/openai/clip-vit-large-patch14-336
```
2. Use `llava-surgery.py` to split the LLaVA model into LLaMA and multimodal projector constituents:
2. Install the required Python packages:
```sh
pip install -r examples/llava/requirements.txt
```
3. Use `llava-surgery.py` to split the LLaVA model into LLaMA and multimodal projector constituents:
```sh
python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
```
3. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF:
4. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF:
```sh
python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
```
4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
```sh
python ./convert.py ../llava-v1.5-7b


@ -42,5 +42,5 @@ if len(clip_tensors) > 0:
torch.save(checkpoint, path)
print("Done!")
print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.")
print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")


@ -0,0 +1,3 @@
-r ../../requirements/requirements-convert.txt
pillow~=10.2.0
torch~=2.1.1


@ -1593,10 +1593,6 @@ struct llama_server_context
LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
}
LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
slot.cache_tokens = prompt_tokens;
if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
@ -1610,6 +1606,10 @@ struct llama_server_context
}
}
LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
LOG_VERBOSE("prompt ingested", {
{"n_past", slot.n_past},
{"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},


@ -653,6 +653,9 @@ struct ggml_backend_cpu_context {
int n_threads;
void * work_data;
size_t work_size;
ggml_abort_callback abort_callback;
void * abort_callback_data;
};
GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
@ -691,6 +694,9 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
}
cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
return cpu_plan;
}
@ -721,9 +727,11 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
cpu_ctx->work_size = cplan.work_size;
}
cplan.work_data = cpu_ctx->work_data;
cplan.abort_callback = cpu_ctx->abort_callback;
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
ggml_graph_compute(cgraph, &cplan);
return true;
}
@ -762,6 +770,8 @@ ggml_backend_t ggml_backend_cpu_init(void) {
ctx->n_threads = GGML_DEFAULT_N_THREADS;
ctx->work_data = NULL;
ctx->work_size = 0;
ctx->abort_callback = NULL;
ctx->abort_callback_data = NULL;
ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
@ -783,6 +793,14 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
ctx->n_threads = n_threads;
}
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
ctx->abort_callback = abort_callback;
ctx->abort_callback_data = abort_callback_data;
}
GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
}


@ -84,7 +84,8 @@ extern "C" {
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
// Create a backend buffer from an existing pointer
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
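A minimal sketch of how a caller might use the new setter to give the CPU backend a wall-clock budget; only `ggml_backend_cpu_init`, `ggml_backend_cpu_set_n_threads`, and `ggml_backend_cpu_set_abort_callback` come from the API above, while the deadline state and helper name are illustrative:

```c
// Sketch only: abort CPU graph computation once a deadline has passed.
#include <stdbool.h>
#include <time.h>
#include "ggml-backend.h"

struct deadline { time_t end; };                     // illustrative helper state

static bool abort_after_deadline(void * data) {
    const struct deadline * d = (const struct deadline *) data;
    return time(NULL) >= d->end;                     // true => stop the running graph
}

static ggml_backend_t make_cpu_backend_with_timeout(int seconds) {
    static struct deadline d;                        // must stay alive while computing
    d.end = time(NULL) + seconds;

    ggml_backend_t backend = ggml_backend_cpu_init();
    ggml_backend_cpu_set_n_threads(backend, 4);
    ggml_backend_cpu_set_abort_callback(backend, abort_after_deadline, &d);
    return backend;
}
```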


@ -687,6 +687,7 @@ static bool ggml_metal_graph_compute(
struct ggml_metal_context * ctx,
struct ggml_cgraph * gf) {
@autoreleasepool {
MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
edesc.dispatchType = MTLDispatchTypeSerial;
@ -2272,6 +2273,7 @@ static bool ggml_metal_graph_compute(
[[MTLCaptureManager sharedCaptureManager] stopCapture];
}
}
return true;
}


@ -270,6 +270,17 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
#if defined(__ARM_NEON)
#ifdef _MSC_VER
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
#else
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
#endif
#if !defined(__aarch64__)
// 64-bit compatibility
@ -8700,10 +8711,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * res
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
const uint32x4_t aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]};
const uint32x4_t aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]};
const uint32x4_t aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]};
const uint32x4_t aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]};
const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]);
const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]);
const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]);
const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]);
q3 += 16;
q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127))));
q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
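The `ggml_vld1q_u32` macro introduced above papers over a compiler difference: GCC and Clang accept four 32-bit values in a `uint32x4_t` brace initializer, while MSVC's NEON types brace-initialize through a pair of 64-bit lanes, so each pair of 32-bit values has to be packed into one 64-bit word. A minimal sketch to sanity-check the macro, assuming an AArch64 toolchain (the macro body is copied from the hunk; `main` is illustrative):

```c
// Sketch only: verify that both expansions of ggml_vld1q_u32 produce the lanes 1 2 3 4.
// Requires an ARM NEON target; this will not build on x86.
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

#ifdef _MSC_VER
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
#else
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
#endif

int main(void) {
    const uint32x4_t v = ggml_vld1q_u32(1u, 2u, 3u, 4u);
    uint32_t lanes[4];
    vst1q_u32(lanes, v);
    printf("%u %u %u %u\n", lanes[0], lanes[1], lanes[2], lanes[3]); // expected: 1 2 3 4
    return 0;
}
```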

ggml.c (2 changed lines)

@ -16649,7 +16649,7 @@ struct ggml_compute_state_shared {
atomic_int node_n; // active graph node
atomic_int node_task; // active graph node task phase
bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
void * abort_callback_data;
};

ggml.h (7 changed lines)

@ -574,6 +574,11 @@ extern "C" {
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
// Abort callback
// If not NULL, called before ggml computation
// If it returns true, the computation is aborted
typedef bool (*ggml_abort_callback)(void * data);
// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan {
@ -583,7 +588,7 @@ extern "C" {
int n_threads;
// abort ggml_graph_compute when true
bool (*abort_callback)(void * data);
ggml_abort_callback abort_callback;
void * abort_callback_data;
};
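The same typedef is what `ggml_cplan` stores, so a callback can also be attached when driving `ggml_graph_compute()` directly. A minimal sketch, assuming an already-built `ggml_cgraph`; the cancellation flag and wrapper name are illustrative:

```c
// Sketch only: run a graph with a cooperative cancellation flag.
#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>
#include "ggml.h"

static atomic_bool g_cancel = false;     // e.g. flipped by another thread or a signal handler

static bool should_abort(void * data) {
    (void) data;
    return atomic_load(&g_cancel);       // true => ggml_graph_compute() returns early
}

static void compute_with_cancel(struct ggml_cgraph * graph, int n_threads) {
    struct ggml_cplan cplan = ggml_graph_plan(graph, n_threads);
    if (cplan.work_size > 0) {
        cplan.work_data = malloc(cplan.work_size);
    }
    cplan.abort_callback      = should_abort;
    cplan.abort_callback_data = NULL;

    ggml_graph_compute(graph, &cplan);

    free(cplan.work_data);
}
```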


@ -2067,6 +2067,8 @@ type_names = {
K_QUANTS_PER_ITERATION = 2
ASYNCIO_CONCURRENCY = 64
output_dir = gettempdir()
lock = asyncio.Lock()
@ -2291,7 +2293,14 @@ async def main():
tasks.append(string_to_spv("rope_neox_f32", rope_neox_src, {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("rope_neox_f16", rope_neox_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
await asyncio.gather(*tasks)
# Helper to decorate tasks with semaphore acquisition.
async def withSemaphore(sem, task):
async with sem:
return await task
# Run tasks concurrently guarded by a concurrency limit.
sem = asyncio.Semaphore(ASYNCIO_CONCURRENCY)
await asyncio.gather(*(withSemaphore(sem, task) for task in tasks))
with open("ggml-vulkan-shaders.hpp", "w") as f:
f.write("#include <cstdint>\n\n")


@ -7360,7 +7360,9 @@ static int llama_decode_internal(
// TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
// we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
// with the BLAS calls. need a better solution
if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
// MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
// being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
n_threads = std::min(4, n_threads);
}
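Restated outside the decoder: for dense (non-MoE) models, batches large enough to hit the CPU BLAS path keep at most four worker threads so the BLAS library gets the remaining cores. A small sketch of that decision with illustrative names; only the condition itself mirrors the hunk above:

```c
// Sketch only: the thread-capping rule from llama_decode_internal, written as a helper.
#include <stdbool.h>

static int choose_n_threads(int n_threads, int n_tokens, int n_expert,
                            bool has_blas, bool has_gpublas) {
    // dense model + batch big enough for BLAS + no GPU BLAS:
    // cap the ggml worker threads so they do not fight the BLAS thread pool
    if (n_tokens >= 32 && n_expert == 0 && has_blas && !has_gpublas) {
        return n_threads < 4 ? n_threads : 4;
    }
    return n_threads;
}
```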


@ -97,6 +97,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
# src/ggml-cuda.cu -> ggml-cuda.cu
# src/ggml-cuda.h -> ggml-cuda.h
# src/ggml-impl.h -> ggml-impl.h
# src/ggml-kompute.cpp -> ggml-kompute.cpp
# src/ggml-kompute.h -> ggml-kompute.h
# src/ggml-metal.h -> ggml-metal.h
# src/ggml-metal.m -> ggml-metal.m
# src/ggml-mpi.h -> ggml-mpi.h
@ -105,6 +107,10 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
# src/ggml-opencl.h -> ggml-opencl.h
# src/ggml-quants.c -> ggml-quants.c
# src/ggml-quants.h -> ggml-quants.h
# src/ggml-sycl.cpp -> ggml-sycl.cpp
# src/ggml-sycl.h -> ggml-sycl.h
# src/ggml-vulkan.cpp -> ggml-vulkan.cpp
# src/ggml-vulkan.h -> ggml-vulkan.h
# include/ggml/ggml.h -> ggml.h
# include/ggml/ggml-alloc.h -> ggml-alloc.h
# include/ggml/ggml-backend.h -> ggml-backend.h
@ -123,6 +129,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
-e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \
-e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \
-e 's/src\/ggml-impl\.h/ggml-impl.h/g' \
-e 's/src\/ggml-kompute\.cpp/ggml-kompute.cpp/g' \
-e 's/src\/ggml-kompute\.h/ggml-kompute.h/g' \
-e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
-e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
-e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \
@ -131,6 +139,10 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
-e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
-e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
-e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
-e 's/src\/ggml-sycl\.cpp/ggml-sycl.cpp/g' \
-e 's/src\/ggml-sycl\.h/ggml-sycl.h/g' \
-e 's/src\/ggml-vulkan\.cpp/ggml-vulkan.cpp/g' \
-e 's/src\/ggml-vulkan\.h/ggml-vulkan.h/g' \
-e 's/include\/ggml\/ggml\.h/ggml.h/g' \
-e 's/include\/ggml\/ggml-alloc\.h/ggml-alloc.h/g' \
-e 's/include\/ggml\/ggml-backend\.h/ggml-backend.h/g' \


@ -1 +1 @@
475cbad5c1c834e31e26a2283bc1413181644360
2c7cf49810d523b9632da393a9e8270b60bf3b24