diff --git a/examples/llava/README.md b/examples/llava/README.md
index 721d5e613..19f1a50a2 100644
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -29,19 +29,25 @@ git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
 git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 ```

-2. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
+2. Install the required Python packages:
+
+```sh
+pip install -r examples/llava/requirements.txt
+```
+
+3. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:

 ```sh
 python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
 ```

-3. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF:
+4. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF:

 ```sh
 python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
 ```

-4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
 python ./convert.py ../llava-v1.5-7b
diff --git a/examples/llava/llava-surgery.py b/examples/llava/llava-surgery.py
index 515f6b58d..0a61efdfe 100644
--- a/examples/llava/llava-surgery.py
+++ b/examples/llava/llava-surgery.py
@@ -42,5 +42,5 @@ if len(clip_tensors) > 0:
     torch.save(checkpoint, path)

 print("Done!")
-print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.")
+print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
 print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
diff --git a/examples/llava/requirements.txt b/examples/llava/requirements.txt
new file mode 100644
index 000000000..f80f727a7
--- /dev/null
+++ b/examples/llava/requirements.txt
@@ -0,0 +1,3 @@
+-r ../../requirements/requirements-convert.txt
+pillow~=10.2.0
+torch~=2.1.1
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 48a2909dc..0e7c44648 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1593,10 +1593,6 @@ struct llama_server_context
                         LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
                     }

-                    LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
-
-                    llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
-
                     slot.cache_tokens = prompt_tokens;

                     if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
@@ -1610,6 +1606,10 @@ struct llama_server_context
                         }
                     }

+                    LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
+
+                    llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
+
                     LOG_VERBOSE("prompt ingested", {
                                                        {"n_past", slot.n_past},
                                                        {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
diff --git a/ggml-backend.c b/ggml-backend.c
index 0764dfebc..532da8eda 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -653,6 +653,9 @@ struct ggml_backend_cpu_context {
     int n_threads;
     void * work_data;
     size_t work_size;
+
+    ggml_abort_callback abort_callback;
+    void *              abort_callback_data;
 };

 GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
@@ -691,6 +694,9 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
         cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
     }

+    cpu_plan->cplan.abort_callback      = cpu_ctx->abort_callback;
+    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
     return cpu_plan;
 }

@@ -721,9 +727,11 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
         cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
         cpu_ctx->work_size = cplan.work_size;
     }
-
     cplan.work_data = cpu_ctx->work_data;

+    cplan.abort_callback      = cpu_ctx->abort_callback;
+    cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+
     ggml_graph_compute(cgraph, &cplan);
     return true;
 }
@@ -759,9 +767,11 @@ static struct ggml_backend_i cpu_backend_i = {
 ggml_backend_t ggml_backend_cpu_init(void) {
     struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));

-    ctx->n_threads = GGML_DEFAULT_N_THREADS;
-    ctx->work_data = NULL;
-    ctx->work_size = 0;
+    ctx->n_threads           = GGML_DEFAULT_N_THREADS;
+    ctx->work_data           = NULL;
+    ctx->work_size           = 0;
+    ctx->abort_callback      = NULL;
+    ctx->abort_callback_data = NULL;

     ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));

@@ -783,6 +793,14 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
     ctx->n_threads = n_threads;
 }

+void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+    ctx->abort_callback      = abort_callback;
+    ctx->abort_callback_data = abort_callback_data;
+}
+
 GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
     return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
 }
diff --git a/ggml-backend.h b/ggml-backend.h
index 8b8160fcf..282b3a9b7 100644
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -83,8 +83,9 @@ extern "C" {

     GGML_API ggml_backend_t ggml_backend_cpu_init(void);

-    GGML_API GGML_CALL bool ggml_backend_is_cpu      (ggml_backend_t backend);
-    GGML_API           void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
+    GGML_API GGML_CALL bool ggml_backend_is_cpu                (ggml_backend_t backend);
+    GGML_API           void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
+    GGML_API           void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);

     // Create a backend buffer from an existing pointer
     GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
diff --git a/ggml-metal.m b/ggml-metal.m
index 82dd8af36..50e8661f8 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -687,6 +687,7 @@ static bool ggml_metal_graph_compute(
         struct ggml_metal_context * ctx,
                struct ggml_cgraph * gf) {

+    @autoreleasepool {
     MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
     edesc.dispatchType = MTLDispatchTypeSerial;

@@ -2272,6 +2273,7 @@ static bool ggml_metal_graph_compute(
         [[MTLCaptureManager sharedCaptureManager] stopCapture];
     }

+    }
     return true;
 }

diff --git a/ggml-quants.c b/ggml-quants.c
index 62fc366ff..a91593c5e 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -270,6 +270,17 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)

 #if defined(__ARM_NEON)
+
+#ifdef _MSC_VER
+
+#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
+
+#else
+
+#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
+
+#endif
+
 #if !defined(__aarch64__)

 // 64-bit compatibility
@@ -8700,10 +8711,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * res
         for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
             q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
             memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
-            const uint32x4_t aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]};
-            const uint32x4_t aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]};
-            const uint32x4_t aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]};
-            const uint32x4_t aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]};
+            const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]);
+            const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]);
+            const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]);
+            const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]);
             q3 += 16;
             q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127))));
             q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
diff --git a/ggml.c b/ggml.c
index 2afdd9df4..4b1f2d63b 100644
--- a/ggml.c
+++ b/ggml.c
@@ -16649,7 +16649,7 @@ struct ggml_compute_state_shared {
     atomic_int node_n;    // active graph node
     atomic_int node_task; // active graph node task phase

-    bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
+    ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
     void * abort_callback_data;
 };

diff --git a/ggml.h b/ggml.h
index 030515e16..d59668f75 100644
--- a/ggml.h
+++ b/ggml.h
@@ -574,6 +574,11 @@ extern "C" {

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

+    // Abort callback
+    // If not NULL, called before ggml computation
+    // If it returns true, the computation is aborted
+    typedef bool (*ggml_abort_callback)(void * data);
+
     // the compute plan that needs to be prepared for ggml_graph_compute()
     // since https://github.com/ggerganov/ggml/issues/287
     struct ggml_cplan {
@@ -583,8 +588,8 @@ extern "C" {
         int n_threads;

         // abort ggml_graph_compute when true
-        bool (*abort_callback)(void * data);
-        void * abort_callback_data;
+        ggml_abort_callback abort_callback;
+        void *              abort_callback_data;
     };

     enum ggml_cgraph_eval_order {
diff --git a/ggml_vk_generate_shaders.py b/ggml_vk_generate_shaders.py
index 4abb0383f..b2e86e182 100644
--- a/ggml_vk_generate_shaders.py
+++ b/ggml_vk_generate_shaders.py
@@ -2067,6 +2067,8 @@ type_names = {

 K_QUANTS_PER_ITERATION = 2

+ASYNCIO_CONCURRENCY = 64
+
 output_dir = gettempdir()

 lock = asyncio.Lock()
@@ -2291,7 +2293,14 @@ async def main():
     tasks.append(string_to_spv("rope_neox_f32", rope_neox_src, {"A_TYPE": "float", "D_TYPE": "float"}))
     tasks.append(string_to_spv("rope_neox_f16", rope_neox_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))

-    await asyncio.gather(*tasks)
+    # Helper to decorate tasks with semaphore acquisition.
+    async def withSemaphore(sem, task):
+        async with sem:
+            return await task
+
+    # Run tasks concurrently guarded by a concurrency limit.
+    sem = asyncio.Semaphore(ASYNCIO_CONCURRENCY)
+    await asyncio.gather(*(withSemaphore(sem, task) for task in tasks))

     with open("ggml-vulkan-shaders.hpp", "w") as f:
         f.write("#include \n\n")
diff --git a/llama.cpp b/llama.cpp
index c99e091e2..e500129fa 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -7360,7 +7360,9 @@ static int llama_decode_internal(
     // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
     // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
     // with the BLAS calls. need a better solution
-    if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+    // MoE Special Case: This logic applies when hparams.n_expert == 0, i.e. the model is NOT an MoE model. When an MoE is
+    // being processed then Accelerate/BLAS will not be involved, so capping would limit performance.
+    if (n_tokens >= 32 && hparams.n_expert == 0 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
         n_threads = std::min(4, n_threads);
     }

diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh
index 6b2514a11..2c391e641 100755
--- a/scripts/sync-ggml-am.sh
+++ b/scripts/sync-ggml-am.sh
@@ -97,6 +97,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
     # src/ggml-cuda.cu -> ggml-cuda.cu
     # src/ggml-cuda.h -> ggml-cuda.h
     # src/ggml-impl.h -> ggml-impl.h
+    # src/ggml-kompute.cpp -> ggml-kompute.cpp
+    # src/ggml-kompute.h -> ggml-kompute.h
     # src/ggml-metal.h -> ggml-metal.h
     # src/ggml-metal.m -> ggml-metal.m
     # src/ggml-mpi.h -> ggml-mpi.h
@@ -105,6 +107,10 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
     # src/ggml-opencl.h -> ggml-opencl.h
     # src/ggml-quants.c -> ggml-quants.c
     # src/ggml-quants.h -> ggml-quants.h
+    # src/ggml-sycl.cpp -> ggml-sycl.cpp
+    # src/ggml-sycl.h -> ggml-sycl.h
+    # src/ggml-vulkan.cpp -> ggml-vulkan.cpp
+    # src/ggml-vulkan.h -> ggml-vulkan.h
     # include/ggml/ggml.h -> ggml.h
     # include/ggml/ggml-alloc.h -> ggml-alloc.h
     # include/ggml/ggml-backend.h -> ggml-backend.h
@@ -123,6 +129,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
         -e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \
         -e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \
         -e 's/src\/ggml-impl\.h/ggml-impl.h/g' \
+        -e 's/src\/ggml-kompute\.cpp/ggml-kompute.cpp/g' \
+        -e 's/src\/ggml-kompute\.h/ggml-kompute.h/g' \
         -e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
         -e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
         -e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \
@@ -131,6 +139,10 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
         -e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
         -e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
         -e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
+        -e 's/src\/ggml-sycl\.cpp/ggml-sycl.cpp/g' \
+        -e 's/src\/ggml-sycl\.h/ggml-sycl.h/g' \
+        -e 's/src\/ggml-vulkan\.cpp/ggml-vulkan.cpp/g' \
+        -e 's/src\/ggml-vulkan\.h/ggml-vulkan.h/g' \
         -e 's/include\/ggml\/ggml\.h/ggml.h/g' \
         -e 's/include\/ggml\/ggml-alloc\.h/ggml-alloc.h/g' \
         -e 's/include\/ggml\/ggml-backend\.h/ggml-backend.h/g' \
diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index 7b6c17915..6ae75bc31 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-475cbad5c1c834e31e26a2283bc1413181644360
+2c7cf49810d523b9632da393a9e8270b60bf3b24
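
For reference, a minimal usage sketch of the abort-callback API introduced above (the `ggml_abort_callback` typedef in ggml.h and `ggml_backend_cpu_set_abort_callback` in ggml-backend.h / ggml-backend.c). This snippet is not part of the patch; the flag and callback names are illustrative only, and only the declarations visible in the diff are assumed:

```c
// Sketch: register an abort callback on the CPU backend so a long-running
// graph computation can be interrupted. Names like my_abort_callback and
// stop_requested are hypothetical.
#include <stdbool.h>
#include "ggml-backend.h"

static bool my_abort_callback(void * data) {
    // Per the comment added to ggml.h: called before ggml computation;
    // returning true aborts the computation.
    const volatile bool * stop = (const volatile bool *) data;
    return *stop;
}

int main(void) {
    static volatile bool stop_requested = false; // would be flipped from another thread

    ggml_backend_t backend = ggml_backend_cpu_init();
    ggml_backend_cpu_set_abort_callback(backend, my_abort_callback, (void *) &stop_requested);

    // ... build a ggml graph and run it with ggml_backend_graph_compute(backend, graph);
    //     the callback is forwarded into the cplan, so setting stop_requested aborts it ...

    ggml_backend_free(backend);
    return 0;
}
```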