From 1d20e53c40c3cc848ba2b95f5bf7c075eeec8b19 Mon Sep 17 00:00:00 2001 From: Patrick Peng Date: Thu, 6 Feb 2025 09:29:13 -0500 Subject: [PATCH 01/12] rpc: fix known RCE in rpc-server (ggml/1103) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add bounds checking in `rpc_server::copy_tensor` to prevent out-of-bounds writes + Check if `(uint8_t *)dst->data + ggml_nbytes(src)` remains within the destination buffer’s allocated region. --- ggml/src/ggml-rpc/ggml-rpc.cpp | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index 3d0c46578..97873acc7 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -1045,7 +1045,28 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co ggml_free(ctx); return false; } - GGML_PRINT_DEBUG("[%s] src->buffer: %p, dst->buffer: %p\n", __func__, (void*)src->buffer, (void*)dst->buffer); + + uint64_t src_size = (uint64_t) ggml_nbytes(src); + uint64_t dst_data = (uint64_t) dst->data; + uint64_t dst_base = (uint64_t) ggml_backend_buffer_get_base(dst->buffer); + uint64_t dst_buf_sz = (uint64_t) ggml_backend_buffer_get_size(dst->buffer); + + if (dst_data + src_size > dst_base + dst_buf_sz) { + GGML_PRINT_DEBUG("[%s] out-of-bounds write in rpc_server::copy_tensor:\n" + " write range : [0x%" PRIx64 ", 0x%" PRIx64 "]\n" + " buffer base: [0x%" PRIx64 ", 0x%" PRIx64 "]\n", + __func__, + dst_data, + dst_data + src_size, + dst_base, + dst_base + dst_buf_sz); + ggml_free(ctx); + return false; + } + + GGML_PRINT_DEBUG("[%s] src->buffer: %p, dst->buffer: %p\n", + __func__, (void*) src->buffer, (void*) dst->buffer); + response.result = ggml_backend_buffer_copy_tensor(src, dst); ggml_free(ctx); return true; From 8a59053f63fffc24e730cd3ea067760abfe4a919 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Feb 2025 21:23:03 +0200 Subject: [PATCH 02/12] 
sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 26a105f64..255109ab7 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -694244a6e40dc255f6bb4376fb17431c06633e6c +08b538031f7f944e84f472483ef5d26bf5190ead From 855cd0734aca26c86cc23d94aefd34f934464ac9 Mon Sep 17 00:00:00 2001 From: tv1wnd <55383215+tv1wnd@users.noreply.github.com> Date: Thu, 6 Feb 2025 22:48:51 +0100 Subject: [PATCH 03/12] llama : fix old glm4 models (#11670) --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index aae3c69b5..3d5a928a8 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -7217,7 +7217,7 @@ struct llm_build_context { struct ggml_tensor * Qcur = nullptr; struct ggml_tensor * Kcur = nullptr; struct ggml_tensor * Vcur = nullptr; - if (model.type == LLM_TYPE_1_5B || model.type == LLM_TYPE_4B || model.type == LLM_TYPE_9B) { + if (model.layers[il].wqkv == nullptr) { Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); From 225bbbfa39930cda38a2e5d1f3e5b38226732009 Mon Sep 17 00:00:00 2001 From: Jinyang He Date: Fri, 7 Feb 2025 15:38:31 +0800 Subject: [PATCH 04/12] ggml : optimize and build warning fix for LoongArch (#11709) * ggml : optimize convert f32<->f16 for loongarch_asx * ggml : optimize loongarch_asx extend i16,i8,u8 to i32,i16 * ggml : Fix warnings when run cpu CI locally on LoongArch --- ggml/src/ggml-cpu/ggml-cpu-impl.h | 18 +++++--------- ggml/src/ggml-cpu/ggml-cpu-quants.c | 37 ++++++----------------------- ggml/src/ggml-cpu/ggml-cpu.c | 24 +++++++------------ 3 files changed, 22 insertions(+), 57 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h index d71076ad1..9ddd972a5 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-impl.h +++ 
b/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -360,21 +360,15 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) #endif #if defined(__loongarch_asx) - -typedef union { - int32_t i; - float f; -} ft_union; - /* float type data load instructions */ -static __m128 __lsx_vreplfr2vr_s(float val) { - ft_union fi_tmpval = {.f = val}; - return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i); +static __m128 __lsx_vreplfr2vr_s(const float val) { + v4f32 res = {val, val, val, val}; + return (__m128)res; } -static __m256 __lasx_xvreplfr2vr_s(float val) { - ft_union fi_tmpval = {.f = val}; - return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i); +static __m256 __lasx_xvreplfr2vr_s(const float val) { + v8f32 res = {val, val, val, val, val, val, val, val}; + return (__m256)res; } #endif diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c index 72ec58cee..27ec14935 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-quants.c +++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c @@ -501,30 +501,15 @@ static __m256i lasx_shuffle_b(__m256i a, __m256i b) { } static __m256i lasx_extu8_16(__m128i a) { - __m128i zero = __lsx_vldi(0); - __m128i vlo = __lsx_vilvl_b(zero, a); - __m128i vhi = __lsx_vilvh_b(zero, a); - return lasx_set_q(vhi, vlo); + return __lasx_vext2xv_hu_bu(____m256i(a)); } static __m256i lasx_ext8_16(__m128i a) { - __m128i sign = __lsx_vslti_b(a, 0); - __m128i vlo = __lsx_vilvl_b(sign, a); - __m128i vhi = __lsx_vilvh_b(sign, a); - return lasx_set_q(vhi, vlo); + return __lasx_vext2xv_h_b(____m256i(a)); } static __m256i lasx_ext16_32(__m128i a) { - __m256i tmp1; - tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 0), 0); - tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 1), 1); - tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 2), 2); - tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 3), 3); - tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 4), 4); - tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 5), 5); - tmp1 
= __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 6), 6); - tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 7), 7); - return tmp1; + return __lasx_vext2xv_w_h(____m256i(a)); } static __m128i lasx_extracti128( __m256i a, int pos) { @@ -592,12 +577,10 @@ static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) { // horizontally add 8 floats static inline float hsum_float_8(const __m256 x) { __m128 res = lasx_extractf128(x, 1); - ft_union tmp; res = __lsx_vfadd_s(res, lasx_extractf128(x, 0)); res = __lsx_vfadd_s(res, (__m128)__lsx_vpickod_d((__m128i)res, (__m128i)res)); res = __lsx_vfadd_s(res, (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w(res, 1), 0)); - tmp.i = __lsx_vpickve2gr_w(res, 0); - return tmp.f; + return ((v4f32)res)[0]; } // horizontally add 8 int32_t @@ -939,7 +922,6 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) #elif defined(__loongarch_asx) for (int i = 0; i < nb; i++) { - ft_union fi; __m256 v0 = (__m256)__lasx_xvld( x , 0); __m256 v1 = (__m256)__lasx_xvld( x , 32); __m256 v2 = (__m256)__lasx_xvld( x , 64); @@ -957,8 +939,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) ); __m128 tmp = max4; max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vinsgr2vr_w(tmp, __lsx_vpickve2gr_w( max4, 1 ), 0 )); - fi.i = __lsx_vpickve2gr_w( (__m128i)max4, 0 ); - const float max_scalar = fi.f; + const float max_scalar = ((v4f32)max4)[0]; // Quantize these floats const float d = max_scalar / 127.f; @@ -1263,7 +1244,6 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) #elif defined(__loongarch_asx) for (int i = 0; i < nb; i++) { - ft_union ft; __m256 v0 = (__m256)__lasx_xvld( x , 0 ); __m256 v1 = (__m256)__lasx_xvld( x , 32 ); __m256 v2 = (__m256)__lasx_xvld( x , 64 ); @@ -1281,8 +1261,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, 
int64_t k) max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) ); __m128 tmp = max4; max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 )); - ft.i = __lsx_vpickve2gr_w( (__m128i)max4, 0 ); - const float max_scalar = ft.f; + const float max_scalar = ((v4f32)max4)[0]; // Quantize these floats const float d = max_scalar / 127.f; @@ -6154,9 +6133,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1); - ft_union fi; - fi.i = __lsx_vpickve2gr_w(acc_m, 0); - *s = hsum_float_8(acc) + fi.f ; + *s = hsum_float_8(acc) + ((v4f32)acc_m)[0]; #else const uint8_t * scales = (const uint8_t*)&utmp[0]; diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index e809f05d2..59efaeb71 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1078,29 +1078,23 @@ do { \ #define GGML_F16_STEP 32 #define GGML_F16_EPR 8 -// F16 arithmetic is not supported by AVX, so we use F32 instead +// F16 arithmetic is not supported by LASX, so we use F32 instead #define GGML_F32Cx8 __m256 #define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0) #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x)) static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) { - float tmp[8]; - - for (int i = 0; i < 8; i++) { - tmp[i] = GGML_FP16_TO_FP32(x[i]); - } - - return (__m256)__lasx_xvld(tmp, 0); + __m256i a; + memcpy(&a, x, sizeof(ggml_fp16_t) * 8); + a = __lasx_xvpermi_d(a, 0 | (1 << 4)); + return __lasx_xvfcvtl_s_h(a); } + static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) { - float arr[8]; - - __lasx_xvst(y, arr, 0); - - for (int i = 0; i < 8; i++) { - x[i] = GGML_FP32_TO_FP16(arr[i]); - } + __m256i a = __lasx_xvfcvt_h_s(y, y); + a = __lasx_xvpermi_d(a, 0 | (2 << 2)); + memcpy(x, &a, sizeof(ggml_fp16_t) * 8); } #define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x) #define GGML_F32Cx8_STORE(x, y) 
__lasx_f32cx8_store(x, y) From b7552cfcbc7defccd8bdefd0a7b9c47d145ed3d7 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Fri, 7 Feb 2025 09:15:22 +0100 Subject: [PATCH 05/12] common : add default embeddings presets (#11677) * common : add default embeddings presets This commit adds default embeddings presets for the following models: - bge-small-en-v1.5 - e5-small-v2 - gte-small These can be used with llama-embedding and llama-server. For example, with llama-embedding: ```console ./build/bin/llama-embedding --embd-gte-small-default -p "Hello, how are you?" ``` And with llama-server: ```console ./build/bin/llama-server --embd-gte-small-default ``` And the embeddings endpoint can then be called with a POST request: ```console curl --request POST \ --url http://localhost:8080/embeddings \ --header "Content-Type: application/json" \ --data '{"input": "Hello, how are you?"}' ``` I'm not sure if these are the most common embedding models but hopefully this can be a good starting point for discussion and further improvements. 
Refs: https://github.com/ggerganov/llama.cpp/issues/10932 --- common/arg.cpp | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/common/arg.cpp b/common/arg.cpp index 76b898881..152f671ab 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2324,5 +2324,47 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_TTS})); + add_opt(common_arg( + {"--embd-bge-small-en-default"}, + string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"), + [](common_params & params) { + params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF"; + params.hf_file = "bge-small-en-v1.5-q8_0.gguf"; + params.pooling_type = LLAMA_POOLING_TYPE_NONE; + params.embd_normalize = 2; + params.n_ctx = 512; + params.verbose_prompt = true; + params.embedding = true; + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER})); + + add_opt(common_arg( + {"--embd-e5-small-en-default"}, + string_format("use default e5-small-v2 model (note: can download weights from the internet)"), + [](common_params & params) { + params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF"; + params.hf_file = "e5-small-v2-q8_0.gguf"; + params.pooling_type = LLAMA_POOLING_TYPE_NONE; + params.embd_normalize = 2; + params.n_ctx = 512; + params.verbose_prompt = true; + params.embedding = true; + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER})); + + add_opt(common_arg( + {"--embd-gte-small-default"}, + string_format("use default gte-small model (note: can download weights from the internet)"), + [](common_params & params) { + params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF"; + params.hf_file = "gte-small-q8_0.gguf"; + params.pooling_type = LLAMA_POOLING_TYPE_NONE; + params.embd_normalize = 2; + params.n_ctx = 512; + params.verbose_prompt = true; + params.embedding = true; + } + ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER})); + return ctx_arg; } From 
ec3bc8270bc67b58955748d40a3e558a05b2d8f2 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Fri, 7 Feb 2025 14:57:53 +0530 Subject: [PATCH 06/12] SYCL: remove XMX info from print devices (#11712) --- ggml/src/ggml-sycl/ggml-sycl.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index aab34a752..3d24d2165 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -103,11 +103,10 @@ void print_device_detail(int id, sycl::device &device, std::string device_type) name = std::regex_replace(name, std::regex("\\(TM\\)"), ""); auto global_mem_size = prop.get_global_mem_size()/1000000; - std::string xmx = gpu_has_xmx(device) ? "yes" : "no"; - GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|%14s|\n", id, device_type.c_str(), + GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(), name.c_str(), version.c_str(), prop.get_max_compute_units(), prop.get_max_work_group_size(), prop.get_max_sub_group_size(), - global_mem_size, device.get_info().c_str(), xmx.c_str()); + global_mem_size, device.get_info().c_str()); } void ggml_backend_sycl_print_sycl_devices() { @@ -118,16 +117,16 @@ void ggml_backend_sycl_print_sycl_devices() { GGML_LOG_INFO( "| | | | " - " |Max | |Max |Global | | XMX |\n"); + " |Max | |Max |Global | |\n"); GGML_LOG_INFO( "| | | | " - " |compute|Max work|sub |mem | | or |\n"); + " |compute|Max work|sub |mem | |\n"); GGML_LOG_INFO( "|ID| Device Type| " - "Name|Version|units |group |group|size | Driver version| Tensor Cores |\n"); + "Name|Version|units |group |group|size | Driver version|\n"); GGML_LOG_INFO( "|--|-------------------|---------------------------------------|------" - "-|-------|--------|-----|-------|---------------------|--------------|\n"); + "-|-------|--------|-----|-------|---------------------|\n"); for (int id = 0; id < device_count; ++id) { sycl::device device = 
dpct::dev_mgr::instance().get_device(id); From 7ee953a64a40c09438b2064539becdbc577cefd0 Mon Sep 17 00:00:00 2001 From: Christian Fillion Date: Fri, 7 Feb 2025 04:33:27 -0500 Subject: [PATCH 07/12] llama : add llama_sampler_init for safe usage of llama_sampler_free (#11727) The C API in llama.h claims users can implement `llama_sampler_i` to create custom `llama_sampler`. The sampler chain takes ownership and calls `llama_sampler_free` on them. However, `llama_sampler_free` is hard-coded to use `delete`. This is undefined behavior if the object wasn't also allocated via `new` from libllama's C++ runtime. Callers in C and C-compatible languages do not use C++'s `new` operator. C++ callers may not be sharing the same heap as libllama. --- common/llguidance.cpp | 6 +- include/llama.h | 5 +- src/llama-sampling.cpp | 121 ++++++++++++++++++++++------------------- 3 files changed, 70 insertions(+), 62 deletions(-) diff --git a/common/llguidance.cpp b/common/llguidance.cpp index 7aa8ddd80..2feeb93c8 100644 --- a/common/llguidance.cpp +++ b/common/llguidance.cpp @@ -254,10 +254,10 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g }; } - return new llama_sampler{ + return llama_sampler_init( /* .iface = */ &llama_sampler_llg_i, - /* .ctx = */ ctx, - }; + /* .ctx = */ ctx + ); } #else diff --git a/include/llama.h b/include/llama.h index 61907ed40..3784f7d39 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1114,11 +1114,12 @@ extern "C" { }; struct llama_sampler { - struct llama_sampler_i * iface; - llama_sampler_context_t ctx; + const struct llama_sampler_i * iface; + llama_sampler_context_t ctx; }; // mirror of llama_sampler_i: + LLAMA_API struct llama_sampler * llama_sampler_init (const struct llama_sampler_i * iface, llama_sampler_context_t ctx); LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl); LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token); LLAMA_API void 
llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p); diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 26974f539..990b61297 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -316,6 +316,13 @@ static uint32_t get_rng_seed(uint32_t seed) { // llama_sampler API +struct llama_sampler * llama_sampler_init(const struct llama_sampler_i * iface, llama_sampler_context_t ctx) { + return new llama_sampler { + /* .iface = */ iface, + /* .ctx = */ ctx, + }; +} + const char * llama_sampler_name(const struct llama_sampler * smpl) { if (!smpl->iface) { return "(null)"; @@ -347,10 +354,10 @@ struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) { } if (smpl->ctx == nullptr) { - return new llama_sampler { + return llama_sampler_init( /* .iface = */ smpl->iface, - /* .ctx = */ nullptr, - }; + /* .ctx = */ nullptr + ); } GGML_ABORT("the sampler does not support cloning"); @@ -472,15 +479,15 @@ static struct llama_sampler_i llama_sampler_chain_i = { }; struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params) { - return new llama_sampler { + return llama_sampler_init( /* .iface = */ &llama_sampler_chain_i, /* .ctx = */ new llama_sampler_chain { /* .params = */ params, /* .samplers = */ {}, /* .t_sample_us = */ 0, /* .n_sample = */ 0, - }, - }; + } + ); } void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) { @@ -546,10 +553,10 @@ static struct llama_sampler_i llama_sampler_greedy_i = { }; struct llama_sampler * llama_sampler_init_greedy() { - return new llama_sampler { + return llama_sampler_init( /* .iface = */ &llama_sampler_greedy_i, - /* .ctx = */ nullptr, - }; + /* .ctx = */ nullptr + ); } // dist @@ -608,14 +615,14 @@ static struct llama_sampler_i llama_sampler_dist_i = { struct llama_sampler * llama_sampler_init_dist(uint32_t seed) { auto seed_cur = get_rng_seed(seed); - return new llama_sampler { + return 
llama_sampler_init( /* .iface = */ &llama_sampler_dist_i, /* .ctx = */ new llama_sampler_dist { /* .seed = */ seed, /* .seed_cur = */ seed_cur, /* .rng = */ std::mt19937(seed_cur), - }, - }; + } + ); } // softmax @@ -638,10 +645,10 @@ static struct llama_sampler_i llama_sampler_softmax_i = { }; struct llama_sampler * llama_sampler_init_softmax() { - return new llama_sampler { + return llama_sampler_init( /* .iface = */ &llama_sampler_softmax_i, - /* .ctx = */ nullptr, - }; + /* .ctx = */ nullptr + ); } // top-k @@ -678,12 +685,12 @@ static struct llama_sampler_i llama_sampler_top_k_i = { }; struct llama_sampler * llama_sampler_init_top_k(int32_t k) { - return new llama_sampler { + return llama_sampler_init( /* .iface = */ &llama_sampler_top_k_i, /* .ctx = */ new llama_sampler_top_k { /* .k = */ k, - }, - }; + } + ); } // top-p @@ -744,13 +751,13 @@ static struct llama_sampler_i llama_sampler_top_p_i = { }; struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) { - return new llama_sampler { + return llama_sampler_init( /* .iface = */ &llama_sampler_top_p_i, /* .ctx = */ new llama_sampler_top_p { /* .p = */ p, /* .min_keep = */ min_keep, - }, - }; + } + ); } // min-p @@ -840,13 +847,13 @@ static struct llama_sampler_i llama_sampler_min_p_i = { }; struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) { - return new llama_sampler { + return llama_sampler_init( /* .iface = */ &llama_sampler_min_p_i, /* .ctx = */ new llama_sampler_min_p { /* .p = */ p, /* .min_keep = */ min_keep, - }, - }; + } + ); } // typical @@ -939,13 +946,13 @@ static struct llama_sampler_i llama_sampler_typical_i = { }; struct llama_sampler * llama_sampler_init_typical(float p, size_t min_keep) { - return new llama_sampler { + return llama_sampler_init( /* .iface = */ &llama_sampler_typical_i, /* .ctx = */ new llama_sampler_typical { /* .p = */ p, /* .min_keep = */ min_keep, - }, - }; + } + ); } // temp @@ -983,12 +990,12 @@ static struct llama_sampler_i 
llama_sampler_temp_i = { }; struct llama_sampler * llama_sampler_init_temp(float temp) { - return new llama_sampler { + return llama_sampler_init( /* .iface = */ &llama_sampler_temp_i, /* .ctx = */ new llama_sampler_temp { /*.temp = */ temp, - }, - }; + } + ); } // temp-ext @@ -1093,14 +1100,14 @@ static struct llama_sampler_i llama_sampler_temp_ext_i = { }; struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) { - return new llama_sampler { + return llama_sampler_init( /* .iface = */ &llama_sampler_temp_ext_i, /* .ctx = */ new llama_sampler_temp_ext { /* .temp = */ temp, /* .delta = */ delta, /* .exponent = */ exponent, - }, - }; + } + ); } // xtc @@ -1185,7 +1192,7 @@ static struct llama_sampler_i llama_sampler_xtc_i = { struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) { auto seed_cur = get_rng_seed(seed); - return new llama_sampler { + return llama_sampler_init( /* .iface = */ &llama_sampler_xtc_i, /* .ctx = */ new llama_sampler_xtc { /* .probability = */ p, @@ -1194,8 +1201,8 @@ struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, /* .seed = */ seed, /* .seed_cur = */ seed_cur, /* .rng = */ std::mt19937(seed_cur), - }, - }; + } + ); } // mirostat @@ -1292,7 +1299,7 @@ static struct llama_sampler_i llama_sampler_mirostat_i = { struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) { auto seed_cur = get_rng_seed(seed); - return new llama_sampler { + return llama_sampler_init( /* .iface = */ &llama_sampler_mirostat_i, /* .ctx = */ new llama_sampler_mirostat { /* .n_vocab = */ n_vocab, @@ -1303,8 +1310,8 @@ struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t see /* .m = */ m, /* .mu = */ 2.0f*tau, /* .rng = */ std::mt19937(seed_cur), - }, - }; + } + ); } // mirostat v2 @@ -1391,7 +1398,7 @@ static struct llama_sampler_i llama_sampler_mirostat_v2_i = { struct 
llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) { auto seed_cur = get_rng_seed(seed); - return new llama_sampler { + return llama_sampler_init( /* .iface = */ &llama_sampler_mirostat_v2_i, /* .ctx = */ new llama_sampler_mirostat_v2 { /* .seed = */ seed, @@ -1400,8 +1407,8 @@ struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, /* .eta = */ eta, /* .mu = */ 2.0f*tau, /* .rng = */ std::mt19937(seed_cur), - }, - }; + } + ); } // grammar @@ -1528,10 +1535,10 @@ static struct llama_sampler * llama_sampler_init_grammar_impl( }; } - return new llama_sampler { + return llama_sampler_init( /* .iface = */ &llama_sampler_grammar_i, - /* .ctx = */ ctx, - }; + /* .ctx = */ ctx + ); } struct llama_sampler * llama_sampler_init_grammar( @@ -1678,7 +1685,7 @@ struct llama_sampler * llama_sampler_init_penalties( float penalty_present) { penalty_last_n = std::max(penalty_last_n, 0); - return new llama_sampler { + return llama_sampler_init( /* .iface = */ &llama_sampler_penalties_i, /* .ctx = */ new llama_sampler_penalties { /* .penalty_last_n = */ penalty_last_n, @@ -1687,8 +1694,8 @@ struct llama_sampler * llama_sampler_init_penalties( /* .penalty_present = */ penalty_present, /* .prev = */ ring_buffer(penalty_last_n), /* .token_count = */ {}, - }, - }; + } + ); } // DRY @@ -2041,7 +2048,7 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, } } - return new llama_sampler { + return llama_sampler_init( /* .iface = */ &llama_sampler_dry_i, /* .ctx = */ new llama_sampler_dry { /* .total_context_size = */ context_size, @@ -2053,8 +2060,8 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, /* .dry_repeat_count = */ dry_enabled ? std::vector(effective_dry_penalty_last_n, 0) : std::vector{}, /* .dry_max_token_repeat = */ {}, /* .last_tokens = */ dry_enabled ? 
ring_buffer(effective_dry_penalty_last_n) : ring_buffer(0), - }, - }; + } + ); } // wrapper for test-sampling.cpp @@ -2155,14 +2162,14 @@ struct llama_sampler * llama_sampler_init_logit_bias( int32_t n_vocab, int32_t n_logit_bias, const llama_logit_bias * logit_bias) { - return new llama_sampler { + return llama_sampler_init( /* .iface = */ &llama_sampler_logit_bias_i, /* .ctx = */ new llama_sampler_logit_bias { /* .n_vocab = */ n_vocab, /* .logit_bias = */ std::vector(logit_bias, logit_bias + n_logit_bias), /* .to_search = */ {}, - }, - }; + } + ); } // infill @@ -2377,14 +2384,14 @@ static struct llama_sampler_i llama_sampler_infill_i = { }; struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab) { - return new llama_sampler { + return llama_sampler_init( /* .iface = */ &llama_sampler_infill_i, /* .ctx = */ new llama_sampler_infill { /* .vocab = */ vocab, /* .buf0 = */ std::vector(512), /* .buf1 = */ std::vector(512), - }, - }; + } + ); } // utils From c026ba3c23765a648ca27c7a15ecf179f8e27f26 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Fri, 7 Feb 2025 04:26:03 -0600 Subject: [PATCH 08/12] vulkan: print shared memory size (#11719) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 1c99ebe2e..4c962fde9 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -2780,8 +2780,9 @@ static void ggml_vk_print_gpu_info(size_t idx) { std::string matrix_cores = coopmat2_support ? "NV_coopmat2" : coopmat_support ? 
"KHR_coopmat" : "none"; std::string device_name = props2.properties.deviceName.data(); - GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu | matrix cores: %s\n", - idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size, matrix_cores.c_str()); + GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu | shared memory: %d | matrix cores: %s\n", + idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size, + props2.properties.limits.maxComputeSharedMemorySize, matrix_cores.c_str()); if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) { GGML_LOG_DEBUG("ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want.\n"); From 333820d7491cd31c707a340ff23b984a84e40154 Mon Sep 17 00:00:00 2001 From: magicse Date: Fri, 7 Feb 2025 15:48:47 +0200 Subject: [PATCH 09/12] llama : fix progress dots (#11730) * Update llama.cpp For display progress dots in terminal. Without this it didn't display dots progress during loading model from file. 
* Update llama.cpp removed trailing spaces --- src/llama.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 3d5a928a8..c3da3c43b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9428,7 +9428,6 @@ static struct llama_model * llama_model_load_from_file_impl( struct llama_model_params params) { ggml_time_init(); - llama_model * model = new llama_model(params); unsigned cur_percentage = 0; if (params.progress_callback == NULL) { @@ -9447,6 +9446,8 @@ static struct llama_model * llama_model_load_from_file_impl( }; } + llama_model * model = new llama_model(params); + // create list of devices to use with this model if (params.devices) { for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) { From 2d219b389e8c8c40bce547b08c8aa7add60fde1f Mon Sep 17 00:00:00 2001 From: Christian Fillion Date: Fri, 7 Feb 2025 08:55:47 -0500 Subject: [PATCH 10/12] vocab : ignore invalid UTF-8 input in the BPE tokenizer (#11729) Silently insert U+FFFD(s) (Unicode replacement character) instead until the next valid codepoint can be found. This fixes `llama_tokenize` throwing an exception across the C API boundary or libllama's module boundary (the caller's runtime might be incompatible!) Returning a proper error code might be desirable; however, the signature of `llama_tokenize` doesn't allow it, as all return values already have an existing meaning. 
--- src/unicode.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/unicode.cpp b/src/unicode.cpp index 89180da41..a32ae6d08 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -618,7 +618,14 @@ std::vector unicode_cpts_from_utf8(const std::string & utf8) { result.reserve(utf8.size()); size_t offset = 0; while (offset < utf8.size()) { - result.push_back(unicode_cpt_from_utf8(utf8, offset)); + try { + result.push_back(unicode_cpt_from_utf8(utf8, offset)); + } + catch (const std::invalid_argument & /*ex*/) { + // Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize + ++offset; + result.emplace_back(0xFFFD); // replacement character + } } return result; } From ed926d8833df3c29797edbc98dafd9b575aa0729 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 7 Feb 2025 16:05:34 +0200 Subject: [PATCH 11/12] llama : fix defrag logic (#11707) * llama : fix defrag logic ggml-ci * cont : better logic ggml-ci * cont : clamp fragmentation to 0.0 ggml-ci --- src/llama.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index c3da3c43b..3b6a21d81 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -8801,12 +8801,14 @@ static int llama_decode_impl( //llama_synchronize(&lctx); // decide if we need to defrag the kv cache - if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) { - const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f; + if (cparams.causal_attn && cparams.defrag_thold > 0.0f) { + // - do not defrag small contexts (i.e. < 2048 tokens) + // - count the padding towards the number of used tokens + const float fragmentation = kv_self.n >= 2048 ? 
std::max(0.0f, 1.0f - float(kv_self.used + llama_kv_cache_get_padding(cparams))/float(kv_self.n)) : 0.0f; // queue defragmentation for next llama_kv_cache_update if (fragmentation > cparams.defrag_thold) { - //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation); + LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation); llama_kv_cache_defrag(kv_self); } From d2fe216fb2fb7ca8627618c9ea3a2e7886325780 Mon Sep 17 00:00:00 2001 From: Eric Curtin Date: Fri, 7 Feb 2025 14:42:46 +0000 Subject: [PATCH 12/12] Make logging more verbose (#11714) Debugged an issue with a user who was on a read-only filesystem. Signed-off-by: Eric Curtin --- examples/run/run.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 39353ba30..eab60cad1 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -346,7 +346,7 @@ class HttpClient { if (!output_file.empty()) { output_file_partial = output_file + ".partial"; if (!out.open(output_file_partial, "ab")) { - printe("Failed to open file\n"); + printe("Failed to open file for writing\n"); return 1; }