Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	examples/run/run.cpp
#	scripts/sync-ggml.last
Concedo 2025-02-08 01:31:49 +08:00
commit 27b9358baf
12 changed files with 181 additions and 135 deletions

View file

@@ -2325,5 +2325,47 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_TTS}));
+    add_opt(common_arg(
+        {"--embd-bge-small-en-default"},
+        string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
+            params.hf_file = "bge-small-en-v1.5-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--embd-e5-small-en-default"},
+        string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
+            params.hf_file = "e5-small-v2-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--embd-gte-small-default"},
+        string_format("use default gte-small model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
+            params.hf_file = "gte-small-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
     return ctx_arg;
 }
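Each of these flags configures a ready-made embedding setup in one step; for instance, `llama-embedding --embd-bge-small-en-default -p "some text"` should fetch the quantized bge-small model from the ggml-org repo and run it in embedding mode (invocation sketched from the flag definitions above; the exact binary name depends on how the examples are built).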

View file

@@ -254,10 +254,10 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
         };
     }

-    return new llama_sampler{
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_llg_i,
-        /* .ctx = */ ctx,
-    };
+        /* .ctx = */ ctx
+    );
 }
 #else

View file

@@ -360,21 +360,15 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
 #endif

 #if defined(__loongarch_asx)

-typedef union {
-    int32_t i;
-    float f;
-} ft_union;

 /* float type data load instructions */
-static __m128 __lsx_vreplfr2vr_s(float val) {
-    ft_union fi_tmpval = {.f = val};
-    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+static __m128 __lsx_vreplfr2vr_s(const float val) {
+    v4f32 res = {val, val, val, val};
+    return (__m128)res;
 }

-static __m256 __lasx_xvreplfr2vr_s(float val) {
-    ft_union fi_tmpval = {.f = val};
-    return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
+static __m256 __lasx_xvreplfr2vr_s(const float val) {
+    v8f32 res = {val, val, val, val, val, val, val, val};
+    return (__m256)res;
 }
 #endif
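The hunk above replaces union-based type punning with native vector literals. A minimal sketch of the two broadcast styles, assuming a GCC LoongArch toolchain where lsxintrin.h provides the __m128 type and the __lsx_* intrinsics seen in the diff:

    #include <stdint.h>
    #include <lsxintrin.h>

    typedef float v4f32 __attribute__((vector_size(16)));  /* if not already provided */

    /* old style: push the float's bit pattern through an integer broadcast */
    static __m128 broadcast_old(float val) {
        union { int32_t i; float f; } u = { .f = val };
        return (__m128)__lsx_vreplgr2vr_w(u.i);
    }

    /* new style: a vector literal lets the compiler pick the broadcast */
    static __m128 broadcast_new(float val) {
        v4f32 res = { val, val, val, val };
        return (__m128)res;
    }

Both return the same four lanes; the literal form leaves instruction selection to the compiler and lets the ft_union helper be dropped entirely.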

View file

@@ -502,30 +502,15 @@ static __m256i lasx_shuffle_b(__m256i a, __m256i b) {
 }

 static __m256i lasx_extu8_16(__m128i a) {
-    __m128i zero = __lsx_vldi(0);
-    __m128i vlo = __lsx_vilvl_b(zero, a);
-    __m128i vhi = __lsx_vilvh_b(zero, a);
-    return lasx_set_q(vhi, vlo);
+    return __lasx_vext2xv_hu_bu(____m256i(a));
 }

 static __m256i lasx_ext8_16(__m128i a) {
-    __m128i sign = __lsx_vslti_b(a, 0);
-    __m128i vlo = __lsx_vilvl_b(sign, a);
-    __m128i vhi = __lsx_vilvh_b(sign, a);
-    return lasx_set_q(vhi, vlo);
+    return __lasx_vext2xv_h_b(____m256i(a));
 }

 static __m256i lasx_ext16_32(__m128i a) {
-    __m256i tmp1;
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 0), 0);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 1), 1);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 2), 2);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 3), 3);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 4), 4);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 5), 5);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 6), 6);
-    tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 7), 7);
-    return tmp1;
+    return __lasx_vext2xv_w_h(____m256i(a));
 }

 static __m128i lasx_extracti128( __m256i a, int pos) {
@@ -593,12 +578,10 @@ static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {

 // horizontally add 8 floats
 static inline float hsum_float_8(const __m256 x) {
     __m128 res = lasx_extractf128(x, 1);
-    ft_union tmp;
     res = __lsx_vfadd_s(res, lasx_extractf128(x, 0));
     res = __lsx_vfadd_s(res, (__m128)__lsx_vpickod_d((__m128i)res, (__m128i)res));
     res = __lsx_vfadd_s(res, (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w(res, 1), 0));
-    tmp.i = __lsx_vpickve2gr_w(res, 0);
-    return tmp.f;
+    return ((v4f32)res)[0];
 }

 // horizontally add 8 int32_t
@@ -940,7 +923,6 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
 #elif defined(__loongarch_asx)
     for (int i = 0; i < nb; i++) {
-        ft_union fi;
         __m256 v0 = (__m256)__lasx_xvld( x , 0);
         __m256 v1 = (__m256)__lasx_xvld( x , 32);
         __m256 v2 = (__m256)__lasx_xvld( x , 64);
@@ -958,8 +940,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
         max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
         __m128 tmp = max4;
         max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vinsgr2vr_w(tmp, __lsx_vpickve2gr_w( max4, 1 ), 0 ));
-        fi.i = __lsx_vpickve2gr_w( (__m128i)max4, 0 );
-        const float max_scalar = fi.f;
+        const float max_scalar = ((v4f32)max4)[0];

         // Quantize these floats
         const float d = max_scalar / 127.f;
@@ -1264,7 +1245,6 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
 #elif defined(__loongarch_asx)
     for (int i = 0; i < nb; i++) {
-        ft_union ft;
         __m256 v0 = (__m256)__lasx_xvld( x , 0 );
         __m256 v1 = (__m256)__lasx_xvld( x , 32 );
         __m256 v2 = (__m256)__lasx_xvld( x , 64 );
@@ -1282,8 +1262,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
         max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
         __m128 tmp = max4;
         max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 ));
-        ft.i = __lsx_vpickve2gr_w( (__m128i)max4, 0 );
-        const float max_scalar = ft.f;
+        const float max_scalar = ((v4f32)max4)[0];

         // Quantize these floats
         const float d = max_scalar / 127.f;
@@ -6155,9 +6134,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
     acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1);

-    ft_union fi;
-    fi.i = __lsx_vpickve2gr_w(acc_m, 0);
-    *s = hsum_float_8(acc) + fi.f ;
+    *s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
 #else

     const uint8_t * scales = (const uint8_t*)&utmp[0];
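These hunks also retire ft_union for reading a single lane: GCC's vector extension allows subscripting a vector value directly, so ((v4f32)v)[0] replaces the pick-to-GPR-plus-union dance. A standalone illustration, with v4f32 defined locally here (in ggml it comes from the LoongArch headers):

    #include <stdint.h>
    #include <string.h>

    typedef float v4f32 __attribute__((vector_size(16)));

    /* old idiom: extract lane 0 as an int, reinterpret through a union */
    static float lane0_old(v4f32 v) {
        union { int32_t i; float f; } u;
        memcpy(&u.i, &v, sizeof(u.i)); /* stands in for __lsx_vpickve2gr_w(v, 0) */
        return u.f;
    }

    /* new idiom: subscript the vector value directly */
    static float lane0_new(v4f32 v) {
        return v[0];
    }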

View file

@@ -1082,29 +1082,23 @@ do { \
 #define GGML_F16_STEP 32
 #define GGML_F16_EPR  8

-// F16 arithmetic is not supported by AVX, so we use F32 instead
+// F16 arithmetic is not supported by LASX, so we use F32 instead

 #define GGML_F32Cx8          __m256
 #define GGML_F32Cx8_ZERO    (__m256)__lasx_xvldi(0)
 #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))

 static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
-    float tmp[8];
-
-    for (int i = 0; i < 8; i++) {
-        tmp[i] = GGML_FP16_TO_FP32(x[i]);
-    }
-
-    return (__m256)__lasx_xvld(tmp, 0);
+    __m256i a;
+    memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
+    a = __lasx_xvpermi_d(a, 0 | (1 << 4));
+    return __lasx_xvfcvtl_s_h(a);
 }

 static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
-    float arr[8];
-
-    __lasx_xvst(y, arr, 0);
-
-    for (int i = 0; i < 8; i++) {
-        x[i] = GGML_FP32_TO_FP16(arr[i]);
-    }
+    __m256i a = __lasx_xvfcvt_h_s(y, y);
+    a = __lasx_xvpermi_d(a, 0 | (2 << 2));
+    memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
 }

 #define GGML_F32Cx8_LOAD(x)     __lasx_f32cx8_load(x)
 #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
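The old load path converted element-by-element through GGML_FP16_TO_FP32; the new one does all eight conversions in hardware (xvfcvtl.s.h after a doubleword permute). For readers without the ggml macros at hand, a portable sketch of what one scalar fp16-to-fp32 conversion involves; this is a generic IEEE half decoder, not ggml's exact implementation:

    #include <stdint.h>
    #include <string.h>

    static float fp16_to_fp32_ref(uint16_t h) {
        uint32_t sign = (uint32_t)(h & 0x8000) << 16;
        uint32_t exp  = (h >> 10) & 0x1F;
        uint32_t mant = h & 0x03FF;
        uint32_t bits;
        if (exp == 0) {            /* zero / denormal */
            if (mant == 0) {
                bits = sign;
            } else {               /* normalize the denormal */
                exp = 127 - 15 + 1;
                while ((mant & 0x0400) == 0) { mant <<= 1; exp--; }
                mant &= 0x03FF;
                bits = sign | (exp << 23) | (mant << 13);
            }
        } else if (exp == 0x1F) {  /* inf / NaN */
            bits = sign | 0x7F800000 | (mant << 13);
        } else {                   /* normal number: rebias the exponent */
            bits = sign | ((exp - 15 + 127) << 23) | (mant << 13);
        }
        float f;
        memcpy(&f, &bits, sizeof(f));
        return f;
    }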

View file

@@ -1045,7 +1045,28 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co
         ggml_free(ctx);
         return false;
     }
-    GGML_PRINT_DEBUG("[%s] src->buffer: %p, dst->buffer: %p\n", __func__, (void*)src->buffer, (void*)dst->buffer);
+
+    uint64_t src_size   = (uint64_t) ggml_nbytes(src);
+    uint64_t dst_data   = (uint64_t) dst->data;
+    uint64_t dst_base   = (uint64_t) ggml_backend_buffer_get_base(dst->buffer);
+    uint64_t dst_buf_sz = (uint64_t) ggml_backend_buffer_get_size(dst->buffer);
+
+    if (dst_data + src_size > dst_base + dst_buf_sz) {
+        GGML_PRINT_DEBUG("[%s] out-of-bounds write in rpc_server::copy_tensor:\n"
+                         "    write range : [0x%" PRIx64 ", 0x%" PRIx64 "]\n"
+                         "    buffer base: [0x%" PRIx64 ", 0x%" PRIx64 "]\n",
+                         __func__,
+                         dst_data,
+                         dst_data + src_size,
+                         dst_base,
+                         dst_base + dst_buf_sz);
+        ggml_free(ctx);
+        return false;
+    }
+
+    GGML_PRINT_DEBUG("[%s] src->buffer: %p, dst->buffer: %p\n",
+                     __func__, (void*) src->buffer, (void*) dst->buffer);
     response.result = ggml_backend_buffer_copy_tensor(src, dst);
     ggml_free(ctx);
     return true;
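The added check rejects a copy whose destination range would run past the destination buffer, guarding the RPC server against an out-of-bounds write from a misbehaving client. The same pattern as a standalone helper (hypothetical name, not part of the ggml API); like the code above, it assumes dst_data lies at or after the buffer base and that the 64-bit sums do not wrap:

    #include <stdbool.h>
    #include <stdint.h>

    /* true if [data, data + size) fits inside [base, base + buf_sz) */
    static bool write_in_bounds(uint64_t data, uint64_t size,
                                uint64_t base, uint64_t buf_sz) {
        return data + size <= base + buf_sz;
    }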

View file

@@ -103,11 +103,10 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
     name = std::regex_replace(name, std::regex("\\(TM\\)"), "");

     auto global_mem_size = prop.get_global_mem_size()/1000000;
-    std::string xmx = gpu_has_xmx(device) ? "yes" : "no";
-    GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|%14s|\n", id, device_type.c_str(),
+    GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(),
                   name.c_str(), version.c_str(), prop.get_max_compute_units(),
                   prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
-                  global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str(), xmx.c_str());
+                  global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
 }

 void ggml_backend_sycl_print_sycl_devices() {
@@ -118,16 +117,16 @@ void ggml_backend_sycl_print_sycl_devices() {
     GGML_LOG_INFO(
         "| | | | "
-        " |Max | |Max |Global | | XMX |\n");
+        " |Max | |Max |Global | |\n");
     GGML_LOG_INFO(
         "| | | | "
-        " |compute|Max work|sub |mem | | or |\n");
+        " |compute|Max work|sub |mem | |\n");
     GGML_LOG_INFO(
         "|ID| Device Type| "
-        "Name|Version|units |group |group|size | Driver version| Tensor Cores |\n");
+        "Name|Version|units |group |group|size | Driver version|\n");
     GGML_LOG_INFO(
         "|--|-------------------|---------------------------------------|------"
-        "-|-------|--------|-----|-------|---------------------|--------------|\n");
+        "-|-------|--------|-----|-------|---------------------|\n");
     for (int id = 0; id < device_count; ++id) {
         sycl::device device = dpct::dev_mgr::instance().get_device(id);

View file

@@ -2788,8 +2788,9 @@ static void ggml_vk_print_gpu_info(size_t idx) {
     std::string matrix_cores = coopmat2_support ? "NV_coopmat2" : coopmat_support ? "KHR_coopmat" : "none";

     std::string device_name = props2.properties.deviceName.data();
-    GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu | matrix cores: %s\n",
-                   idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size, matrix_cores.c_str());
+    GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu | shared memory: %d | matrix cores: %s\n",
+                   idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size,
+                   props2.properties.limits.maxComputeSharedMemorySize, matrix_cores.c_str());

     if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
         GGML_LOG_DEBUG("ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want.\n");

View file

@@ -1116,11 +1116,12 @@ extern "C" {
     };

     struct llama_sampler {
-        struct llama_sampler_i * iface;
+        const struct llama_sampler_i * iface;
         llama_sampler_context_t ctx;
     };

     // mirror of llama_sampler_i:
+    LLAMA_API struct llama_sampler * llama_sampler_init  (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
     LLAMA_API const char *           llama_sampler_name  (const struct llama_sampler * smpl);
     LLAMA_API void                   llama_sampler_accept(      struct llama_sampler * smpl, llama_token token);
     LLAMA_API void                   llama_sampler_apply (      struct llama_sampler * smpl, llama_token_data_array * cur_p);
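With llama_sampler_init exported, out-of-tree samplers can be built without touching the struct layout. A minimal sketch of a pass-through sampler using the new entry point; the llama_sampler_i callbacks shown (name, accept, apply, reset, clone, free) follow the interface in llama.h, and leaving unused callbacks null is an assumption based on the null-checks visible elsewhere in the sampling code:

    // sketch: a sampler that leaves the candidate list untouched
    static const char * noop_name(const struct llama_sampler * /*smpl*/) {
        return "noop";
    }

    static void noop_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * /*cur_p*/) {
        // no-op: a real sampler would reorder or filter cur_p here
    }

    static const struct llama_sampler_i noop_i = {
        /* .name   = */ noop_name,
        /* .accept = */ nullptr,
        /* .apply  = */ noop_apply,
        /* .reset  = */ nullptr,
        /* .clone  = */ nullptr,
        /* .free   = */ nullptr,
    };

    struct llama_sampler * llama_sampler_init_noop(void) {
        return llama_sampler_init(&noop_i, /* ctx = */ nullptr);
    }

Because ctx is null, the stock llama_sampler_clone path shown below already handles cloning for such a sampler.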

View file

@@ -316,6 +316,13 @@ static uint32_t get_rng_seed(uint32_t seed) {

 // llama_sampler API

+struct llama_sampler * llama_sampler_init(const struct llama_sampler_i * iface, llama_sampler_context_t ctx) {
+    return new llama_sampler {
+        /* .iface = */ iface,
+        /* .ctx   = */ ctx,
+    };
+}
+
 const char * llama_sampler_name(const struct llama_sampler * smpl) {
     if (!smpl->iface) {
         return "(null)";
@@ -347,10 +354,10 @@ struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) {
     }

     if (smpl->ctx == nullptr) {
-        return new llama_sampler {
+        return llama_sampler_init(
             /* .iface = */ smpl->iface,
-            /* .ctx   = */ nullptr,
-        };
+            /* .ctx   = */ nullptr
+        );
     }

     GGML_ABORT("the sampler does not support cloning");
@@ -472,15 +479,15 @@ static struct llama_sampler_i llama_sampler_chain_i = {
 };

 struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_chain_i,
         /* .ctx   = */ new llama_sampler_chain {
             /* .params      = */ params,
             /* .samplers    = */ {},
             /* .t_sample_us = */ 0,
             /* .n_sample    = */ 0,
-        },
-    };
+        }
+    );
 }

 void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) {
@@ -546,10 +553,10 @@ static struct llama_sampler_i llama_sampler_greedy_i = {
 };

 struct llama_sampler * llama_sampler_init_greedy() {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_greedy_i,
-        /* .ctx   = */ nullptr,
-    };
+        /* .ctx   = */ nullptr
+    );
 }

 // dist
@@ -608,14 +615,14 @@ static struct llama_sampler_i llama_sampler_dist_i = {

 struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
     auto seed_cur = get_rng_seed(seed);
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_dist_i,
         /* .ctx   = */ new llama_sampler_dist {
             /* .seed     = */ seed,
             /* .seed_cur = */ seed_cur,
             /* .rng      = */ std::mt19937(seed_cur),
-        },
-    };
+        }
+    );
 }

 // softmax
@@ -638,10 +645,10 @@ static struct llama_sampler_i llama_sampler_softmax_i = {
 };

 struct llama_sampler * llama_sampler_init_softmax() {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_softmax_i,
-        /* .ctx   = */ nullptr,
-    };
+        /* .ctx   = */ nullptr
+    );
 }

 // top-k
@@ -678,12 +685,12 @@ static struct llama_sampler_i llama_sampler_top_k_i = {
 };

 struct llama_sampler * llama_sampler_init_top_k(int32_t k) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_top_k_i,
         /* .ctx   = */ new llama_sampler_top_k {
             /* .k = */ k,
-        },
-    };
+        }
+    );
 }

 // top-p
@@ -744,13 +751,13 @@ static struct llama_sampler_i llama_sampler_top_p_i = {
 };

 struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_top_p_i,
         /* .ctx   = */ new llama_sampler_top_p {
             /* .p        = */ p,
             /* .min_keep = */ min_keep,
-        },
-    };
+        }
+    );
 }

 // min-p
@@ -840,13 +847,13 @@ static struct llama_sampler_i llama_sampler_min_p_i = {
 };

 struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_min_p_i,
         /* .ctx   = */ new llama_sampler_min_p {
             /* .p        = */ p,
             /* .min_keep = */ min_keep,
-        },
-    };
+        }
+    );
 }

 // typical
@@ -939,13 +946,13 @@ static struct llama_sampler_i llama_sampler_typical_i = {
 };

 struct llama_sampler * llama_sampler_init_typical(float p, size_t min_keep) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_typical_i,
         /* .ctx   = */ new llama_sampler_typical {
             /* .p        = */ p,
             /* .min_keep = */ min_keep,
-        },
-    };
+        }
+    );
 }

 // temp
@@ -983,12 +990,12 @@ static struct llama_sampler_i llama_sampler_temp_i = {
 };

 struct llama_sampler * llama_sampler_init_temp(float temp) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_temp_i,
         /* .ctx   = */ new llama_sampler_temp {
             /*.temp = */ temp,
-        },
-    };
+        }
+    );
 }

 // temp-ext
@@ -1093,14 +1100,14 @@ static struct llama_sampler_i llama_sampler_temp_ext_i = {
 };

 struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_temp_ext_i,
         /* .ctx   = */ new llama_sampler_temp_ext {
             /* .temp     = */ temp,
             /* .delta    = */ delta,
             /* .exponent = */ exponent,
-        },
-    };
+        }
+    );
 }

 // xtc
@@ -1185,7 +1192,7 @@ static struct llama_sampler_i llama_sampler_xtc_i = {

 struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) {
     auto seed_cur = get_rng_seed(seed);
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_xtc_i,
         /* .ctx   = */ new llama_sampler_xtc {
             /* .probability = */ p,
@@ -1194,8 +1201,8 @@ struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep,
             /* .seed        = */ seed,
             /* .seed_cur    = */ seed_cur,
             /* .rng         = */ std::mt19937(seed_cur),
-        },
-    };
+        }
+    );
 }

 // mirostat
@@ -1292,7 +1299,7 @@ static struct llama_sampler_i llama_sampler_mirostat_i = {

 struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) {
     auto seed_cur = get_rng_seed(seed);
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_mirostat_i,
         /* .ctx   = */ new llama_sampler_mirostat {
             /* .n_vocab = */ n_vocab,
@@ -1303,8 +1310,8 @@ struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t see
             /* .m       = */ m,
             /* .mu      = */ 2.0f*tau,
             /* .rng     = */ std::mt19937(seed_cur),
-        },
-    };
+        }
+    );
 }

 // mirostat v2
@@ -1391,7 +1398,7 @@ static struct llama_sampler_i llama_sampler_mirostat_v2_i = {

 struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) {
     auto seed_cur = get_rng_seed(seed);
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_mirostat_v2_i,
         /* .ctx   = */ new llama_sampler_mirostat_v2 {
             /* .seed    = */ seed,
@@ -1400,8 +1407,8 @@ struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau,
             /* .eta     = */ eta,
             /* .mu      = */ 2.0f*tau,
             /* .rng     = */ std::mt19937(seed_cur),
-        },
-    };
+        }
+    );
 }

 // grammar
@@ -1528,10 +1535,10 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
         };
     }

-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_grammar_i,
-        /* .ctx   = */ ctx,
-    };
+        /* .ctx   = */ ctx
+    );
 }

 struct llama_sampler * llama_sampler_init_grammar(
@@ -1678,7 +1685,7 @@ struct llama_sampler * llama_sampler_init_penalties(
         float penalty_present) {
     penalty_last_n = std::max(penalty_last_n, 0);
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_penalties_i,
         /* .ctx   = */ new llama_sampler_penalties {
             /* .penalty_last_n  = */ penalty_last_n,
@@ -1687,8 +1694,8 @@ struct llama_sampler * llama_sampler_init_penalties(
             /* .penalty_present = */ penalty_present,
             /* .prev            = */ ring_buffer<llama_token>(penalty_last_n),
             /* .token_count     = */ {},
-        },
-    };
+        }
+    );
 }

 // DRY
@@ -2041,7 +2048,7 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab,
         }
     }

-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_dry_i,
         /* .ctx   = */ new llama_sampler_dry {
             /* .total_context_size = */ context_size,
@@ -2053,8 +2060,8 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab,
             /* .dry_repeat_count     = */ dry_enabled ? std::vector<int>(effective_dry_penalty_last_n, 0) : std::vector<int>{},
             /* .dry_max_token_repeat = */ {},
             /* .last_tokens          = */ dry_enabled ? ring_buffer<llama_token>(effective_dry_penalty_last_n) : ring_buffer<llama_token>(0),
-        },
-    };
+        }
+    );
 }

 // wrapper for test-sampling.cpp
@@ -2155,14 +2162,14 @@ struct llama_sampler * llama_sampler_init_logit_bias(
         int32_t   n_vocab,
         int32_t   n_logit_bias,
         const llama_logit_bias * logit_bias) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_logit_bias_i,
         /* .ctx   = */ new llama_sampler_logit_bias {
             /* .n_vocab    = */ n_vocab,
             /* .logit_bias = */ std::vector<llama_logit_bias>(logit_bias, logit_bias + n_logit_bias),
             /* .to_search  = */ {},
-        },
-    };
+        }
+    );
 }

 // infill
@@ -2377,14 +2384,14 @@ static struct llama_sampler_i llama_sampler_infill_i = {
 };

 struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab) {
-    return new llama_sampler {
+    return llama_sampler_init(
         /* .iface = */ &llama_sampler_infill_i,
         /* .ctx   = */ new llama_sampler_infill {
             /* .vocab = */ vocab,
             /* .buf0  = */ std::vector<char>(512),
             /* .buf1  = */ std::vector<char>(512),
-        },
-    };
+        }
+    );
 }

 // utils
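As a usage note, the public helpers built on top of this entry point compose exactly as before; a sketch of a typical chain (API names as declared in llama.h; the parameter values are arbitrary):

    llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
    llama_sampler * chain = llama_sampler_chain_init(sparams);
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));
    llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
    // ... llama_sampler_apply(chain, &cur_p); ...
    llama_sampler_free(chain);  // frees the chain and the samplers added to it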

View file

@@ -7253,7 +7253,7 @@ struct llm_build_context {
         struct ggml_tensor * Qcur = nullptr;
         struct ggml_tensor * Kcur = nullptr;
         struct ggml_tensor * Vcur = nullptr;
-        if (model.type == LLM_TYPE_1_5B || model.type == LLM_TYPE_4B || model.type == LLM_TYPE_9B) {
+        if (model.layers[il].wqkv == nullptr) {
            Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
            if (model.layers[il].bq) {
                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
@@ -8837,12 +8837,14 @@ static int llama_decode_impl(
     //llama_synchronize(&lctx);

     // decide if we need to defrag the kv cache
-    if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) {
-        const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f;
+    if (cparams.causal_attn && cparams.defrag_thold > 0.0f) {
+        // - do not defrag small contexts (i.e. < 2048 tokens)
+        // - count the padding towards the number of used tokens
+        const float fragmentation = kv_self.n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self.used + llama_kv_cache_get_padding(cparams))/float(kv_self.n)) : 0.0f;

         // queue defragmentation for next llama_kv_cache_update
         if (fragmentation > cparams.defrag_thold) {
-            //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
+            LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);
             llama_kv_cache_defrag(kv_self);
         }
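To make the new formula concrete, a self-contained computation with made-up numbers (a 4096-cell cache with 3000 cells used and a padding of 32; none of these values come from the codebase):

    #include <algorithm>
    #include <cstdio>

    int main() {
        const float n = 4096.0f, used = 3000.0f, padding = 32.0f;
        // mirrors: kv_self.n >= 2048 ? std::max(0.0f, 1.0f - (used + padding)/n) : 0.0f
        const float fragmentation = n >= 2048.0f ? std::max(0.0f, 1.0f - (used + padding)/n) : 0.0f;
        std::printf("fragmentation: %.2f\n", fragmentation); // prints 0.26
    }

The comparison change from >= 0.0f to > 0.0f also means a threshold of exactly zero no longer requests defragmentation.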
@@ -9469,7 +9471,6 @@ static struct llama_model * llama_model_load_from_file_impl(
         struct llama_model_params params) {
     ggml_time_init();

-    llama_model * model = new llama_model(params);

     unsigned cur_percentage = 0;
     if (params.progress_callback == NULL) {
@@ -9488,6 +9489,8 @@ static struct llama_model * llama_model_load_from_file_impl(
         };
     }

+    llama_model * model = new llama_model(params);
+
     // create list of devices to use with this model
     if (params.devices) {
         for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {

View file

@@ -626,8 +626,15 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
     result.reserve(utf8.size());
     size_t offset = 0;
     while (offset < utf8.size()) {
-        result.push_back(unicode_cpt_from_utf8(utf8, offset));
+        try {
+            result.push_back(unicode_cpt_from_utf8(utf8, offset));
+        }
+        catch (const std::invalid_argument & /*ex*/) {
+            // Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize
+            ++offset;
+            result.emplace_back(0xFFFD); // replacement character
+        }
     }
     return result;
 }
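The result is that malformed bytes now decode to U+FFFD instead of aborting tokenization with an exception. A sketch of the observable behavior (assuming unicode_cpt_from_utf8 throws without advancing offset, which the ++offset in the catch branch implies):

    // "a" (0x61), stray continuation byte 0x80, "b" (0x62)
    std::vector<uint32_t> cpts = unicode_cpts_from_utf8("a\x80" "b");
    // expected: { 0x61, 0xFFFD, 0x62 } -- the bad byte becomes the
    // replacement character and decoding resumes at the next byte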