mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2025-09-11 01:24:36 +00:00
Merge branch 'upstream' into concedo_experimental
# Conflicts: # examples/run/run.cpp # scripts/sync-ggml.last
This commit is contained in:
commit
27b9358baf
12 changed files with 181 additions and 135 deletions
|
@ -2325,5 +2325,47 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_TTS}));
|
).set_examples({LLAMA_EXAMPLE_TTS}));
|
||||||
|
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--embd-bge-small-en-default"},
|
||||||
|
string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
|
||||||
|
[](common_params & params) {
|
||||||
|
params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
|
||||||
|
params.hf_file = "bge-small-en-v1.5-q8_0.gguf";
|
||||||
|
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
|
||||||
|
params.embd_normalize = 2;
|
||||||
|
params.n_ctx = 512;
|
||||||
|
params.verbose_prompt = true;
|
||||||
|
params.embedding = true;
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
|
||||||
|
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--embd-e5-small-en-default"},
|
||||||
|
string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
|
||||||
|
[](common_params & params) {
|
||||||
|
params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
|
||||||
|
params.hf_file = "e5-small-v2-q8_0.gguf";
|
||||||
|
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
|
||||||
|
params.embd_normalize = 2;
|
||||||
|
params.n_ctx = 512;
|
||||||
|
params.verbose_prompt = true;
|
||||||
|
params.embedding = true;
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
|
||||||
|
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--embd-gte-small-default"},
|
||||||
|
string_format("use default gte-small model (note: can download weights from the internet)"),
|
||||||
|
[](common_params & params) {
|
||||||
|
params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
|
||||||
|
params.hf_file = "gte-small-q8_0.gguf";
|
||||||
|
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
|
||||||
|
params.embd_normalize = 2;
|
||||||
|
params.n_ctx = 512;
|
||||||
|
params.verbose_prompt = true;
|
||||||
|
params.embedding = true;
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
|
||||||
|
|
||||||
return ctx_arg;
|
return ctx_arg;
|
||||||
}
|
}
|
||||||
|
|
|
@ -254,10 +254,10 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
return new llama_sampler{
|
return llama_sampler_init(
|
||||||
/* .iface = */ &llama_sampler_llg_i,
|
/* .iface = */ &llama_sampler_llg_i,
|
||||||
/* .ctx = */ ctx,
|
/* .ctx = */ ctx
|
||||||
};
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -360,21 +360,15 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__loongarch_asx)
|
#if defined(__loongarch_asx)
|
||||||
|
|
||||||
typedef union {
|
|
||||||
int32_t i;
|
|
||||||
float f;
|
|
||||||
} ft_union;
|
|
||||||
|
|
||||||
/* float type data load instructions */
|
/* float type data load instructions */
|
||||||
static __m128 __lsx_vreplfr2vr_s(float val) {
|
static __m128 __lsx_vreplfr2vr_s(const float val) {
|
||||||
ft_union fi_tmpval = {.f = val};
|
v4f32 res = {val, val, val, val};
|
||||||
return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
|
return (__m128)res;
|
||||||
}
|
}
|
||||||
|
|
||||||
static __m256 __lasx_xvreplfr2vr_s(float val) {
|
static __m256 __lasx_xvreplfr2vr_s(const float val) {
|
||||||
ft_union fi_tmpval = {.f = val};
|
v8f32 res = {val, val, val, val, val, val, val, val};
|
||||||
return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
|
return (__m256)res;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -502,30 +502,15 @@ static __m256i lasx_shuffle_b(__m256i a, __m256i b) {
|
||||||
}
|
}
|
||||||
|
|
||||||
static __m256i lasx_extu8_16(__m128i a) {
|
static __m256i lasx_extu8_16(__m128i a) {
|
||||||
__m128i zero = __lsx_vldi(0);
|
return __lasx_vext2xv_hu_bu(____m256i(a));
|
||||||
__m128i vlo = __lsx_vilvl_b(zero, a);
|
|
||||||
__m128i vhi = __lsx_vilvh_b(zero, a);
|
|
||||||
return lasx_set_q(vhi, vlo);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __m256i lasx_ext8_16(__m128i a) {
|
static __m256i lasx_ext8_16(__m128i a) {
|
||||||
__m128i sign = __lsx_vslti_b(a, 0);
|
return __lasx_vext2xv_h_b(____m256i(a));
|
||||||
__m128i vlo = __lsx_vilvl_b(sign, a);
|
|
||||||
__m128i vhi = __lsx_vilvh_b(sign, a);
|
|
||||||
return lasx_set_q(vhi, vlo);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __m256i lasx_ext16_32(__m128i a) {
|
static __m256i lasx_ext16_32(__m128i a) {
|
||||||
__m256i tmp1;
|
return __lasx_vext2xv_w_h(____m256i(a));
|
||||||
tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 0), 0);
|
|
||||||
tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 1), 1);
|
|
||||||
tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 2), 2);
|
|
||||||
tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 3), 3);
|
|
||||||
tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 4), 4);
|
|
||||||
tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 5), 5);
|
|
||||||
tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 6), 6);
|
|
||||||
tmp1 = __lasx_xvinsgr2vr_w(tmp1, __lsx_vpickve2gr_h(a, 7), 7);
|
|
||||||
return tmp1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __m128i lasx_extracti128( __m256i a, int pos) {
|
static __m128i lasx_extracti128( __m256i a, int pos) {
|
||||||
|
@ -593,12 +578,10 @@ static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
|
||||||
// horizontally add 8 floats
|
// horizontally add 8 floats
|
||||||
static inline float hsum_float_8(const __m256 x) {
|
static inline float hsum_float_8(const __m256 x) {
|
||||||
__m128 res = lasx_extractf128(x, 1);
|
__m128 res = lasx_extractf128(x, 1);
|
||||||
ft_union tmp;
|
|
||||||
res = __lsx_vfadd_s(res, lasx_extractf128(x, 0));
|
res = __lsx_vfadd_s(res, lasx_extractf128(x, 0));
|
||||||
res = __lsx_vfadd_s(res, (__m128)__lsx_vpickod_d((__m128i)res, (__m128i)res));
|
res = __lsx_vfadd_s(res, (__m128)__lsx_vpickod_d((__m128i)res, (__m128i)res));
|
||||||
res = __lsx_vfadd_s(res, (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w(res, 1), 0));
|
res = __lsx_vfadd_s(res, (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w(res, 1), 0));
|
||||||
tmp.i = __lsx_vpickve2gr_w(res, 0);
|
return ((v4f32)res)[0];
|
||||||
return tmp.f;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// horizontally add 8 int32_t
|
// horizontally add 8 int32_t
|
||||||
|
@ -940,7 +923,6 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
|
||||||
|
|
||||||
#elif defined(__loongarch_asx)
|
#elif defined(__loongarch_asx)
|
||||||
for (int i = 0; i < nb; i++) {
|
for (int i = 0; i < nb; i++) {
|
||||||
ft_union fi;
|
|
||||||
__m256 v0 = (__m256)__lasx_xvld( x , 0);
|
__m256 v0 = (__m256)__lasx_xvld( x , 0);
|
||||||
__m256 v1 = (__m256)__lasx_xvld( x , 32);
|
__m256 v1 = (__m256)__lasx_xvld( x , 32);
|
||||||
__m256 v2 = (__m256)__lasx_xvld( x , 64);
|
__m256 v2 = (__m256)__lasx_xvld( x , 64);
|
||||||
|
@ -958,8 +940,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
|
||||||
max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
|
max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
|
||||||
__m128 tmp = max4;
|
__m128 tmp = max4;
|
||||||
max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vinsgr2vr_w(tmp, __lsx_vpickve2gr_w( max4, 1 ), 0 ));
|
max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vinsgr2vr_w(tmp, __lsx_vpickve2gr_w( max4, 1 ), 0 ));
|
||||||
fi.i = __lsx_vpickve2gr_w( (__m128i)max4, 0 );
|
const float max_scalar = ((v4f32)max4)[0];
|
||||||
const float max_scalar = fi.f;
|
|
||||||
|
|
||||||
// Quantize these floats
|
// Quantize these floats
|
||||||
const float d = max_scalar / 127.f;
|
const float d = max_scalar / 127.f;
|
||||||
|
@ -1264,7 +1245,6 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
|
||||||
|
|
||||||
#elif defined(__loongarch_asx)
|
#elif defined(__loongarch_asx)
|
||||||
for (int i = 0; i < nb; i++) {
|
for (int i = 0; i < nb; i++) {
|
||||||
ft_union ft;
|
|
||||||
__m256 v0 = (__m256)__lasx_xvld( x , 0 );
|
__m256 v0 = (__m256)__lasx_xvld( x , 0 );
|
||||||
__m256 v1 = (__m256)__lasx_xvld( x , 32 );
|
__m256 v1 = (__m256)__lasx_xvld( x , 32 );
|
||||||
__m256 v2 = (__m256)__lasx_xvld( x , 64 );
|
__m256 v2 = (__m256)__lasx_xvld( x , 64 );
|
||||||
|
@ -1282,8 +1262,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
|
||||||
max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
|
max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vpickod_d((__m128i) max4, (__m128i)max4 ) );
|
||||||
__m128 tmp = max4;
|
__m128 tmp = max4;
|
||||||
max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 ));
|
max4 = __lsx_vfmax_s( max4, (__m128)__lsx_vextrins_w((__m128i)tmp, (__m128i)max4, 0x10 ));
|
||||||
ft.i = __lsx_vpickve2gr_w( (__m128i)max4, 0 );
|
const float max_scalar = ((v4f32)max4)[0];
|
||||||
const float max_scalar = ft.f;
|
|
||||||
|
|
||||||
// Quantize these floats
|
// Quantize these floats
|
||||||
const float d = max_scalar / 127.f;
|
const float d = max_scalar / 127.f;
|
||||||
|
@ -6155,9 +6134,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
||||||
acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1);
|
acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1);
|
||||||
|
|
||||||
|
|
||||||
ft_union fi;
|
*s = hsum_float_8(acc) + ((v4f32)acc_m)[0];
|
||||||
fi.i = __lsx_vpickve2gr_w(acc_m, 0);
|
|
||||||
*s = hsum_float_8(acc) + fi.f ;
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
const uint8_t * scales = (const uint8_t*)&utmp[0];
|
||||||
|
|
|
@ -1082,29 +1082,23 @@ do { \
|
||||||
#define GGML_F16_STEP 32
|
#define GGML_F16_STEP 32
|
||||||
#define GGML_F16_EPR 8
|
#define GGML_F16_EPR 8
|
||||||
|
|
||||||
// F16 arithmetic is not supported by AVX, so we use F32 instead
|
// F16 arithmetic is not supported by LASX, so we use F32 instead
|
||||||
|
|
||||||
#define GGML_F32Cx8 __m256
|
#define GGML_F32Cx8 __m256
|
||||||
#define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
|
#define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
|
||||||
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
|
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
|
||||||
|
|
||||||
static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
|
static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
|
||||||
float tmp[8];
|
__m256i a;
|
||||||
|
memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
|
||||||
for (int i = 0; i < 8; i++) {
|
a = __lasx_xvpermi_d(a, 0 | (1 << 4));
|
||||||
tmp[i] = GGML_FP16_TO_FP32(x[i]);
|
return __lasx_xvfcvtl_s_h(a);
|
||||||
}
|
|
||||||
|
|
||||||
return (__m256)__lasx_xvld(tmp, 0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
|
static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
|
||||||
float arr[8];
|
__m256i a = __lasx_xvfcvt_h_s(y, y);
|
||||||
|
a = __lasx_xvpermi_d(a, 0 | (2 << 2));
|
||||||
__lasx_xvst(y, arr, 0);
|
memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
|
||||||
|
|
||||||
for (int i = 0; i < 8; i++) {
|
|
||||||
x[i] = GGML_FP32_TO_FP16(arr[i]);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
#define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
|
#define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
|
||||||
#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
|
#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
|
||||||
|
|
|
@ -1045,7 +1045,28 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co
|
||||||
ggml_free(ctx);
|
ggml_free(ctx);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
GGML_PRINT_DEBUG("[%s] src->buffer: %p, dst->buffer: %p\n", __func__, (void*)src->buffer, (void*)dst->buffer);
|
|
||||||
|
uint64_t src_size = (uint64_t) ggml_nbytes(src);
|
||||||
|
uint64_t dst_data = (uint64_t) dst->data;
|
||||||
|
uint64_t dst_base = (uint64_t) ggml_backend_buffer_get_base(dst->buffer);
|
||||||
|
uint64_t dst_buf_sz = (uint64_t) ggml_backend_buffer_get_size(dst->buffer);
|
||||||
|
|
||||||
|
if (dst_data + src_size > dst_base + dst_buf_sz) {
|
||||||
|
GGML_PRINT_DEBUG("[%s] out-of-bounds write in rpc_server::copy_tensor:\n"
|
||||||
|
" write range : [0x%" PRIx64 ", 0x%" PRIx64 "]\n"
|
||||||
|
" buffer base: [0x%" PRIx64 ", 0x%" PRIx64 "]\n",
|
||||||
|
__func__,
|
||||||
|
dst_data,
|
||||||
|
dst_data + src_size,
|
||||||
|
dst_base,
|
||||||
|
dst_base + dst_buf_sz);
|
||||||
|
ggml_free(ctx);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_PRINT_DEBUG("[%s] src->buffer: %p, dst->buffer: %p\n",
|
||||||
|
__func__, (void*) src->buffer, (void*) dst->buffer);
|
||||||
|
|
||||||
response.result = ggml_backend_buffer_copy_tensor(src, dst);
|
response.result = ggml_backend_buffer_copy_tensor(src, dst);
|
||||||
ggml_free(ctx);
|
ggml_free(ctx);
|
||||||
return true;
|
return true;
|
||||||
|
|
|
@ -103,11 +103,10 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
|
||||||
name = std::regex_replace(name, std::regex("\\(TM\\)"), "");
|
name = std::regex_replace(name, std::regex("\\(TM\\)"), "");
|
||||||
|
|
||||||
auto global_mem_size = prop.get_global_mem_size()/1000000;
|
auto global_mem_size = prop.get_global_mem_size()/1000000;
|
||||||
std::string xmx = gpu_has_xmx(device) ? "yes" : "no";
|
GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(),
|
||||||
GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|%14s|\n", id, device_type.c_str(),
|
|
||||||
name.c_str(), version.c_str(), prop.get_max_compute_units(),
|
name.c_str(), version.c_str(), prop.get_max_compute_units(),
|
||||||
prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
|
prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
|
||||||
global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str(), xmx.c_str());
|
global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_sycl_print_sycl_devices() {
|
void ggml_backend_sycl_print_sycl_devices() {
|
||||||
|
@ -118,16 +117,16 @@ void ggml_backend_sycl_print_sycl_devices() {
|
||||||
|
|
||||||
GGML_LOG_INFO(
|
GGML_LOG_INFO(
|
||||||
"| | | | "
|
"| | | | "
|
||||||
" |Max | |Max |Global | | XMX |\n");
|
" |Max | |Max |Global | |\n");
|
||||||
GGML_LOG_INFO(
|
GGML_LOG_INFO(
|
||||||
"| | | | "
|
"| | | | "
|
||||||
" |compute|Max work|sub |mem | | or |\n");
|
" |compute|Max work|sub |mem | |\n");
|
||||||
GGML_LOG_INFO(
|
GGML_LOG_INFO(
|
||||||
"|ID| Device Type| "
|
"|ID| Device Type| "
|
||||||
"Name|Version|units |group |group|size | Driver version| Tensor Cores |\n");
|
"Name|Version|units |group |group|size | Driver version|\n");
|
||||||
GGML_LOG_INFO(
|
GGML_LOG_INFO(
|
||||||
"|--|-------------------|---------------------------------------|------"
|
"|--|-------------------|---------------------------------------|------"
|
||||||
"-|-------|--------|-----|-------|---------------------|--------------|\n");
|
"-|-------|--------|-----|-------|---------------------|\n");
|
||||||
|
|
||||||
for (int id = 0; id < device_count; ++id) {
|
for (int id = 0; id < device_count; ++id) {
|
||||||
sycl::device device = dpct::dev_mgr::instance().get_device(id);
|
sycl::device device = dpct::dev_mgr::instance().get_device(id);
|
||||||
|
|
|
@ -2788,8 +2788,9 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
||||||
std::string matrix_cores = coopmat2_support ? "NV_coopmat2" : coopmat_support ? "KHR_coopmat" : "none";
|
std::string matrix_cores = coopmat2_support ? "NV_coopmat2" : coopmat_support ? "KHR_coopmat" : "none";
|
||||||
|
|
||||||
std::string device_name = props2.properties.deviceName.data();
|
std::string device_name = props2.properties.deviceName.data();
|
||||||
GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu | matrix cores: %s\n",
|
GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu | shared memory: %d | matrix cores: %s\n",
|
||||||
idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size, matrix_cores.c_str());
|
idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size,
|
||||||
|
props2.properties.limits.maxComputeSharedMemorySize, matrix_cores.c_str());
|
||||||
|
|
||||||
if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
|
if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
|
||||||
GGML_LOG_DEBUG("ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want.\n");
|
GGML_LOG_DEBUG("ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want.\n");
|
||||||
|
|
|
@ -1116,11 +1116,12 @@ extern "C" {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llama_sampler {
|
struct llama_sampler {
|
||||||
struct llama_sampler_i * iface;
|
const struct llama_sampler_i * iface;
|
||||||
llama_sampler_context_t ctx;
|
llama_sampler_context_t ctx;
|
||||||
};
|
};
|
||||||
|
|
||||||
// mirror of llama_sampler_i:
|
// mirror of llama_sampler_i:
|
||||||
|
LLAMA_API struct llama_sampler * llama_sampler_init (const struct llama_sampler_i * iface, llama_sampler_context_t ctx);
|
||||||
LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl);
|
LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl);
|
||||||
LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token);
|
LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token);
|
||||||
LLAMA_API void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p);
|
LLAMA_API void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p);
|
||||||
|
|
|
@ -316,6 +316,13 @@ static uint32_t get_rng_seed(uint32_t seed) {
|
||||||
|
|
||||||
// llama_sampler API
|
// llama_sampler API
|
||||||
|
|
||||||
|
struct llama_sampler * llama_sampler_init(const struct llama_sampler_i * iface, llama_sampler_context_t ctx) {
|
||||||
|
return new llama_sampler {
|
||||||
|
/* .iface = */ iface,
|
||||||
|
/* .ctx = */ ctx,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
const char * llama_sampler_name(const struct llama_sampler * smpl) {
|
const char * llama_sampler_name(const struct llama_sampler * smpl) {
|
||||||
if (!smpl->iface) {
|
if (!smpl->iface) {
|
||||||
return "(null)";
|
return "(null)";
|
||||||
|
@ -347,10 +354,10 @@ struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (smpl->ctx == nullptr) {
|
if (smpl->ctx == nullptr) {
|
||||||
return new llama_sampler {
|
return llama_sampler_init(
|
||||||
/* .iface = */ smpl->iface,
|
/* .iface = */ smpl->iface,
|
||||||
/* .ctx = */ nullptr,
|
/* .ctx = */ nullptr
|
||||||
};
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_ABORT("the sampler does not support cloning");
|
GGML_ABORT("the sampler does not support cloning");
|
||||||
|
@ -472,15 +479,15 @@ static struct llama_sampler_i llama_sampler_chain_i = {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params) {
|
struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params) {
|
||||||
return new llama_sampler {
|
return llama_sampler_init(
|
||||||
/* .iface = */ &llama_sampler_chain_i,
|
/* .iface = */ &llama_sampler_chain_i,
|
||||||
/* .ctx = */ new llama_sampler_chain {
|
/* .ctx = */ new llama_sampler_chain {
|
||||||
/* .params = */ params,
|
/* .params = */ params,
|
||||||
/* .samplers = */ {},
|
/* .samplers = */ {},
|
||||||
/* .t_sample_us = */ 0,
|
/* .t_sample_us = */ 0,
|
||||||
/* .n_sample = */ 0,
|
/* .n_sample = */ 0,
|
||||||
},
|
}
|
||||||
};
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) {
|
void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) {
|
||||||
|
@ -546,10 +553,10 @@ static struct llama_sampler_i llama_sampler_greedy_i = {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llama_sampler * llama_sampler_init_greedy() {
|
struct llama_sampler * llama_sampler_init_greedy() {
|
||||||
return new llama_sampler {
|
return llama_sampler_init(
|
||||||
/* .iface = */ &llama_sampler_greedy_i,
|
/* .iface = */ &llama_sampler_greedy_i,
|
||||||
/* .ctx = */ nullptr,
|
/* .ctx = */ nullptr
|
||||||
};
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// dist
|
// dist
|
||||||
|
@ -608,14 +615,14 @@ static struct llama_sampler_i llama_sampler_dist_i = {
|
||||||
|
|
||||||
struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
|
struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
|
||||||
auto seed_cur = get_rng_seed(seed);
|
auto seed_cur = get_rng_seed(seed);
|
||||||
return new llama_sampler {
|
return llama_sampler_init(
|
||||||
/* .iface = */ &llama_sampler_dist_i,
|
/* .iface = */ &llama_sampler_dist_i,
|
||||||
/* .ctx = */ new llama_sampler_dist {
|
/* .ctx = */ new llama_sampler_dist {
|
||||||
/* .seed = */ seed,
|
/* .seed = */ seed,
|
||||||
/* .seed_cur = */ seed_cur,
|
/* .seed_cur = */ seed_cur,
|
||||||
/* .rng = */ std::mt19937(seed_cur),
|
/* .rng = */ std::mt19937(seed_cur),
|
||||||
},
|
}
|
||||||
};
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// softmax
|
// softmax
|
||||||
|
@ -638,10 +645,10 @@ static struct llama_sampler_i llama_sampler_softmax_i = {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llama_sampler * llama_sampler_init_softmax() {
|
struct llama_sampler * llama_sampler_init_softmax() {
|
||||||
return new llama_sampler {
|
return llama_sampler_init(
|
||||||
/* .iface = */ &llama_sampler_softmax_i,
|
/* .iface = */ &llama_sampler_softmax_i,
|
||||||
/* .ctx = */ nullptr,
|
/* .ctx = */ nullptr
|
||||||
};
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// top-k
|
// top-k
|
||||||
|
@ -678,12 +685,12 @@ static struct llama_sampler_i llama_sampler_top_k_i = {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llama_sampler * llama_sampler_init_top_k(int32_t k) {
|
struct llama_sampler * llama_sampler_init_top_k(int32_t k) {
|
||||||
return new llama_sampler {
|
return llama_sampler_init(
|
||||||
/* .iface = */ &llama_sampler_top_k_i,
|
/* .iface = */ &llama_sampler_top_k_i,
|
||||||
/* .ctx = */ new llama_sampler_top_k {
|
/* .ctx = */ new llama_sampler_top_k {
|
||||||
/* .k = */ k,
|
/* .k = */ k,
|
||||||
},
|
}
|
||||||
};
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// top-p
|
// top-p
|
||||||
|
@ -744,13 +751,13 @@ static struct llama_sampler_i llama_sampler_top_p_i = {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) {
|
struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) {
|
||||||
return new llama_sampler {
|
return llama_sampler_init(
|
||||||
/* .iface = */ &llama_sampler_top_p_i,
|
/* .iface = */ &llama_sampler_top_p_i,
|
||||||
/* .ctx = */ new llama_sampler_top_p {
|
/* .ctx = */ new llama_sampler_top_p {
|
||||||
/* .p = */ p,
|
/* .p = */ p,
|
||||||
/* .min_keep = */ min_keep,
|
/* .min_keep = */ min_keep,
|
||||||
},
|
}
|
||||||
};
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// min-p
|
// min-p
|
||||||
|
@ -840,13 +847,13 @@ static struct llama_sampler_i llama_sampler_min_p_i = {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
|
struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
|
||||||
return new llama_sampler {
|
return llama_sampler_init(
|
||||||
/* .iface = */ &llama_sampler_min_p_i,
|
/* .iface = */ &llama_sampler_min_p_i,
|
||||||
/* .ctx = */ new llama_sampler_min_p {
|
/* .ctx = */ new llama_sampler_min_p {
|
||||||
/* .p = */ p,
|
/* .p = */ p,
|
||||||
/* .min_keep = */ min_keep,
|
/* .min_keep = */ min_keep,
|
||||||
},
|
}
|
||||||
};
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// typical
|
// typical
|
||||||
|
@ -939,13 +946,13 @@ static struct llama_sampler_i llama_sampler_typical_i = {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llama_sampler * llama_sampler_init_typical(float p, size_t min_keep) {
|
struct llama_sampler * llama_sampler_init_typical(float p, size_t min_keep) {
|
||||||
return new llama_sampler {
|
return llama_sampler_init(
|
||||||
/* .iface = */ &llama_sampler_typical_i,
|
/* .iface = */ &llama_sampler_typical_i,
|
||||||
/* .ctx = */ new llama_sampler_typical {
|
/* .ctx = */ new llama_sampler_typical {
|
||||||
/* .p = */ p,
|
/* .p = */ p,
|
||||||
/* .min_keep = */ min_keep,
|
/* .min_keep = */ min_keep,
|
||||||
},
|
}
|
||||||
};
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// temp
|
// temp
|
||||||
|
@ -983,12 +990,12 @@ static struct llama_sampler_i llama_sampler_temp_i = {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llama_sampler * llama_sampler_init_temp(float temp) {
|
struct llama_sampler * llama_sampler_init_temp(float temp) {
|
||||||
return new llama_sampler {
|
return llama_sampler_init(
|
||||||
/* .iface = */ &llama_sampler_temp_i,
|
/* .iface = */ &llama_sampler_temp_i,
|
||||||
/* .ctx = */ new llama_sampler_temp {
|
/* .ctx = */ new llama_sampler_temp {
|
||||||
/*.temp = */ temp,
|
/*.temp = */ temp,
|
||||||
},
|
}
|
||||||
};
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// temp-ext
|
// temp-ext
|
||||||
|
@ -1093,14 +1100,14 @@ static struct llama_sampler_i llama_sampler_temp_ext_i = {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) {
|
struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) {
|
||||||
return new llama_sampler {
|
return llama_sampler_init(
|
||||||
/* .iface = */ &llama_sampler_temp_ext_i,
|
/* .iface = */ &llama_sampler_temp_ext_i,
|
||||||
/* .ctx = */ new llama_sampler_temp_ext {
|
/* .ctx = */ new llama_sampler_temp_ext {
|
||||||
/* .temp = */ temp,
|
/* .temp = */ temp,
|
||||||
/* .delta = */ delta,
|
/* .delta = */ delta,
|
||||||
/* .exponent = */ exponent,
|
/* .exponent = */ exponent,
|
||||||
},
|
}
|
||||||
};
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// xtc
|
// xtc
|
||||||
|
@ -1185,7 +1192,7 @@ static struct llama_sampler_i llama_sampler_xtc_i = {
|
||||||
|
|
||||||
struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) {
|
struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) {
|
||||||
auto seed_cur = get_rng_seed(seed);
|
auto seed_cur = get_rng_seed(seed);
|
||||||
return new llama_sampler {
|
return llama_sampler_init(
|
||||||
/* .iface = */ &llama_sampler_xtc_i,
|
/* .iface = */ &llama_sampler_xtc_i,
|
||||||
/* .ctx = */ new llama_sampler_xtc {
|
/* .ctx = */ new llama_sampler_xtc {
|
||||||
/* .probability = */ p,
|
/* .probability = */ p,
|
||||||
|
@ -1194,8 +1201,8 @@ struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep,
|
||||||
/* .seed = */ seed,
|
/* .seed = */ seed,
|
||||||
/* .seed_cur = */ seed_cur,
|
/* .seed_cur = */ seed_cur,
|
||||||
/* .rng = */ std::mt19937(seed_cur),
|
/* .rng = */ std::mt19937(seed_cur),
|
||||||
},
|
}
|
||||||
};
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// mirostat
|
// mirostat
|
||||||
|
@ -1292,7 +1299,7 @@ static struct llama_sampler_i llama_sampler_mirostat_i = {
|
||||||
|
|
||||||
struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) {
|
struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) {
|
||||||
auto seed_cur = get_rng_seed(seed);
|
auto seed_cur = get_rng_seed(seed);
|
||||||
return new llama_sampler {
|
return llama_sampler_init(
|
||||||
/* .iface = */ &llama_sampler_mirostat_i,
|
/* .iface = */ &llama_sampler_mirostat_i,
|
||||||
/* .ctx = */ new llama_sampler_mirostat {
|
/* .ctx = */ new llama_sampler_mirostat {
|
||||||
/* .n_vocab = */ n_vocab,
|
/* .n_vocab = */ n_vocab,
|
||||||
|
@ -1303,8 +1310,8 @@ struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t see
|
||||||
/* .m = */ m,
|
/* .m = */ m,
|
||||||
/* .mu = */ 2.0f*tau,
|
/* .mu = */ 2.0f*tau,
|
||||||
/* .rng = */ std::mt19937(seed_cur),
|
/* .rng = */ std::mt19937(seed_cur),
|
||||||
},
|
}
|
||||||
};
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// mirostat v2
|
// mirostat v2
|
||||||
|
@ -1391,7 +1398,7 @@ static struct llama_sampler_i llama_sampler_mirostat_v2_i = {
|
||||||
|
|
||||||
struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) {
|
struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) {
|
||||||
auto seed_cur = get_rng_seed(seed);
|
auto seed_cur = get_rng_seed(seed);
|
||||||
return new llama_sampler {
|
return llama_sampler_init(
|
||||||
/* .iface = */ &llama_sampler_mirostat_v2_i,
|
/* .iface = */ &llama_sampler_mirostat_v2_i,
|
||||||
/* .ctx = */ new llama_sampler_mirostat_v2 {
|
/* .ctx = */ new llama_sampler_mirostat_v2 {
|
||||||
/* .seed = */ seed,
|
/* .seed = */ seed,
|
||||||
|
@ -1400,8 +1407,8 @@ struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau,
|
||||||
/* .eta = */ eta,
|
/* .eta = */ eta,
|
||||||
/* .mu = */ 2.0f*tau,
|
/* .mu = */ 2.0f*tau,
|
||||||
/* .rng = */ std::mt19937(seed_cur),
|
/* .rng = */ std::mt19937(seed_cur),
|
||||||
},
|
}
|
||||||
};
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// grammar
|
// grammar
|
||||||
|
@ -1528,10 +1535,10 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
return new llama_sampler {
|
return llama_sampler_init(
|
||||||
/* .iface = */ &llama_sampler_grammar_i,
|
/* .iface = */ &llama_sampler_grammar_i,
|
||||||
/* .ctx = */ ctx,
|
/* .ctx = */ ctx
|
||||||
};
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct llama_sampler * llama_sampler_init_grammar(
|
struct llama_sampler * llama_sampler_init_grammar(
|
||||||
|
@ -1678,7 +1685,7 @@ struct llama_sampler * llama_sampler_init_penalties(
|
||||||
float penalty_present) {
|
float penalty_present) {
|
||||||
penalty_last_n = std::max(penalty_last_n, 0);
|
penalty_last_n = std::max(penalty_last_n, 0);
|
||||||
|
|
||||||
return new llama_sampler {
|
return llama_sampler_init(
|
||||||
/* .iface = */ &llama_sampler_penalties_i,
|
/* .iface = */ &llama_sampler_penalties_i,
|
||||||
/* .ctx = */ new llama_sampler_penalties {
|
/* .ctx = */ new llama_sampler_penalties {
|
||||||
/* .penalty_last_n = */ penalty_last_n,
|
/* .penalty_last_n = */ penalty_last_n,
|
||||||
|
@ -1687,8 +1694,8 @@ struct llama_sampler * llama_sampler_init_penalties(
|
||||||
/* .penalty_present = */ penalty_present,
|
/* .penalty_present = */ penalty_present,
|
||||||
/* .prev = */ ring_buffer<llama_token>(penalty_last_n),
|
/* .prev = */ ring_buffer<llama_token>(penalty_last_n),
|
||||||
/* .token_count = */ {},
|
/* .token_count = */ {},
|
||||||
},
|
}
|
||||||
};
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// DRY
|
// DRY
|
||||||
|
@ -2041,7 +2048,7 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return new llama_sampler {
|
return llama_sampler_init(
|
||||||
/* .iface = */ &llama_sampler_dry_i,
|
/* .iface = */ &llama_sampler_dry_i,
|
||||||
/* .ctx = */ new llama_sampler_dry {
|
/* .ctx = */ new llama_sampler_dry {
|
||||||
/* .total_context_size = */ context_size,
|
/* .total_context_size = */ context_size,
|
||||||
|
@ -2053,8 +2060,8 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab,
|
||||||
/* .dry_repeat_count = */ dry_enabled ? std::vector<int>(effective_dry_penalty_last_n, 0) : std::vector<int>{},
|
/* .dry_repeat_count = */ dry_enabled ? std::vector<int>(effective_dry_penalty_last_n, 0) : std::vector<int>{},
|
||||||
/* .dry_max_token_repeat = */ {},
|
/* .dry_max_token_repeat = */ {},
|
||||||
/* .last_tokens = */ dry_enabled ? ring_buffer<llama_token>(effective_dry_penalty_last_n) : ring_buffer<llama_token>(0),
|
/* .last_tokens = */ dry_enabled ? ring_buffer<llama_token>(effective_dry_penalty_last_n) : ring_buffer<llama_token>(0),
|
||||||
},
|
}
|
||||||
};
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// wrapper for test-sampling.cpp
|
// wrapper for test-sampling.cpp
|
||||||
|
@ -2155,14 +2162,14 @@ struct llama_sampler * llama_sampler_init_logit_bias(
|
||||||
int32_t n_vocab,
|
int32_t n_vocab,
|
||||||
int32_t n_logit_bias,
|
int32_t n_logit_bias,
|
||||||
const llama_logit_bias * logit_bias) {
|
const llama_logit_bias * logit_bias) {
|
||||||
return new llama_sampler {
|
return llama_sampler_init(
|
||||||
/* .iface = */ &llama_sampler_logit_bias_i,
|
/* .iface = */ &llama_sampler_logit_bias_i,
|
||||||
/* .ctx = */ new llama_sampler_logit_bias {
|
/* .ctx = */ new llama_sampler_logit_bias {
|
||||||
/* .n_vocab = */ n_vocab,
|
/* .n_vocab = */ n_vocab,
|
||||||
/* .logit_bias = */ std::vector<llama_logit_bias>(logit_bias, logit_bias + n_logit_bias),
|
/* .logit_bias = */ std::vector<llama_logit_bias>(logit_bias, logit_bias + n_logit_bias),
|
||||||
/* .to_search = */ {},
|
/* .to_search = */ {},
|
||||||
},
|
}
|
||||||
};
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// infill
|
// infill
|
||||||
|
@ -2377,14 +2384,14 @@ static struct llama_sampler_i llama_sampler_infill_i = {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab) {
|
struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab) {
|
||||||
return new llama_sampler {
|
return llama_sampler_init(
|
||||||
/* .iface = */ &llama_sampler_infill_i,
|
/* .iface = */ &llama_sampler_infill_i,
|
||||||
/* .ctx = */ new llama_sampler_infill {
|
/* .ctx = */ new llama_sampler_infill {
|
||||||
/* .vocab = */ vocab,
|
/* .vocab = */ vocab,
|
||||||
/* .buf0 = */ std::vector<char>(512),
|
/* .buf0 = */ std::vector<char>(512),
|
||||||
/* .buf1 = */ std::vector<char>(512),
|
/* .buf1 = */ std::vector<char>(512),
|
||||||
},
|
}
|
||||||
};
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// utils
|
// utils
|
||||||
|
|
|
@ -7253,7 +7253,7 @@ struct llm_build_context {
|
||||||
struct ggml_tensor * Qcur = nullptr;
|
struct ggml_tensor * Qcur = nullptr;
|
||||||
struct ggml_tensor * Kcur = nullptr;
|
struct ggml_tensor * Kcur = nullptr;
|
||||||
struct ggml_tensor * Vcur = nullptr;
|
struct ggml_tensor * Vcur = nullptr;
|
||||||
if (model.type == LLM_TYPE_1_5B || model.type == LLM_TYPE_4B || model.type == LLM_TYPE_9B) {
|
if (model.layers[il].wqkv == nullptr) {
|
||||||
Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
||||||
if (model.layers[il].bq) {
|
if (model.layers[il].bq) {
|
||||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||||
|
@ -8837,12 +8837,14 @@ static int llama_decode_impl(
|
||||||
//llama_synchronize(&lctx);
|
//llama_synchronize(&lctx);
|
||||||
|
|
||||||
// decide if we need to defrag the kv cache
|
// decide if we need to defrag the kv cache
|
||||||
if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) {
|
if (cparams.causal_attn && cparams.defrag_thold > 0.0f) {
|
||||||
const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f;
|
// - do not defrag small contexts (i.e. < 2048 tokens)
|
||||||
|
// - count the padding towards the number of used tokens
|
||||||
|
const float fragmentation = kv_self.n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self.used + llama_kv_cache_get_padding(cparams))/float(kv_self.n)) : 0.0f;
|
||||||
|
|
||||||
// queue defragmentation for next llama_kv_cache_update
|
// queue defragmentation for next llama_kv_cache_update
|
||||||
if (fragmentation > cparams.defrag_thold) {
|
if (fragmentation > cparams.defrag_thold) {
|
||||||
//LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
|
LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);
|
||||||
|
|
||||||
llama_kv_cache_defrag(kv_self);
|
llama_kv_cache_defrag(kv_self);
|
||||||
}
|
}
|
||||||
|
@ -9469,7 +9471,6 @@ static struct llama_model * llama_model_load_from_file_impl(
|
||||||
struct llama_model_params params) {
|
struct llama_model_params params) {
|
||||||
ggml_time_init();
|
ggml_time_init();
|
||||||
|
|
||||||
llama_model * model = new llama_model(params);
|
|
||||||
|
|
||||||
unsigned cur_percentage = 0;
|
unsigned cur_percentage = 0;
|
||||||
if (params.progress_callback == NULL) {
|
if (params.progress_callback == NULL) {
|
||||||
|
@ -9488,6 +9489,8 @@ static struct llama_model * llama_model_load_from_file_impl(
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
llama_model * model = new llama_model(params);
|
||||||
|
|
||||||
// create list of devices to use with this model
|
// create list of devices to use with this model
|
||||||
if (params.devices) {
|
if (params.devices) {
|
||||||
for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
|
for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
|
||||||
|
|
|
@ -626,7 +626,14 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
|
||||||
result.reserve(utf8.size());
|
result.reserve(utf8.size());
|
||||||
size_t offset = 0;
|
size_t offset = 0;
|
||||||
while (offset < utf8.size()) {
|
while (offset < utf8.size()) {
|
||||||
result.push_back(unicode_cpt_from_utf8(utf8, offset));
|
try {
|
||||||
|
result.push_back(unicode_cpt_from_utf8(utf8, offset));
|
||||||
|
}
|
||||||
|
catch (const std::invalid_argument & /*ex*/) {
|
||||||
|
// Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize
|
||||||
|
++offset;
|
||||||
|
result.emplace_back(0xFFFD); // replacement character
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue