mirror of https://github.com/Lizonghang/prima.cpp.git
synced 2025-09-10 06:44:36 +00:00

Merge branch 'main' into dev
commit a01fafd126

3 changed files with 41 additions and 11 deletions
@@ -1643,6 +1643,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {

     if (n_world == 1) {
         uint32_t n_layers = llama_model_n_layers(model);

         // assign all layers to this device
         params.n_layer_window[0]  = n_layers;
         cparams.n_layer_window[0] = n_layers;
@@ -1651,6 +1652,8 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {

 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
         params.n_gpu_layers  = std::min((int32_t)n_layers, params.n_gpu_layers);
+        cparams.n_gpu_layers = params.n_gpu_layers;
+        mparams.n_gpu_layers = params.n_gpu_layers;
 #endif

     } else {
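Note: the two added lines keep all three parameter structs in agreement after the GPU layer count is clamped to the model's real layer count. A minimal sketch of that propagation, with hypothetical stand-in structs in place of gpt_params / llama_context_params / llama_model_params:

#include <algorithm>
#include <cstdint>

// Hypothetical stand-ins; only the n_gpu_layers field mirrors the diff.
struct gpt_params_sketch     { int32_t n_gpu_layers; };
struct context_params_sketch { int32_t n_gpu_layers; };
struct model_params_sketch   { int32_t n_gpu_layers; };

// Clamp the requested GPU layer count to the model's layer count, then
// mirror the result into the context and model params (what the hunk adds).
static void sync_gpu_layers(uint32_t n_layers,
                            gpt_params_sketch     & params,
                            context_params_sketch & cparams,
                            model_params_sketch   & mparams) {
    params.n_gpu_layers  = std::min((int32_t) n_layers, params.n_gpu_layers);
    cparams.n_gpu_layers = params.n_gpu_layers;
    mparams.n_gpu_layers = params.n_gpu_layers;
}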
@@ -48,6 +48,16 @@
 #include <dirent.h>

+static int gcd_int(int a, int b) {
+    while (b != 0) {
+        int t = b;
+        b = a % b;
+        a = t;
+    }
+    return a;
+}
+
 static size_t get_page_size() {
     size_t page_size = 0;
@@ -155,7 +165,24 @@ uint32_t device_cpu_cores() {
 static float device_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, enum profiler_backend_type btype, int n_threads) {
     int n_repeat = 1;
     int n_embd   = std::min(llama_n_embd(model), 4096);
-    if (btype == PROFILER_BACKEND_TYPE_CPU) n_embd /= 8; // simulate small tensor calculation on cpu
+
+    // simulate small tensor calculation on cpu
+    if (btype == PROFILER_BACKEND_TYPE_CPU) n_embd /= 8;
+
+    // ensure that the block sizes of the tensors are compatible
+    int bs0 = ggml_blck_size(src0t);
+    int bs1 = ggml_blck_size(src1t);
+    int gcd = gcd_int(bs0, bs1);
+    int lcm = bs0 / gcd * bs1;
+
+    if (n_embd % bs0 != 0 || n_embd % bs1 != 0) {
+        if (n_embd < lcm) {
+            n_embd = 2 * lcm;
+        } else {
+            n_embd = 2 * (n_embd / lcm) * lcm;
+        }
+    }

     std::vector<float> matrix_A(n_embd * n_embd, 1.0f);
     std::vector<float> matrix_B(n_embd * n_embd, 1.0f / n_embd);
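Note: the alignment logic is worth a worked example. Quantized ggml types store rows in fixed-size blocks (a 4-bit quantized type typically uses blocks of 32 values, F32 has block size 1), so the benchmark matrix dimension must be divisible by both operand block sizes. A self-contained sketch of the same rounding rule, with illustrative block sizes in place of values queried via ggml_blck_size:

#include <cstdio>

static int gcd_int(int a, int b) {
    while (b != 0) { int t = b; b = a % b; a = t; }
    return a;
}

// Round n_embd to a multiple of lcm(bs0, bs1), as device_flops does, so an
// n_embd x n_embd test matrix is valid for both tensor types.
static int align_n_embd(int n_embd, int bs0, int bs1) {
    int lcm = bs0 / gcd_int(bs0, bs1) * bs1;
    if (n_embd % bs0 != 0 || n_embd % bs1 != 0) {
        n_embd = (n_embd < lcm) ? 2 * lcm : 2 * (n_embd / lcm) * lcm;
    }
    return n_embd;
}

int main() {
    // illustrative block sizes: 32 (quantized) vs 1 (F32), so lcm = 32
    printf("%d\n", align_n_embd(4096, 32, 1)); // already divisible -> 4096
    printf("%d\n", align_n_embd(500, 32, 1));  // 2 * (500/32) * 32 = 960
    printf("%d\n", align_n_embd(20, 32, 1));   // 20 < 32 -> 2 * 32 = 64
    return 0;
}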
@@ -188,9 +215,6 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
     };
     struct ggml_context * ctx = ggml_init(params);

-    if(n_embd < ggml_blck_size(src0t)){
-        n_embd = 2 * ggml_blck_size(src0t);
-    }
     struct ggml_tensor * tensor_a = ggml_new_tensor_2d(ctx, src0t, n_embd, n_embd);
     struct ggml_tensor * tensor_b = ggml_new_tensor_2d(ctx, src1t, n_embd, n_embd);
@@ -415,7 +439,7 @@ float device_inp_embd_delay(struct llama_model * model, enum ggml_type src0t, in
     }

     // warm-up
-    // ggml_backend_graph_compute(backend, gf);
+    ggml_backend_graph_compute(backend, gf);

     const int64_t t_start = ggml_time_us();
     ggml_backend_graph_compute(backend, gf);
@@ -1264,6 +1288,9 @@ static float device_mem_copy(struct llama_model * model, enum profiler_backend_t
         ggml_backend_cpu_set_n_threads(backend, n_threads);
     }

+    // warm-up
+    ggml_backend_graph_compute(backend, gf);
+
     const int64_t t_start = ggml_time_us();
     ggml_backend_graph_compute(backend, gf);
     const int64_t t_end = ggml_time_us();
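Note: both profiler hunks apply the same measurement discipline: run the graph once untimed so one-time costs (lazy allocations, cold caches, backend setup) do not pollute the result, then time a second run. A generic sketch of the pattern, using std::chrono in place of ggml_time_us and a dummy workload in place of ggml_backend_graph_compute:

#include <chrono>
#include <cstdio>

// Stand-in for the expensive operation being profiled.
static void run_graph() {
    volatile double acc = 0.0;
    for (int i = 0; i < 1000000; i++) acc += i * 0.5;
}

int main() {
    run_graph(); // warm-up: absorbs one-time setup costs, not timed

    auto t_start = std::chrono::steady_clock::now();
    run_graph(); // timed run reflects steady-state performance
    auto t_end = std::chrono::steady_clock::now();

    auto us = std::chrono::duration_cast<std::chrono::microseconds>(t_end - t_start).count();
    printf("compute took %lld us\n", (long long) us);
    return 0;
}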
@@ -18225,7 +18225,7 @@ static int llama_decode_internal(
         [&]{ llama_kv_cache_clear (&lctx); },
         [&]{ llama_send_kv_cache_clear (&lctx); },
         is_last_dev)) {
-        LLAMA_LOG_INFO("%s: received signal kv_cache_clear\n", __func__);
+        LLAMA_LOG_DEBUG("%s: received signal kv_cache_clear\n", __func__);
         return -1;
     }

@@ -18233,7 +18233,7 @@ static int llama_decode_internal(
         [&]{ llama_kv_cache_seq_rm (&lctx, meta.rm_seq_id, meta.rm_p0, meta.rm_p1); },
         [&]{ llama_send_kv_cache_seq_rm (&lctx, meta.rm_seq_id, meta.rm_p0, meta.rm_p1); },
         is_last_dev)) {
-        LLAMA_LOG_INFO("%s: received signal kv_cache_seq_rm\n", __func__);
+        LLAMA_LOG_DEBUG("%s: received signal kv_cache_seq_rm\n", __func__);
         return -1;
     }

@@ -18241,7 +18241,7 @@ static int llama_decode_internal(
         [&]{ llama_kv_cache_seq_add (&lctx, meta.add_seq_id, meta.add_p0, meta.add_p1, meta.add_delta); },
         [&]{ llama_send_kv_cache_seq_add(&lctx, meta.add_seq_id, meta.add_p0, meta.add_p1, meta.add_delta); },
         is_last_dev)) {
-        LLAMA_LOG_INFO("%s: received signal kv_cache_seq_add\n", __func__);
+        LLAMA_LOG_DEBUG("%s: received signal kv_cache_seq_add\n", __func__);
         return -1;
     }

@@ -18249,7 +18249,7 @@ static int llama_decode_internal(
         [&]{ llama_kv_cache_seq_cp (&lctx, meta.cp_src_seq_id, meta.cp_dst_seq_id, meta.cp_p0, meta.cp_p1); },
         [&]{ llama_send_kv_cache_seq_cp (&lctx, meta.cp_src_seq_id, meta.cp_dst_seq_id, meta.cp_p0, meta.cp_p1); },
         is_last_dev)) {
-        LLAMA_LOG_INFO("%s: received signal kv_cache_seq_cp\n", __func__);
+        LLAMA_LOG_DEBUG("%s: received signal kv_cache_seq_cp\n", __func__);
         return -1;
     }

@@ -18257,7 +18257,7 @@ static int llama_decode_internal(
         [&]{ llama_kv_cache_seq_div (&lctx, meta.div_seq_id, meta.div_p0, meta.div_p1, meta.div_factor); },
         [&]{ llama_send_kv_cache_seq_div(&lctx, meta.div_seq_id, meta.div_p0, meta.div_p1, meta.div_factor); },
         is_last_dev)) {
-        LLAMA_LOG_INFO("%s: received signal kv_cache_seq_div\n", __func__);
+        LLAMA_LOG_DEBUG("%s: received signal kv_cache_seq_div\n", __func__);
         return -1;
     }
 }
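Note: all five hunks touch the same dispatch shape: a device applies the kv-cache operation locally and, unless it is the last device in the pipeline, forwards the same operation downstream; the log line only records that the routine signal arrived, hence the demotion from INFO to DEBUG. A hedged sketch of that apply-then-forward shape (the lambda pair and is_last_dev flag come from the diff; the helper name and signature here are hypothetical):

#include <functional>

// Hypothetical helper mirroring the call shape in llama_decode_internal:
// run the local kv-cache mutation, then propagate it unless this device
// is the last one in the pipeline.
static bool apply_and_forward(const std::function<void()> & apply_local,
                              const std::function<void()> & send_downstream,
                              bool is_last_dev) {
    apply_local();
    if (!is_last_dev) {
        send_downstream();
    }
    return true; // signal handled; the caller logs and returns early
}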