mirror of https://github.com/Lizonghang/prima.cpp.git (synced 2025-09-05 21:39:02 +00:00)
fix compute buffer estimate: don't reserve CUDA VRAM for the output layer
This commit is contained in:
parent 3a03549fed
commit 11ce0d58f7
1 changed file with 14 additions and 3 deletions
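For a sense of scale, the term this commit stops charging to CUDA VRAM is (n_out_embd + n_result) * type_size_f32 + nb_output_w (see the llama_model_compute_buf_size hunk below). A minimal sketch with hypothetical sizes, not taken from the repository, showing why reserving that term for a layer that actually runs on the CPU noticeably inflates the GPU estimate:

// hypothetical sizes, only to illustrate the magnitude of the output-layer term
// that is no longer added to the CUDA VRAM (GPU compute buffer) estimate
#include <cstddef>
#include <cstdio>

int main() {
    const size_t n_out_embd    = 4096;                       // output embedding size (assumed)
    const size_t n_result      = 32000;                      // vocab-sized logits (assumed)
    const size_t nb_output_w   = n_out_embd * n_result * 2;  // f16 output weight bytes (assumed)
    const size_t type_size_f32 = 4;

    const size_t out_term = (n_out_embd + n_result) * type_size_f32 + nb_output_w;
    std::printf("output-layer term: %.2f MiB\n", out_term / (1024.0 * 1024.0));  // ~250 MiB
    return 0;
}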
@@ -20976,7 +20976,7 @@ void * llama_context_setup_backend(
     auto & cparams = ctx->cparams;
 
     std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
     cparams.prefetch = params.prefetch;
     cparams.n_seq_max = std::max(1u, params.n_seq_max);
     cparams.n_threads = params.n_threads;
     cparams.n_threads_batch = params.n_threads_batch;
@@ -21359,6 +21359,16 @@ void * llama_context_setup_backend(
     bool ok = true;
     GGML_ASSERT(ctx->sched.size() == gf.size());
     for (size_t i = 0; i < gf.size(); ++i) {
 
+#if defined(GGML_USE_CUDA)
+        if ((cparams.rank == 0 && (i == 0 || i == gf.size() - 1))
+            || model->n_gpu_layers == 0) {
+            continue;
+        }
+#elif defined(GGML_USE_METAL)
+
+#endif
+
         ok = ok & ggml_backend_sched_reserve(ctx->sched[i], gf[i]);
     }
     if (!ok) {
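The block added above makes CUDA builds skip ggml_backend_sched_reserve for certain subgraphs. A minimal, self-contained sketch of that rule, not the actual setup code: use_cuda stands in for the GGML_USE_CUDA build flag, and the rank-0 special case follows the commit's note that the output layer (and, presumably, the matching input end of the pipeline) stays on the CPU.

// sketch of the skip rule added in the hunk above
#include <cstddef>
#include <cstdint>

static bool skip_cuda_reserve(bool use_cuda, uint32_t rank, size_t i, size_t n_graphs,
                              int n_gpu_layers) {
    if (!use_cuda) {
        return false;  // the early continue only exists in CUDA builds
    }
    // rank 0 skips its first and last subgraphs, and everything is skipped
    // when no layers are offloaded to the GPU at all
    const bool edge_graph_on_head = (rank == 0) && (i == 0 || i == n_graphs - 1);
    return edge_graph_on_head || n_gpu_layers == 0;
}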
@@ -21963,7 +21973,8 @@ void llama_model_compute_buf_size(
             (n_inp_pos + n_kq_mask) * type_size_f32 + n_v * type_size_f16 + nb_attn_norm_w,
         });
     }
-    *gpu_buf += (n_out_embd + n_result) * type_size_f32 + nb_output_w;
+    // we run the output layer on CPU by default
+    // *gpu_buf += (n_out_embd + n_result) * type_size_f32 + nb_output_w;
     gpu_host_buf = (n_inp_toks + n_inp_embd + n_bak_embd + n_inp_pos + n_kq_mask + n_inp_out_ids + n_out_embd) * type_size_f32;
 } else {
     if (has_gpu_layers) {
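A small sketch (not the real llama_model_compute_buf_size) contrasting the old and new GPU-side accounting in the hunk above; the boolean parameter is purely illustrative:

#include <cstddef>

// returns the bytes charged to the CUDA compute-buffer estimate for the output layer
static size_t output_layer_gpu_bytes(bool reserve_output_on_gpu,
                                     size_t n_out_embd, size_t n_result, size_t nb_output_w) {
    const size_t type_size_f32 = 4;
    if (reserve_output_on_gpu) {
        // old behaviour: charge the output activations and the output weight to VRAM
        return (n_out_embd + n_result) * type_size_f32 + nb_output_w;
    }
    // new behaviour: the output layer runs on the CPU by default, so nothing is reserved
    return 0;
}

Calling this with reserve_output_on_gpu = false reproduces the effect of commenting out the *gpu_buf += ... line above.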
@@ -22036,7 +22047,7 @@ void llama_model_compute_buf_size(
 
     LLAMA_LOG_INFO("\n");
     LLAMA_LOG_INFO("%s: here the compute buffer size is a predicted upper bound, not an exact value\n", __func__);
-    LLAMA_LOG_INFO("%s: (rank %d) compute buffer size = %7.2f MiB (GPU) + %7.2f MiB (CPU / GPU-host buffer)\n", __func__,
+    LLAMA_LOG_INFO("%s: (rank %d) compute buffer size = %7.2f MiB (GPU) + %7.2f MiB (CPU & GPU-host buffer)\n", __func__,
         my_rank, *gpu_buf / (1024.0 * 1024.0), *cpu_buf / (1024.0 * 1024.0));
 
     if (backend == BACKEND_CUDA) {