mirror of https://github.com/Lizonghang/prima.cpp.git
synced 2025-09-07 21:09:11 +00:00

fix cuda support

This commit is contained in:
  parent 684b2ac05b
  commit 9d6a6845ac

1 changed file with 17 additions and 4 deletions
@@ -17746,6 +17746,11 @@ static int llama_decode_internal(
             }
         }
 
+        if (i > 0) {
+            // ensure ggml_backend_tensor_get_async of the previous subgraph has finished
+            ggml_backend_sched_synchronize(lctx.sched[i - 1]);
+        }
+
         llama_set_inputs(lctx, ubatch);
 
         { // compute graph
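Note on the hunk above: the output of subgraph i-1 is copied back with
ggml_backend_tensor_get_async, and llama_set_inputs may reuse the host buffer
that copy writes into, so subgraph i must first wait for the previous
scheduler to drain. A minimal sketch of the pattern, assuming only the ggml
backend API (the driver function and its parameters are hypothetical):

    #include "ggml-backend.h"

    // Hypothetical pipeline loop: before preparing inputs for subgraph i,
    // wait for the async read-back issued on subgraph i-1's scheduler.
    void run_subgraphs(ggml_backend_sched_t * scheds, int n_sub) {
        for (int i = 0; i < n_sub; ++i) {
            if (i > 0) {
                // blocks until all pending work on scheds[i-1] completes,
                // including ggml_backend_tensor_get_async copies to host memory
                ggml_backend_sched_synchronize(scheds[i - 1]);
            }
            // ... set inputs here (may reuse the buffer the async copy fills)
            // ... build and compute subgraph i
        }
    }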
@@ -17769,9 +17774,15 @@ static int llama_decode_internal(
 
         // send the result to the next node (or the master)
         if (n_world == 1 || (my_rank == 0 && is_last_l)) {
-            size_t buf_size = sub_gf_out->ne[0]*sub_gf_out->ne[1]*sizeof(float);
+            size_t buf_size = sub_gf_out->ne[0] * sub_gf_out->ne[1] * sizeof(float);
             float * embd_buf = is_last_l ? ubatch.out_embd : ubatch.backend_embd;
-            ggml_backend_tensor_get(sub_gf_out, embd_buf, 0, buf_size);
+            ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(lctx.sched[i], sub_gf_out);
+
+            GGML_ASSERT(buf_size <= ggml_nbytes(sub_gf_out));
+            GGML_ASSERT(backend != nullptr);
+            GGML_ASSERT(embd_buf != nullptr);
+
+            ggml_backend_tensor_get_async(backend, sub_gf_out, embd_buf, 0, buf_size);
         } else {
             input_tensors tensors;
             tensors.sub_gf_out = sub_gf_out;
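Note on the hunk above: the blocking ggml_backend_tensor_get is replaced by
ggml_backend_tensor_get_async, which takes the backend that owns the tensor;
ggml_backend_sched_get_tensor_backend looks it up from the scheduler. A hedged
sketch of the same read-back (the function name and parameters are
illustrative, not the prima.cpp signatures):

    #include "ggml.h"
    #include "ggml-backend.h"

    // Illustrative async read-back of a 2-D float graph output into host memory.
    void copy_result(ggml_backend_sched_t sched, struct ggml_tensor * out, float * host_buf) {
        // bytes to copy, matching the diff's ne[0] * ne[1] float layout
        size_t buf_size = out->ne[0] * out->ne[1] * sizeof(float);

        // the async API is per-backend, so find which backend produced `out`
        ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, out);

        GGML_ASSERT(buf_size <= ggml_nbytes(out)); // never read past the tensor
        GGML_ASSERT(backend != nullptr);
        GGML_ASSERT(host_buf != nullptr);

        // returns immediately; host_buf is only valid after
        // ggml_backend_sched_synchronize(sched), as in the first hunk
        ggml_backend_tensor_get_async(backend, out, host_buf, 0, buf_size);
    }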
@@ -21800,7 +21811,9 @@ int32_t llama_decode(
 }
 
 void llama_synchronize(struct llama_context * ctx) {
-    ggml_backend_sched_synchronize(ctx->sched.at(0)); // todo.
+    for (ggml_backend_sched_t sched : ctx->sched) {
+        ggml_backend_sched_synchronize(sched);
+    }
 
     // FIXME: if multiple single tokens are evaluated without a synchronization,
     // the stats will be added to the prompt evaluation stats
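Note on the hunk above: with one scheduler per subgraph, synchronizing only
sched.at(0) (the old "todo" line) would let async copies on later schedulers
outlive llama_synchronize; the fix drains every scheduler. A minimal sketch,
assuming a std::vector of schedulers as the diff's ctx->sched suggests:

    #include <vector>
    #include "ggml-backend.h"

    // Drain all pending work: with pipelined subgraphs, each scheduler may
    // still have async compute or tensor copies in flight.
    void synchronize_all(const std::vector<ggml_backend_sched_t> & scheds) {
        for (ggml_backend_sched_t sched : scheds) {
            ggml_backend_sched_synchronize(sched);
        }
    }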