embeddings memory usage regression fix

This commit is contained in:
Concedo 2026-01-18 16:26:52 +08:00
parent 3816391a74
commit 7b4517c2fe
3 changed files with 135 additions and 39 deletions

View file

@@ -413,6 +413,7 @@ void llama_context::sched_reserve() {
sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, cparams.pipeline_parallel, cparams.op_offload));
if (memory) { //added by kcpp to reduce embeddings memory usage
llama_memory_context_ptr mctx;
if (memory) {
LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
@@ -547,6 +548,7 @@ void llama_context::sched_reserve() {
LLAMA_LOG_INFO("%s: reserve took %.2f ms, sched copies = %d\n",
__func__, (t_end_us - t_start_us)/1000.0, ggml_backend_sched_get_n_copies(sched.get()));
} //end kcpp fix to reduce embeddings memory usage
}
void llama_context::synchronize() {