embeddings memory usage regression fix

This commit is contained in:
Concedo 2026-01-18 16:26:52 +08:00
parent 3816391a74
commit 7b4517c2fe
3 changed files with 135 additions and 39 deletions

View file

@@ -413,6 +413,7 @@ void llama_context::sched_reserve() {
sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, cparams.pipeline_parallel, cparams.op_offload));
if (memory) { //added by kcpp to reduce embeddings memory usage
llama_memory_context_ptr mctx;
if (memory) {
LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
@@ -547,6 +548,7 @@ void llama_context::sched_reserve() {
LLAMA_LOG_INFO("%s: reserve took %.2f ms, sched copies = %d\n",
__func__, (t_end_us - t_start_us)/1000.0, ggml_backend_sched_get_n_copies(sched.get()));
} //end kcpp fix to reduce embeddings memory usage
}
void llama_context::synchronize() {