From b019a707b8c4480f9e4d7607fc15bcb62d76ccc0 Mon Sep 17 00:00:00 2001
From: "Li, Zonghang" <870644199@qq.com>
Date: Sun, 13 Jul 2025 13:42:24 +0800
Subject: [PATCH] server: fix bugs

This patch fixes several issues:

- propagate the context size argument to the speculative decoding
  context (params.speculative.n_ctx)
- reset model_dft to nullptr after freeing the draft model and context
- free the sockets before cleanup when the server fails to load the model
- size the pos metadata by n_tokens instead of n_ctx in llama_send_meta,
  llama_recv_meta, and llama_decode_internal
- reserve an extra 300 MiB of Metal compute buffer headroom to prevent
  accidental OOM

---
 common/arg.cpp             |  1 +
 examples/server/server.cpp |  4 ++++
 src/llama.cpp              | 13 ++++++++-----
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index a338f613..45954b52 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -673,6 +673,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
         [](gpt_params & params, int value) {
             params.n_ctx = value;
+            params.speculative.n_ctx = value;
         }
     ).set_env("LLAMA_ARG_CTX_SIZE"));
     add_opt(llama_arg(
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 5cbd7f38..3844c886 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -760,6 +760,8 @@ struct server_context {
                 llama_free      (llama_init_dft.context);
                 llama_free_model(llama_init_dft.model);
 
+                model_dft = nullptr;
+
                 return false;
             }
 
@@ -3566,6 +3568,8 @@ int main(int argc, char ** argv) {
     LOG_INF("%s: loading model\n", __func__);
 
     if (!ctx_server.load_model(params)) {
+        char * stop_signal = nullptr;
+        llama_free_sockets(ctx_server.ctx, &stop_signal);
         clean_up();
         t.join();
         LOG_ERR("%s: exiting due to model loading error\n", __func__);
diff --git a/src/llama.cpp b/src/llama.cpp
index a57715e3..9aa9cd82 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17878,7 +17878,7 @@ static void llama_send_meta(zmq::socket_t & socket, struct sync_meta * meta, boo
 
     if (meta->pos != nullptr) {
         send_msgs.emplace_back("pos", strlen("pos"));
-        send_msgs.emplace_back(meta->pos, meta->n_ctx * sizeof(llama_pos));
+        send_msgs.emplace_back(meta->pos, meta->n_tokens * sizeof(llama_pos));
     }
 
     if (meta->n_seq_id != nullptr) {
@@ -17986,8 +17986,8 @@ static int llama_recv_meta(zmq::socket_t & socket, struct sync_meta * meta) {
         }
 
         if (key == "pos") {
-            meta->pos = (llama_pos *) malloc(meta->n_ctx * sizeof(llama_pos));
-            std::memcpy(meta->pos, data_msg.data(), meta->n_ctx * sizeof(llama_pos));
+            meta->pos = (llama_pos *) malloc(meta->n_tokens * sizeof(llama_pos));
+            std::memcpy(meta->pos, data_msg.data(), meta->n_tokens * sizeof(llama_pos));
         }
 
         if (key == "n_seq_id") {
@@ -18304,8 +18304,8 @@ static int llama_decode_internal(
     if (meta.n_tokens > 0) {
         batch_all.n_tokens = meta.n_tokens;
         if (meta.pos != nullptr) {
-            batch_all.pos = (llama_pos *) malloc(meta.n_ctx * sizeof(llama_pos));
-            std::memcpy(batch_all.pos, meta.pos, meta.n_ctx * sizeof(llama_pos));
+            batch_all.pos = (llama_pos *) malloc(meta.n_tokens * sizeof(llama_pos));
+            std::memcpy(batch_all.pos, meta.pos, meta.n_tokens * sizeof(llama_pos));
         }
         if (meta.n_seq_id != nullptr) {
             batch_all.n_seq_id = (int32_t *) malloc(meta.n_tokens * sizeof(int32_t));
@@ -22089,6 +22089,9 @@ void llama_model_compute_buf_size(
         // this value may vary by GPU and CUDA version, but it's lower than 400 MiB in most cases,
         // another 300 MiB is used to prevent accidental OOM.
         *gpu_buf += 700 * 1024 * 1024;
+    } else if (backend == BACKEND_METAL) {
+        // 300 MiB is used to prevent accidental OOM, e.g., automatic quantization conversion.
+        *gpu_buf += 300 * 1024 * 1024;
     }
 }
 