From 82787be7eb93a02e15147fb69a4b3b28e7ae92e0 Mon Sep 17 00:00:00 2001
From: leeetao <3122669219@qq.com>
Date: Tue, 1 Jul 2025 09:19:19 +0000
Subject: [PATCH] Enable distributed model perplexity measurement for
 different bit-width models with -lw and -ngl parameters

---
 examples/perplexity/perplexity.cpp |  4 ++--
 src/llama.cpp                      | 27 ++++++++++++++++++++-------
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 9d04cfc0..550b1b20 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -533,8 +533,8 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
         llama_send_meta(ctx, &meta);
         LOG_INF("%s: rank 0 tokens_size sent successfully\n", __func__);
     } else {
-        LOG_INF("%s: rank %d waiting 5 seconds for rank 0 to complete tokenization\n", __func__, my_rank);
-        std::this_thread::sleep_for(std::chrono::milliseconds(5000));
+        LOG_INF("%s: rank %d waiting 7 seconds for rank 0 to complete tokenization\n", __func__, my_rank);
+        std::this_thread::sleep_for(std::chrono::milliseconds(7000));
         LOG_INF("%s: rank %d delay completed, now receiving tokens_size\n", __func__, my_rank);
         if (llama_recv_meta(ctx, &meta) == -1) {
             return { {}, -1.0, {}, {} };
diff --git a/src/llama.cpp b/src/llama.cpp
index 30645576..1020277c 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17892,10 +17892,8 @@ void llama_send_meta(llama_context * ctx, struct sync_meta * meta) {
         send_msgs.emplace_back("tokens_size", strlen("tokens_size"));
         send_msgs.emplace_back(&(meta->tokens_size), sizeof(meta->tokens_size));
 
-        if (meta->n_chunks >= 0) {
-            send_msgs.emplace_back("n_chunks", strlen("n_chunks"));
-            send_msgs.emplace_back(&(meta->n_chunks), sizeof(meta->n_chunks));
-        }
+        send_msgs.emplace_back("n_chunks", strlen("n_chunks"));
+        send_msgs.emplace_back(&(meta->n_chunks), sizeof(meta->n_chunks));
 
         zmq::send_multipart(*send_socket, send_msgs);
         return;
@@ -18015,6 +18013,11 @@ int llama_recv_meta(llama_context * ctx, struct sync_meta * meta) {
 
     recv_socket->set(zmq::sockopt::rcvtimeo, -1);
 
+    if (recv_msgs.size() < 2) {
+        LLAMA_LOG_ERROR("Invalid message format: too few messages\n");
+        return -1;
+    }
+
     const std::string cmd = recv_msgs[0].to_string();
     size_t idx = 1;
 
@@ -18023,8 +18026,6 @@ int llama_recv_meta(llama_context * ctx, struct sync_meta * meta) {
         return 0;
     }
 
-
-
     if (cmd == "kv_seq_rm" && recv_msgs.size() == 4) {
         meta->kv_seq_rm = true;
         std::memcpy(&meta->rm_seq_id, recv_msgs[idx++].data(), sizeof(meta->rm_seq_id));
@@ -18060,6 +18061,19 @@ int llama_recv_meta(llama_context * ctx, struct sync_meta * meta) {
         return 0;
     }
 
+    if (cmd == "tokens_size" && recv_msgs.size() == 4) {
+        std::memcpy(&(meta->tokens_size), recv_msgs[1].data(), sizeof(meta->tokens_size));
+
+        std::string chunks_key = recv_msgs[2].to_string();
+        if (chunks_key == "n_chunks") {
+            std::memcpy(&(meta->n_chunks), recv_msgs[3].data(), sizeof(meta->n_chunks));
+        } else {
+            LLAMA_LOG_ERROR("Expected 'n_chunks' key but got '%s'\n", chunks_key.c_str());
+            return -1;
+        }
+        return 0;
+    }
+
     if (recv_msgs.size() % 2 != 0) {
         LLAMA_LOG_ERROR("Invalid message format: odd number of messages\n");
         return -1;
@@ -18357,7 +18371,6 @@ static int llama_decode_internal(
 
     GGML_ASSERT(!(my_rank == 0 && n_tokens_all == 0) && "n_tokens == 0 on master node");
 
-    GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
     if (batch_all.token) {
         for (uint32_t i = 0; i < n_tokens_all; ++i) {
             if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
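
The new "tokens_size" branch in llama_recv_meta() relies on a fixed 4-frame ZeroMQ multipart
layout: [ "tokens_size", <tokens_size bytes>, "n_chunks", <n_chunks bytes> ]. The following is a
minimal standalone sketch of both sides of that exchange using cppzmq, mirroring the send/receive
pattern from llama_send_meta()/llama_recv_meta(); the helper names, the payload types, and any
socket setup are illustrative assumptions and are not part of this patch.

// Minimal sketch (not this patch's code) of the 4-frame metadata exchange:
//   [ "tokens_size", <size_t bytes>, "n_chunks", <int bytes> ]
// The helper names and the int type of n_chunks are assumptions.
#include <cstring>
#include <iterator>
#include <string>
#include <vector>
#include <zmq.hpp>
#include <zmq_addon.hpp>

// Rank 0: pack both key/value pairs into a single multipart message.
static void send_tokens_meta(zmq::socket_t & sock, size_t tokens_size, int n_chunks) {
    std::vector<zmq::message_t> msgs;
    msgs.emplace_back("tokens_size", strlen("tokens_size"));
    msgs.emplace_back(&tokens_size, sizeof(tokens_size));
    msgs.emplace_back("n_chunks", strlen("n_chunks"));
    msgs.emplace_back(&n_chunks, sizeof(n_chunks));
    zmq::send_multipart(sock, msgs);
}

// Worker rank: receive and validate the frames in the same order the patched
// llama_recv_meta() checks them (frame count, "tokens_size" key, "n_chunks" key).
static int recv_tokens_meta(zmq::socket_t & sock, size_t & tokens_size, int & n_chunks) {
    std::vector<zmq::message_t> msgs;
    if (!zmq::recv_multipart(sock, std::back_inserter(msgs))) {
        return -1;
    }
    if (msgs.size() != 4 || msgs[0].to_string() != "tokens_size") {
        return -1;
    }
    std::memcpy(&tokens_size, msgs[1].data(), sizeof(tokens_size));
    if (msgs[2].to_string() != "n_chunks") {
        return -1;
    }
    std::memcpy(&n_chunks, msgs[3].data(), sizeof(n_chunks));
    return 0;
}

In the patch itself the worker ranks keep the receive timeout at -1 (block until a message
arrives) and instead sleep for a fixed 7 seconds in perplexity.cpp before calling
llama_recv_meta(), giving rank 0 time to finish tokenization and send the metadata.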