From e8d3e5a6314646ae1bb4b7ff9bcae27d0d8e2214 Mon Sep 17 00:00:00 2001
From: "Li, Zonghang" <870644199@qq.com>
Date: Fri, 27 Jun 2025 20:16:30 +0400
Subject: [PATCH] update README

---
 README.md     | 8 ++++----
 src/llama.cpp | 6 ++----
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index d312aa11..35191f3e 100644
--- a/README.md
+++ b/README.md
@@ -232,10 +232,10 @@ Assume we have a host machine with at least 32 CPU cores, 32 GiB RAM, and 32 GiB
 1. Pull our prebuilt Docker image (e.g., [`prima.cpp:1.0.2-cuda`](https://hub.docker.com/repository/docker/lizonghango00o1/prima.cpp/general)) and run 4 containers:
 
 ```shell
-sudo docker run -dit --name prima-v1 --memory=8gb --memory-swap=8gb --cpus 8 --cpuset-cpus="0-7" --network host --gpus all prima.cpp:1.0.1-cuda
-sudo docker run -dit --name prima-v2 --memory=8gb --memory-swap=8gb --cpus 8 --cpuset-cpus="8-15" --network host --gpus all prima.cpp:1.0.1-cuda
-sudo docker run -dit --name prima-v3 --memory=8gb --memory-swap=8gb --cpus 8 --cpuset-cpus="16-23" --network host --gpus all prima.cpp:1.0.1-cuda
-sudo docker run -dit --name prima-v4 --memory=8gb --memory-swap=8gb --cpus 8 --cpuset-cpus="24-31" --network host --gpus all prima.cpp:1.0.1-cuda
+sudo docker run -dit --name prima-v1 --memory=8gb --memory-swap=8gb --cpus 8 --cpuset-cpus="0-7" --network host --gpus all prima.cpp:1.0.2-cuda
+sudo docker run -dit --name prima-v2 --memory=8gb --memory-swap=8gb --cpus 8 --cpuset-cpus="8-15" --network host --gpus all prima.cpp:1.0.2-cuda
+sudo docker run -dit --name prima-v3 --memory=8gb --memory-swap=8gb --cpus 8 --cpuset-cpus="16-23" --network host --gpus all prima.cpp:1.0.2-cuda
+sudo docker run -dit --name prima-v4 --memory=8gb --memory-swap=8gb --cpus 8 --cpuset-cpus="24-31" --network host --gpus all prima.cpp:1.0.2-cuda
 ```
 
 2. Download the model file [`qwq-32b-q4_k_m.gguf`](https://huggingface.co/Qwen/QwQ-32B-GGUF) and copy it into each container:

diff --git a/src/llama.cpp b/src/llama.cpp
index 01cf82a2..871daac7 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -21365,8 +21365,6 @@ void * llama_context_setup_backend(
             || model->n_gpu_layers == 0) {
             continue;
         }
-#elif defined(GGML_USE_METAL)
-
 #endif
 
         ok = ok & ggml_backend_sched_reserve(ctx->sched[i], gf[i]);
@@ -21933,10 +21931,10 @@ void llama_model_compute_buf_size(
     const int64_t n_result = hparams.n_vocab * cparams.n_ubatch;
 
     // weights
-    const int64_t nb_output_w    = n_bytes.nb_output_w;
     const int64_t nb_attn_norm_w = n_bytes.nb_attn_norm_w;
     const int64_t nb_attn_q_w    = n_bytes.nb_attn_q_w;
-
+    // const int64_t nb_output_w = n_bytes.nb_output_w;
+
     // format bytes
     const int64_t type_size_f32 = ggml_type_size(GGML_TYPE_F32);
     const int64_t type_size_f16 = ggml_type_size(GGML_TYPE_F16);