From e8d3e5a6314646ae1bb4b7ff9bcae27d0d8e2214 Mon Sep 17 00:00:00 2001
From: "Li, Zonghang" <870644199@qq.com>
Date: Fri, 27 Jun 2025 20:16:30 +0400
Subject: [PATCH] update README

---
 README.md     | 8 ++++----
 src/llama.cpp | 6 ++----
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index d312aa11..35191f3e 100644
--- a/README.md
+++ b/README.md
@@ -232,10 +232,10 @@ Assume we have a host machine with at least 32 CPU cores, 32 GiB RAM, and 32 GiB
 1. Pull our prebuilt Docker image (e.g., [`prima.cpp:1.0.2-cuda`](https://hub.docker.com/repository/docker/lizonghango00o1/prima.cpp/general)) and run 4 containers:
 
 ```shell
-sudo docker run -dit --name prima-v1 --memory=8gb --memory-swap=8gb --cpus 8 --cpuset-cpus="0-7" --network host --gpus all prima.cpp:1.0.1-cuda
-sudo docker run -dit --name prima-v2 --memory=8gb --memory-swap=8gb --cpus 8 --cpuset-cpus="8-15" --network host --gpus all prima.cpp:1.0.1-cuda
-sudo docker run -dit --name prima-v3 --memory=8gb --memory-swap=8gb --cpus 8 --cpuset-cpus="16-23" --network host --gpus all prima.cpp:1.0.1-cuda
-sudo docker run -dit --name prima-v4 --memory=8gb --memory-swap=8gb --cpus 8 --cpuset-cpus="24-31" --network host --gpus all prima.cpp:1.0.1-cuda
+sudo docker run -dit --name prima-v1 --memory=8gb --memory-swap=8gb --cpus 8 --cpuset-cpus="0-7" --network host --gpus all prima.cpp:1.0.2-cuda
+sudo docker run -dit --name prima-v2 --memory=8gb --memory-swap=8gb --cpus 8 --cpuset-cpus="8-15" --network host --gpus all prima.cpp:1.0.2-cuda
+sudo docker run -dit --name prima-v3 --memory=8gb --memory-swap=8gb --cpus 8 --cpuset-cpus="16-23" --network host --gpus all prima.cpp:1.0.2-cuda
+sudo docker run -dit --name prima-v4 --memory=8gb --memory-swap=8gb --cpus 8 --cpuset-cpus="24-31" --network host --gpus all prima.cpp:1.0.2-cuda
 ```
 
 2. Download the model file [`qwq-32b-q4_k_m.gguf`](https://huggingface.co/Qwen/QwQ-32B-GGUF) and copy it into each container:

diff --git a/src/llama.cpp b/src/llama.cpp
index 01cf82a2..871daac7 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -21365,8 +21365,6 @@ void * llama_context_setup_backend(
             || model->n_gpu_layers == 0) {
             continue;
         }
-#elif defined(GGML_USE_METAL)
-
 #endif
 
         ok = ok & ggml_backend_sched_reserve(ctx->sched[i], gf[i]);
@@ -21933,10 +21931,10 @@ void llama_model_compute_buf_size(
     const int64_t n_result = hparams.n_vocab * cparams.n_ubatch;
 
     // weights
-    const int64_t nb_output_w    = n_bytes.nb_output_w;
     const int64_t nb_attn_norm_w = n_bytes.nb_attn_norm_w;
     const int64_t nb_attn_q_w    = n_bytes.nb_attn_q_w;
-
+    // const int64_t nb_output_w = n_bytes.nb_output_w;
+
     // format bytes
     const int64_t type_size_f32 = ggml_type_size(GGML_TYPE_F32);
     const int64_t type_size_f16 = ggml_type_size(GGML_TYPE_F16);