Mirror of https://github.com/Lizonghang/prima.cpp.git (synced 2025-09-07 14:19:03 +00:00)
update README
commit e8d3e5a631 (parent 11ce0d58f7)
2 changed files with 6 additions and 8 deletions
@@ -232,10 +232,10 @@ Assume we have a host machine with at least 32 CPU cores, 32 GiB RAM, and 32 GiB
 1. Pull our prebuilt Docker image (e.g., [`prima.cpp:1.0.2-cuda`](https://hub.docker.com/repository/docker/lizonghango00o1/prima.cpp/general)) and run 4 containers:

    ```shell
-   sudo docker run -dit --name prima-v1 --memory=8gb --memory-swap=8gb --cpus 8 --cpuset-cpus="0-7" --network host --gpus all prima.cpp:1.0.1-cuda
+   sudo docker run -dit --name prima-v1 --memory=8gb --memory-swap=8gb --cpus 8 --cpuset-cpus="0-7" --network host --gpus all prima.cpp:1.0.2-cuda
-   sudo docker run -dit --name prima-v2 --memory=8gb --memory-swap=8gb --cpus 8 --cpuset-cpus="8-15" --network host --gpus all prima.cpp:1.0.1-cuda
+   sudo docker run -dit --name prima-v2 --memory=8gb --memory-swap=8gb --cpus 8 --cpuset-cpus="8-15" --network host --gpus all prima.cpp:1.0.2-cuda
-   sudo docker run -dit --name prima-v3 --memory=8gb --memory-swap=8gb --cpus 8 --cpuset-cpus="16-23" --network host --gpus all prima.cpp:1.0.1-cuda
+   sudo docker run -dit --name prima-v3 --memory=8gb --memory-swap=8gb --cpus 8 --cpuset-cpus="16-23" --network host --gpus all prima.cpp:1.0.2-cuda
-   sudo docker run -dit --name prima-v4 --memory=8gb --memory-swap=8gb --cpus 8 --cpuset-cpus="24-31" --network host --gpus all prima.cpp:1.0.1-cuda
+   sudo docker run -dit --name prima-v4 --memory=8gb --memory-swap=8gb --cpus 8 --cpuset-cpus="24-31" --network host --gpus all prima.cpp:1.0.2-cuda
    ```

 2. Download the model file [`qwq-32b-q4_k_m.gguf`](https://huggingface.co/Qwen/QwQ-32B-GGUF) and copy it into each container:
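The commands for step 2 are not included in this hunk. For context only, a minimal sketch of how the downloaded model file could be copied into the four containers started above; the in-container destination path `/root/` is an assumption, not taken from the README:

```shell
# Copy the downloaded GGUF file into each container started in step 1.
# /root/ is an assumed destination path; adjust it to match the README's layout.
for c in prima-v1 prima-v2 prima-v3 prima-v4; do
  sudo docker cp ./qwq-32b-q4_k_m.gguf "$c":/root/
done
```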
@@ -21365,8 +21365,6 @@ void * llama_context_setup_backend(
         || model->n_gpu_layers == 0) {
         continue;
     }
-#elif defined(GGML_USE_METAL)
-
 #endif

     ok = ok & ggml_backend_sched_reserve(ctx->sched[i], gf[i]);
@@ -21933,9 +21931,9 @@ void llama_model_compute_buf_size(
     const int64_t n_result = hparams.n_vocab * cparams.n_ubatch;

     // weights
-    const int64_t nb_output_w = n_bytes.nb_output_w;
     const int64_t nb_attn_norm_w = n_bytes.nb_attn_norm_w;
     const int64_t nb_attn_q_w = n_bytes.nb_attn_q_w;
+    // const int64_t nb_output_w = n_bytes.nb_output_w;

     // format bytes
     const int64_t type_size_f32 = ggml_type_size(GGML_TYPE_F32);