Mirror of https://github.com/Lizonghang/prima.cpp.git (synced 2025-09-05 19:49:03 +00:00)

Commit 7b0ededd24: Merge branch 'dev' into feat/auto-exit

10 changed files with 488 additions and 87 deletions
Makefile (8 changes)

@@ -1,5 +1,9 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = llama-cli profile-tool
+BUILD_TARGETS = \
+	llama-server \
+	llama-cli \
+	profile-tool

 # BUILD_TARGETS = \
 # 	libllava.a \
 # 	llama-baby-llama \

@@ -268,7 +272,7 @@ MK_LDFLAGS += -L/usr/local/lib -lzmq
 ifeq ($(UNAME_S),Darwin)
 	MK_CPPFLAGS += -isystem /opt/homebrew/include
-	MK_LDFLAGS += -L/opt/homebrew/lib -lzmq
+	MK_LDFLAGS += -L/opt/homebrew/lib
 endif

 ifeq ($(USE_HIGHS),1)
README.md (69 changes)

@@ -203,17 +203,17 @@ graph LR;
 Take QwQ-32B as an example, run the following commands on the devices to launch distributed inference:

 ```shell
-# on head device without a GPU, rank 0:
+# On head device without a GPU, rank 0:
 ./llama-cli -m download/qwq-32b-q4_k_m.gguf -c 1024 -n 256 -p "what is edge AI?" --world 4 --rank 0 --master 192.168.1.2 --next 192.168.1.3 --prefetch

-# on worker device with 8 GiB VRAM, rank 1:
-./llama-cli -m download/qwq-32b-q4_k_m.gguf -c 1024 --world 4 --rank 1 --master 192.168.1.2 --next 192.168.1.4 --prefetch --gpu-mem 8
+# On worker device with 8 GiB VRAM, rank 1:
+./llama-cli -m download/qwq-32b-q4_k_m.gguf --world 4 --rank 1 --master 192.168.1.2 --next 192.168.1.4 --prefetch --gpu-mem 8

-# on worker device with 11 GiB VRAM, rank 2:
-./llama-cli -m download/qwq-32b-q4_k_m.gguf -c 1024 --world 4 --rank 2 --master 192.168.1.2 --next 192.168.1.5 --prefetch --gpu-mem 11
+# On worker device with 11 GiB VRAM, rank 2:
+./llama-cli -m download/qwq-32b-q4_k_m.gguf --world 4 --rank 2 --master 192.168.1.2 --next 192.168.1.5 --prefetch --gpu-mem 11

-# on worker device without a GPU, rank 3:
-./llama-cli -m download/qwq-32b-q4_k_m.gguf -c 1024 --world 4 --rank 3 --master 192.168.1.2 --next 192.168.1.2 --prefetch
+# On worker device without a GPU, rank 3:
+./llama-cli -m download/qwq-32b-q4_k_m.gguf --world 4 --rank 3 --master 192.168.1.2 --next 192.168.1.2 --prefetch
 ```

 Once started, prima.cpp will profile each device and decide how much workload to assign, e.g., how many model layers each device should handle, and how many of them should run on GPU (if available).

@@ -262,6 +262,37 @@ cd /root/prima.cpp

 > If your host machine does not have a GPU, ignore the `--gpu-mem` option.

+> If you update to the latest code, non-rank 0 nodes can omit `-c 1024`.
+
+### Run in Server Mode
+You can run prima.cpp in server mode, by launching `llama-server` on the rank 0 device (with `--host` and `--port` specified) and `llama-cli` on the others. Here is an example with 2 devices:
+
+```shell
+# On rank 0, run:
+./llama-server -m download/qwq-32b-q4_k_m.gguf -c 1024 --world 2 --rank 0 --master 192.168.1.2 --next 192.168.1.3 --prefetch --host 127.0.0.1 --port 8080
+
+# On rank 1, run:
+./llama-cli -m download/qwq-32b-q4_k_m.gguf --world 2 --rank 1 --master 192.168.1.2 --next 192.168.1.2 --prefetch
+```
+
+After that, you can interact with the rank 0 device by calling the Chat Completion API:
+
+```shell
+curl http://127.0.0.1:8080/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "qwq-32b",
+        "messages": [
+            {"role": "user", "content": "what is edge AI?"}
+        ],
+        "max_tokens": 200,
+        "temperature": 0.7,
+        "stream": true
+    }'
+```
+
+You can also use third-party GUI clients like [AnythingLLM](https://anythingllm.com/) and set the API endpoint from prima.cpp, by default, `http://localhost:8080/v1`.
+
 ## ❓ FAQ

 **1. How can I manually set the workload for each device?**

@@ -273,13 +304,13 @@ By default, prima.cpp automatically profiles devices and assigns workloads. Howe
 ./llama-cli -m download/qwq-32b-q4_k_m.gguf -c 1024 -n 256 -p "what is edge AI?" --world 4 --rank 0 --master 192.168.1.2 --next 192.168.1.3 --prefetch -lw "16,16,16,16"

 # on worker device with 8 GiB VRAM, rank 1, use the option "-ngl":
-./llama-cli -m download/qwq-32b-q4_k_m.gguf -c 1024 --world 4 --rank 1 --master 192.168.1.2 --next 192.168.1.4 --prefetch -ngl 16
+./llama-cli -m download/qwq-32b-q4_k_m.gguf --world 4 --rank 1 --master 192.168.1.2 --next 192.168.1.4 --prefetch -ngl 16

 # on worker device with 11 GiB VRAM, rank 2, use the option "-ngl":
-./llama-cli -m download/qwq-32b-q4_k_m.gguf -c 1024 --world 4 --rank 2 --master 192.168.1.2 --next 192.168.1.5 --prefetch -ngl 16
+./llama-cli -m download/qwq-32b-q4_k_m.gguf --world 4 --rank 2 --master 192.168.1.2 --next 192.168.1.5 --prefetch -ngl 16

 # on worker device without a GPU, rank 3:
-./llama-cli -m download/qwq-32b-q4_k_m.gguf -c 1024 --world 4 --rank 3 --master 192.168.1.2 --next 192.168.1.2 --prefetch
+./llama-cli -m download/qwq-32b-q4_k_m.gguf --world 4 --rank 3 --master 192.168.1.2 --next 192.168.1.2 --prefetch
 ```

 - `-lw` sets the total model layers each device should handle. The format is a comma-separated list, one value per device, in rank order. You can also set `"8,8,8,8"`, `"4,4,4,4"`, `"16,16,24,8"`.

@@ -287,7 +318,15 @@ By default, prima.cpp automatically profiles devices and assigns workloads. Howe

 > Example: if `-lw "16,16,16,16"` is passed to the head device, then each of the 4 devices will handle 16 model layers. A worker with `-ngl 8` (if a GPU is available) will run 8/16 layers on the GPU.

-**2. How to run in chat mode like in llama.cpp?**
+**2. How to manually profile my device?**
+
+If `-lw` is set, prima.cpp skips profiling and runs directly with the user-defined `-lw` and `-ngl`. If you wish to profile a device manually, run `profile-tool` on that device.
+
+```shell
+./profile-tool -m download/qwq-32b-q4_k_m.gguf
+```
+
+**3. How to run in chat mode like in llama.cpp?**

 To enable chat (conversation) mode, simply add the `-cnv` flag on the head device:

@@ -298,7 +337,7 @@ To enable chat (conversation) mode, simply add the `-cnv` flag on the head devic

 To quit the chat mode, input `quit` or `exit`.

-**3. How to force prefetching after computing?**
+**4. How to force prefetching after computing?**

 By default, prima.cpp only advises the OS to prefetch upcoming layer weights. The actual prefetching is then scheduled and handled by the OS, which may introduce some uncertainty. To explicitly trigger prefetching right after computing, you can use the `--force` flag on each device:

@@ -309,11 +348,11 @@ By default, prima.cpp only advises the OS to prefetch upcoming layer weights. Th

 This enables more aggressive overlap but also introduce extra memory access latency. Use `--force` only after testing, as its effect depends on your hardware and OS behavior.

-**4. Does it support Windows?**
+**5. Does it support Windows?**

 Not yet—but it's on the roadmap. Currently, prima.cpp can run on Linux, macOS, Android and HarmonyOS (via Termux). You can mix heterogeneous devices in the cluster.

-**5. Does it support Vulkan or AMD GPUs?**
+**6. Does it support Vulkan or AMD GPUs?**

 Not yet. Now prima.cpp supports only CUDA-based GPUs. Vulkan is in our roadmap, and AMD GPUs will be supported once we have that device.

@@ -326,7 +365,7 @@ If you find this work helpful, please do not hesitate to cite us and send a star
 ```bibtex
 @misc{li2025primacpp,
       title={PRIMA.CPP: Speeding Up 70B-Scale LLM Inference on Low-Resource Everyday Home Clusters},
-      author={Zonghang Li and Tao Li and Wenjiao Feng and Mohsen Guizani and Hongfang Yu},
+      author={Zonghang Li and Tao Li and Wenjiao Feng and Mohsen Guizani and Hongfang Yu and Qirong Ho and Wei Xiang},
       year={2025},
       eprint={2504.08791},
       archivePrefix={arXiv},
@@ -986,13 +986,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.enable_chat_template = false;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
-    add_opt(llama_arg(
-        {"--no-warmup"},
-        "skip warming up the model with an empty run",
-        [](gpt_params & params) {
-            params.warmup = false;
-        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    // add_opt(llama_arg(
+    //     {"--no-warmup"},
+    //     "skip warming up the model with an empty run",
+    //     [](gpt_params & params) {
+    //         params.warmup = false;
+    //     }
+    // ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(llama_arg(
         {"--spm-infill"},
         format(

@@ -1317,6 +1317,12 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         {"-ctk", "--cache-type-k"}, "TYPE",
         format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
         [](gpt_params & params, const std::string & value) {
+
+        #ifdef GGML_USE_METAL
+            LOG_WRN("The option -ctk or --cache-type-k is not supported on Metal, use default type\n");
+            return;
+        #endif
+
             // TODO: get the type right here
             params.cache_type_k = value;
         }

@@ -1325,6 +1331,11 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         {"-ctv", "--cache-type-v"}, "TYPE",
         format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
         [](gpt_params & params, const std::string & value) {
+        #ifdef GGML_USE_METAL
+            LOG_WRN("The option -ctv or --cache-type-v is not supported on Metal, use default type\n");
+            return;
+        #endif
+
             // TODO: get the type right here
             params.cache_type_v = value;
         }

@@ -1413,13 +1424,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.defrag_thold = std::stof(value);
         }
     ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
-    add_opt(llama_arg(
-        {"-np", "--parallel"}, "N",
-        format("number of parallel sequences to decode (default: %d)", params.n_parallel),
-        [](gpt_params & params, int value) {
-            params.n_parallel = value;
-        }
-    ).set_env("LLAMA_ARG_N_PARALLEL"));
+    // add_opt(llama_arg(
+    //     {"-np", "--parallel"}, "N",
+    //     format("number of parallel sequences to decode (default: %d)", params.n_parallel),
+    //     [](gpt_params & params, int value) {
+    //         params.n_parallel = value;
+    //     }
+    // ).set_env("LLAMA_ARG_N_PARALLEL"));
     add_opt(llama_arg(
         {"-ns", "--sequences"}, "N",
         format("number of sequences to decode (default: %d)", params.n_sequences),
@@ -1582,6 +1582,12 @@ static bool tune_layer_allocation(
 //

 struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
+
+#if !(defined(GGML_USE_METAL) || defined(GGML_USE_CUDA))
+    // reset n_gpu_layers to 0 if GPU is not used
+    params.n_gpu_layers = 0;
+#endif
+
     llama_init_result iparams;
     auto mparams = llama_model_params_from_gpt_params(params);

@@ -1637,10 +1643,16 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {

     if (n_world == 1) {
         uint32_t n_layers = llama_model_n_layers(model);
+        // assign all layers to this device
         params.n_layer_window[0]  = n_layers;
         cparams.n_layer_window[0] = n_layers;
         mparams.n_layer_window[0] = n_layers;
         llama_context_n_layer_window(lctx)[0] = n_layers;
+
+#if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
+        params.n_gpu_layers = std::min((int32_t)n_layers, params.n_gpu_layers);
+#endif
+
     } else {
         uint32_t n_layer_window[32] = {0}, n_gpu_layers[32] = {0};

@@ -1649,12 +1661,20 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {

         // broadcast startup args
         struct startup_args args;
-        if (my_rank==0){
+        if (my_rank == 0){
             args.should_profile = auto_schedule;
+            args.n_ctx = params.n_ctx;
         }
+
         llama_bcast_startup_args(lctx, my_rank, &args);
+
         auto_schedule = args.should_profile;
+        if (my_rank > 0) {
+            // receive startup args
+            auto_schedule = args.should_profile;
+            params.n_ctx  = args.n_ctx;
+            cparams.n_ctx = args.n_ctx;
+        }

         // if n_world > 1 and need auto schdule, then prifile
         if (auto_schedule){
             // get device profile

@@ -1751,6 +1771,11 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
             cparams.n_gpu_layers = n_gpu_layers[my_rank];
             mparams.n_gpu_layers = n_gpu_layers[my_rank];
             llama_model_set_n_gpu_layers(model, n_gpu_layers[my_rank]);
+        } else { // -ngl is set
+            params.n_gpu_layers  = std::min(params.n_gpu_layers, (int32_t)n_layer_window[my_rank]);
+            cparams.n_gpu_layers = params.n_gpu_layers;
+            mparams.n_gpu_layers = params.n_gpu_layers;
+            llama_model_set_n_gpu_layers(model, params.n_gpu_layers);
         }
     }

@@ -1820,7 +1845,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     }

     if (params.warmup) {
-        LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
+        LOG_WRN("%s: warming up the model with an empty run - please wait ...\n", __func__);

         const uint32_t my_rank = cparams.rank;
         std::vector<llama_token> tmp;
@@ -350,7 +350,6 @@ float device_inp_embd_delay(struct llama_model * model, enum ggml_type src0t, in
         return 0.0f;
     }

-    size_t QK_K = 0;
     switch (src0t) {
         case GGML_TYPE_F32: {
             matrix_B = malloc(embd_size * sizeof(float));

@@ -914,7 +913,7 @@ ioengine=%s
 direct=1
 time_based=1
 runtime=1
-size=4G
+size=1G
 group_reporting=1
 iodepth=1

@@ -313,7 +313,8 @@ struct disk_props {
 };

 struct startup_args{
-    bool should_profile;
+    bool     should_profile;
+    uint32_t n_ctx;
 };

 struct device_info {
@@ -351,6 +351,9 @@ int main(int argc, char ** argv) {

             // remove any "future" tokens that we might have inherited from the previous session
             llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
+            if (my_rank == 0) {
+                llama_send_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
+            }
         }

         LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",

@@ -596,6 +599,11 @@ int main(int argc, char ** argv) {
             llama_kv_cache_seq_rm (ctx, 0, params.n_keep            , params.n_keep + n_discard);
             llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);

+            if (my_rank == 0) {
+                llama_send_kv_cache_seq_rm (ctx, 0, params.n_keep            , params.n_keep + n_discard);
+                llama_send_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
+            }
+
             n_past -= n_discard;

             LOG_DBG("after swap: n_past = %d\n", n_past);
@@ -116,7 +116,7 @@ struct server_task {
 };

 struct server_task_result {
-    int id = -1;
+    int id = -1;

     json data;

@@ -1063,6 +1063,9 @@ struct server_context {

             // clear the entire KV cache
             llama_kv_cache_clear(ctx);
+
+            llama_send_kv_cache_clear(ctx);
+
             clean_kv_cache = false;
         }

@@ -1191,7 +1194,7 @@ struct server_context {
             SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict);
         }

-        // if context shift is disabled, we stop when it reaches the context limit
+        // we stop when it reaches the context limit, otherwise it may run forever
         if (slot.n_decoded >= slot.n_ctx) {
             slot.truncated     = true;
             slot.stopped_limit = true;

@@ -1917,8 +1920,11 @@ struct server_context {

         SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);

-        llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep            , n_keep + n_discard);
-        llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
+        llama_kv_cache_seq_rm      (ctx, slot.id + 1, n_keep            , n_keep + n_discard);
+        llama_kv_cache_seq_add     (ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
+
+        llama_send_kv_cache_seq_rm (ctx, slot.id    , n_keep            , n_keep + n_discard);
+        llama_send_kv_cache_seq_add(ctx, slot.id    , n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);

         if (slot.params.cache_prompt) {
             for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {

@@ -2084,7 +2090,6 @@ struct server_context {
                     // if input prompt is too big, truncate it (if group attention self-extend is disabled)
                     if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx) {
                         const int n_left = slot.n_ctx - slot.params.n_keep;
-
                         const int n_block_size = n_left / 2;
                         const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;

@@ -2161,12 +2166,14 @@ struct server_context {
                     int p0 = (int) system_tokens.size() + slot.n_past;
                     if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) {
                         // could not partially delete (likely using a non-Transformer model)
-                        llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);
+                        llama_kv_cache_seq_rm     (ctx, slot.id + 1, -1, -1);
+                        llama_send_kv_cache_seq_rm(ctx, slot.id    , -1, -1);

                         p0 = (int) system_tokens.size();
                         if (p0 != 0) {
                             // copy over the system prompt when there is one
-                            llama_kv_cache_seq_cp(ctx, 0, slot.id + 1, -1, -1);
+                            llama_kv_cache_seq_cp     (ctx, 0, slot.id + 1, -1, -1);
+                            llama_send_kv_cache_seq_cp(ctx, 0, slot.id    , -1, -1);
                         }

                         // there is no common part left (except for the system prompt)

@@ -2175,6 +2182,8 @@ struct server_context {
                         slot.ga_i = 0;
                         // TODO: is the system prompt ever in the sampling context?
                         gpt_sampler_reset(slot.smpl);
+                    } else {
+                        llama_send_kv_cache_seq_rm(ctx, slot.id, p0, -1);
                     }

                     // remove the non-common part from the cache

@@ -2260,9 +2269,14 @@ struct server_context {
                         SLT_DBG(slot, "div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
                         SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);

-                        llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd);
-                        llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
-                        llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd);
+                        llama_kv_cache_seq_add     (ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd);
+                        llama_send_kv_cache_seq_add(ctx, slot.id    , slot.ga_i, slot.n_past_se, ib * bd);
+
+                        llama_kv_cache_seq_div     (ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
+                        llama_send_kv_cache_seq_div(ctx, slot.id    , slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
+
+                        llama_kv_cache_seq_add     (ctx, slot.id + 1, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd);
+                        llama_send_kv_cache_seq_add(ctx, slot.id    , slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd);

                         slot.n_past_se -= bd;

@@ -3329,10 +3343,6 @@ int main(int argc, char ** argv) {

     // bind HTTP listen port, run the HTTP server in a thread
     if (!svr->bind_to_port(params.hostname, params.port)) {
-        //LOG_ERROR("couldn't bind HTTP server socket", {
-        //    {"hostname", params.hostname},
-        //    {"port", params.port},
-        //});
         LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, params.hostname.c_str(), params.port);
         clean_up();
         return 1;

@@ -3377,10 +3387,6 @@ int main(int argc, char ** argv) {
         ctx_server.queue_tasks.terminate();
     };

-    LOG_INF("%s: server is listening on %s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
-
-    ctx_server.queue_tasks.start_loop();
-
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
     struct sigaction sigint_action;
     sigint_action.sa_handler = signal_handler;

@@ -3395,6 +3401,13 @@ int main(int argc, char ** argv) {
     SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif

+    LOG_INF("%s: server is listening on %s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
+
+    ctx_server.queue_tasks.start_loop();
+
+    char * stop_signal = nullptr;
+    llama_free_sockets(ctx_server.ctx, &stop_signal);
+
     clean_up();
     t.join();

@@ -283,7 +283,7 @@ extern "C" {
         uint32_t n_world;            // number of nodes
         uint32_t rank;               // my node rank
         uint32_t n_layer_window[32]; // number of layers to kept each time
-        int32_t n_gpu_layers;        // number of layers to store in VRAM
+        int32_t  n_gpu_layers;       // number of layers to store in VRAM

         enum llama_split_mode split_mode; // how to split the model across multiple GPUs

         // main_gpu interpretation depends on split_mode:

@@ -453,7 +453,7 @@ extern "C" {
     LLAMA_API void llama_free_sockets      (struct llama_context * ctx, char ** msg);
     LLAMA_API int  llama_gather_device_info(struct llama_context * ctx, struct device_info * dev_info_set);
     LLAMA_API int  llama_send_device_info  (struct llama_context * ctx, struct device_info * dev_info);
-    LLAMA_API int  llama_bcast_startup_args(struct llama_context * ctx, uint32_t rank, struct startup_args * args);
+    LLAMA_API int  llama_bcast_startup_args(struct llama_context * ctx, uint32_t rank, struct startup_args * args);
     LLAMA_API int  llama_bcast_layer_setup (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);
     LLAMA_API int  llama_rebuild_topo      (struct llama_context * ctx, uint32_t * n_layer_window, struct device_info * dev_info_set);
     LLAMA_API int  llama_recv_layer_setup  (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);

@@ -712,8 +712,10 @@ extern "C" {
     LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);

     // Clear the KV cache - both cell info is erased and KV data is zeroed
-    LLAMA_API void llama_kv_cache_clear(
-            struct llama_context * ctx);
+    LLAMA_API void llama_kv_cache_clear(struct llama_context * ctx);
+
+    // Notify other devices to clear their KV cache
+    LLAMA_API void llama_send_kv_cache_clear(struct llama_context * ctx);

     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
     // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails

@@ -725,6 +727,13 @@ extern "C" {
             llama_seq_id   seq_id,
             llama_pos      p0,
             llama_pos      p1);

+    // Notify other nodes to remove a range from their KV cache
+    LLAMA_API void llama_send_kv_cache_seq_rm(
+            struct llama_context * ctx,
+            llama_seq_id   seq_id,
+            llama_pos      p0,
+            llama_pos      p1);
+
     // Copy all tokens that belong to the specified sequence to another sequence
     // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence

@@ -736,6 +745,14 @@ extern "C" {
             llama_seq_id   seq_id_dst,
             llama_pos      p0,
             llama_pos      p1);

+    // Notify other nodes to copy a range of KV entries
+    LLAMA_API void llama_send_kv_cache_seq_cp(
+            struct llama_context * ctx,
+            llama_seq_id   seq_id_src,
+            llama_seq_id   seq_id_dst,
+            llama_pos      p0,
+            llama_pos      p1);
+
     // Removes all tokens that do not belong to the specified sequence
     LLAMA_API void llama_kv_cache_seq_keep(

@@ -755,6 +772,14 @@ extern "C" {
             llama_pos      p1,
             llama_pos      delta);

+    // Notify other nodes to shift (add) their KV cache entries
+    LLAMA_API void llama_send_kv_cache_seq_add(
+            struct llama_context * ctx,
+            llama_seq_id   seq_id,
+            llama_pos      p0,
+            llama_pos      p1,
+            llama_pos      delta);
+
     // Integer division of the positions by factor of `d > 1`
     // If the KV cache is RoPEd, the KV data is updated accordingly:
     //   - lazily on next llama_decode()

@@ -767,6 +792,14 @@ extern "C" {
             llama_pos      p0,
             llama_pos      p1,
             int            d);

+    // Notify other nodes to perform a division operation on a KV cache range
+    LLAMA_API void llama_send_kv_cache_seq_div(
+            struct llama_context * ctx,
+            llama_seq_id   seq_id,
+            llama_pos      p0,
+            llama_pos      p1,
+            int            d);
+
     // Returns the largest position present in the KV cache for the specified sequence
     LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
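The new `llama_send_kv_cache_*` declarations above mirror the existing local `llama_kv_cache_*` calls: the head node applies an operation to its own KV cache and then notifies the downstream nodes. A minimal sketch of that pairing, using only the functions declared in this header; the wrapper name and the `is_head` flag are illustrative, not part of the patch:

```cpp
// Hedged sketch: pair a local KV-cache edit with the notification added in
// this commit. Only llama_kv_cache_seq_rm / llama_send_kv_cache_seq_rm come
// from llama.h; the helper itself is hypothetical.
#include "llama.h"

static void kv_seq_rm_everywhere(struct llama_context * ctx, bool is_head,
                                 llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
    // each node edits its own slice of the KV cache
    llama_kv_cache_seq_rm(ctx, seq_id, p0, p1);

    // only the head (rank 0) starts the notification chain to the other nodes
    if (is_head) {
        llama_send_kv_cache_seq_rm(ctx, seq_id, p0, p1);
    }
}
```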
src/llama.cpp (324 changes)

@@ -121,6 +121,17 @@ struct Timer {
 // helpers
 //

+template<typename LocalFn, typename RemoteFn>
+bool kv_cache_op(bool     flag,
+                 LocalFn  local_fn,
+                 RemoteFn remote_fn,
+                 bool     is_last_dev) {
+    if (!flag) return false;
+    local_fn();
+    if (!is_last_dev) remote_fn();
+    return true;
+}
+
 // trim whitespace from the beginning and end of a string
 static std::string trim(const std::string & str) {
     size_t start = 0;
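The `kv_cache_op` helper above captures the pattern used later in `llama_decode_internal`: apply a signalled KV-cache operation locally, forward the same signal unless this is the last device in the ring, and report whether anything was done. A self-contained illustration with `printf` stand-ins for the real `llama_kv_cache_*` / `llama_send_kv_cache_*` lambdas:

```cpp
// Standalone demo of the kv_cache_op template added above; the lambdas are
// placeholders for the real local/remote KV-cache calls.
#include <cstdio>

template<typename LocalFn, typename RemoteFn>
static bool kv_cache_op(bool flag, LocalFn local_fn, RemoteFn remote_fn, bool is_last_dev) {
    if (!flag) return false;          // no signal received, nothing to do
    local_fn();                       // apply the operation on this device
    if (!is_last_dev) remote_fn();    // forward the signal down the ring
    return true;
}

int main() {
    bool handled = kv_cache_op(
        /*flag=*/true,
        [] { std::printf("apply kv_seq_rm locally\n"); },
        [] { std::printf("forward kv_seq_rm to the next device\n"); },
        /*is_last_dev=*/false);
    return handled ? 0 : 1;
}
```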
@@ -4171,7 +4182,7 @@ static bool llama_kv_cache_find_slot(
         }

         if (n_tested >= cache.size) {
-            //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
+            LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
             return false;
         }
     }

@@ -10643,7 +10654,7 @@ struct llm_build_context {
         cb(lctx.inp_K_shift, "K_shift", -1);
         ggml_set_input(lctx.inp_K_shift);

-        for (int il = 0; il < n_layer; ++il) {
+        for (int il = 0; il < (int)kv_self.k_l.size(); ++il) {
             const int64_t n_head_kv    = hparams.n_head_kv(il);
             const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
             struct ggml_tensor * rope_factors = build_rope_factors(il);

@@ -10656,13 +10667,19 @@ struct llm_build_context {

             struct ggml_tensor * tmp;
             if (ggml_is_quantized(k->type)) {
+
+#ifdef GGML_USE_METAL
+                GGML_ABORT("The option --cache-type-k is not supported on Metal\n");
+#endif
+
                 // dequantize to f32 -> RoPE -> quantize back
                 tmp = ggml_cast(ctx0, k, GGML_TYPE_F32);
                 cb(tmp, "K_f32", il);

                 for (auto * backend : lctx.backends) {
                     // Figure out which backend KV cache belongs to
                     if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft)) {
-                        ggml_backend_sched_set_tensor_backend(lctx.sched.at(0), tmp, backend); // todo.
+                        ggml_backend_sched_set_tensor_backend(lctx.sched[0], tmp, backend);
                         break;
                     }
                 }
@@ -17783,7 +17800,41 @@ struct input_tensors {
 };

 struct sync_meta {
-    int32_t n_tokens = 0;
+    int32_t      n_tokens  = 0;
+    llama_pos  * pos       = nullptr;
+    llama_pos    all_pos_0;
+    llama_pos    all_pos_1;
+    uint32_t     n_ctx     = 0;
+
+    // signal to clear the kv cache
+    bool clear_kv_cache = false;
+
+    // signal to remove a kv cache sequence
+    bool         kv_seq_rm = false;
+    llama_seq_id rm_seq_id = 0;
+    llama_pos    rm_p0     = 0;
+    llama_pos    rm_p1     = 0;
+
+    // signal to add a kv cache sequence
+    bool         kv_seq_add = false;
+    llama_seq_id add_seq_id = 0;
+    llama_pos    add_p0     = 0;
+    llama_pos    add_p1     = 0;
+    llama_pos    add_delta  = 0;
+
+    // signal to copy a kv cache sequence
+    bool         kv_seq_cp     = false;
+    llama_seq_id cp_src_seq_id = 0;
+    llama_seq_id cp_dst_seq_id = 0;
+    llama_pos    cp_p0         = 0;
+    llama_pos    cp_p1         = 0;
+
+    // signal to divide the kv cache range
+    bool         kv_seq_div = false;
+    llama_seq_id div_seq_id = 0;
+    llama_pos    div_p0     = 0;
+    llama_pos    div_p1     = 0;
+    int          div_factor = 1;
 };

 static void llama_send_meta(zmq::socket_t & socket, struct sync_meta * meta) {

@@ -17795,6 +17846,17 @@ static void llama_send_meta(zmq::socket_t & socket, struct sync_meta * meta) {
         send_msgs.emplace_back("n_tokens", strlen("n_tokens"));
         send_msgs.emplace_back(&(meta->n_tokens), sizeof(meta->n_tokens));

+        if (meta->pos != nullptr) {
+            send_msgs.emplace_back("pos", strlen("pos"));
+            send_msgs.emplace_back(meta->pos, meta->n_ctx * sizeof(llama_pos));
+        }
+
+        send_msgs.emplace_back("all_pos_0", strlen("all_pos_0"));
+        send_msgs.emplace_back(&(meta->all_pos_0), sizeof(meta->all_pos_0));
+
+        send_msgs.emplace_back("all_pos_1", strlen("all_pos_1"));
+        send_msgs.emplace_back(&(meta->all_pos_1), sizeof(meta->all_pos_1));
+
         zmq::send_multipart(socket, send_msgs);
     } catch (const zmq::error_t& e) {
         LLAMA_LOG_INFO("Failed to send meta data: %s\n", e.what());

@@ -17811,6 +17873,49 @@ static int llama_recv_meta(zmq::socket_t & socket, struct sync_meta * meta) {
     socket.set(zmq::sockopt::rcvtimeo, -1);

+    const std::string cmd = recv_msgs[0].to_string();
+    size_t idx = 1;
+
+    if (cmd == "clear_kv_cache" && recv_msgs.size() == 1) {
+        meta->clear_kv_cache = true;
+        return 0;
+    }
+
+    if (cmd == "kv_seq_rm" && recv_msgs.size() == 4) {
+        meta->kv_seq_rm = true;
+        std::memcpy(&meta->rm_seq_id, recv_msgs[idx++].data(), sizeof(meta->rm_seq_id));
+        std::memcpy(&meta->rm_p0,     recv_msgs[idx++].data(), sizeof(meta->rm_p0));
+        std::memcpy(&meta->rm_p1,     recv_msgs[idx++].data(), sizeof(meta->rm_p1));
+        return 0;
+    }
+
+    if (cmd == "kv_seq_add" && recv_msgs.size() == 5) {
+        meta->kv_seq_add = true;
+        std::memcpy(&meta->add_seq_id, recv_msgs[idx++].data(), sizeof(meta->add_seq_id));
+        std::memcpy(&meta->add_p0,     recv_msgs[idx++].data(), sizeof(meta->add_p0));
+        std::memcpy(&meta->add_p1,     recv_msgs[idx++].data(), sizeof(meta->add_p1));
+        std::memcpy(&meta->add_delta,  recv_msgs[idx++].data(), sizeof(meta->add_delta));
+        return 0;
+    }
+
+    if (cmd == "kv_seq_cp" && recv_msgs.size() == 5) {
+        meta->kv_seq_cp = true;
+        std::memcpy(&meta->cp_src_seq_id, recv_msgs[idx++].data(), sizeof(meta->cp_src_seq_id));
+        std::memcpy(&meta->cp_dst_seq_id, recv_msgs[idx++].data(), sizeof(meta->cp_dst_seq_id));
+        std::memcpy(&meta->cp_p0,         recv_msgs[idx++].data(), sizeof(meta->cp_p0));
+        std::memcpy(&meta->cp_p1,         recv_msgs[idx++].data(), sizeof(meta->cp_p1));
+        return 0;
+    }
+
+    if (cmd == "kv_seq_div" && recv_msgs.size() == 5) {
+        meta->kv_seq_div = true;
+        std::memcpy(&meta->div_seq_id, recv_msgs[idx++].data(), sizeof(meta->div_seq_id));
+        std::memcpy(&meta->div_p0,     recv_msgs[idx++].data(), sizeof(meta->div_p0));
+        std::memcpy(&meta->div_p1,     recv_msgs[idx++].data(), sizeof(meta->div_p1));
+        std::memcpy(&meta->div_factor, recv_msgs[idx++].data(), sizeof(meta->div_factor));
+        return 0;
+    }
+
     for (size_t i = 0; i < recv_msgs.size(); i += 2) {
         std::string key = recv_msgs[i].to_string();
         zmq::message_t & data_msg = recv_msgs[i + 1];

@@ -17819,6 +17924,21 @@ static int llama_recv_meta(zmq::socket_t & socket, struct sync_meta * meta) {
             GGML_ASSERT(data_msg.size() == sizeof(meta->n_tokens));
             std::memcpy(&(meta->n_tokens), data_msg.data(), sizeof(meta->n_tokens));
         }
+
+        if (key == "pos") {
+            meta->pos = (llama_pos *) malloc(meta->n_ctx * sizeof(llama_pos));
+            std::memcpy(meta->pos, data_msg.data(), meta->n_ctx * sizeof(llama_pos));
+        }
+
+        if (key == "all_pos_0") {
+            GGML_ASSERT(data_msg.size() == sizeof(meta->all_pos_0));
+            std::memcpy(&(meta->all_pos_0), data_msg.data(), sizeof(meta->all_pos_0));
+        }
+
+        if (key == "all_pos_1") {
+            GGML_ASSERT(data_msg.size() == sizeof(meta->all_pos_1));
+            std::memcpy(&(meta->all_pos_1), data_msg.data(), sizeof(meta->all_pos_1));
+        }
     }
     return 0;
 }
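`llama_recv_meta` above distinguishes KV-cache control messages from ordinary batch metadata by the first ZeroMQ frame: a command string, followed by the raw fields in a fixed order (for `kv_seq_rm`, three more frames carrying `seq_id`, `p0`, `p1`, hence the `recv_msgs.size() == 4` check). A hedged cppzmq sketch of the matching sender, assuming the `zmq.hpp`/`zmq_addon.hpp` headers this file already uses:

```cpp
// Sketch of the wire format consumed by llama_recv_meta above:
// frame 0 = command name, frames 1..n = raw struct fields, copied byte-for-byte.
#include <cstdint>
#include <cstring>
#include <vector>
#include <zmq.hpp>
#include <zmq_addon.hpp>

typedef int32_t llama_pos;     // matches the typedefs in llama.h
typedef int32_t llama_seq_id;

static void send_kv_seq_rm(zmq::socket_t & sock, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
    std::vector<zmq::message_t> msgs;
    msgs.emplace_back("kv_seq_rm", strlen("kv_seq_rm"));  // command frame
    msgs.emplace_back(&seq_id, sizeof(seq_id));           // field frames
    msgs.emplace_back(&p0, sizeof(p0));
    msgs.emplace_back(&p1, sizeof(p1));
    zmq::send_multipart(sock, msgs);                      // 4 frames total
}
```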
@@ -18083,15 +18203,70 @@ static int llama_decode_internal(
     }

     sync_meta meta;
+    meta.n_ctx = cparams.n_ctx;
+    bool is_last_dev = (my_rank == n_world - 1);

     if (my_rank != 0) {
         if (llama_recv_meta(*lctx.recv_socket, &meta) == -1) {
             return -1;
         }
-        batch_all.n_tokens = meta.n_tokens;
+
+        if (meta.n_tokens > 0) {
+            batch_all.n_tokens = meta.n_tokens;
+            if (meta.pos != nullptr) {
+                batch_all.pos = (llama_pos *) malloc(cparams.n_ctx * sizeof(llama_pos));
+                std::memcpy(batch_all.pos, meta.pos, cparams.n_ctx * sizeof(llama_pos));
+            }
+            batch_all.all_pos_0 = meta.all_pos_0;
+            batch_all.all_pos_1 = meta.all_pos_1;
+        }
+
+        if (kv_cache_op(meta.clear_kv_cache,
+                        [&]{ llama_kv_cache_clear      (&lctx); },
+                        [&]{ llama_send_kv_cache_clear (&lctx); },
+                        is_last_dev)) {
+            LLAMA_LOG_INFO("%s: received signal kv_cache_clear\n", __func__);
+            return -1;
+        }
+
+        if (kv_cache_op(meta.kv_seq_rm,
+                        [&]{ llama_kv_cache_seq_rm      (&lctx, meta.rm_seq_id, meta.rm_p0, meta.rm_p1); },
+                        [&]{ llama_send_kv_cache_seq_rm (&lctx, meta.rm_seq_id, meta.rm_p0, meta.rm_p1); },
+                        is_last_dev)) {
+            LLAMA_LOG_INFO("%s: received signal kv_cache_seq_rm\n", __func__);
+            return -1;
+        }
+
+        if (kv_cache_op(meta.kv_seq_add,
+                        [&]{ llama_kv_cache_seq_add     (&lctx, meta.add_seq_id, meta.add_p0, meta.add_p1, meta.add_delta); },
+                        [&]{ llama_send_kv_cache_seq_add(&lctx, meta.add_seq_id, meta.add_p0, meta.add_p1, meta.add_delta); },
+                        is_last_dev)) {
+            LLAMA_LOG_INFO("%s: received signal kv_cache_seq_add\n", __func__);
+            return -1;
+        }
+
+        if (kv_cache_op(meta.kv_seq_cp,
+                        [&]{ llama_kv_cache_seq_cp      (&lctx, meta.cp_src_seq_id, meta.cp_dst_seq_id, meta.cp_p0, meta.cp_p1); },
+                        [&]{ llama_send_kv_cache_seq_cp (&lctx, meta.cp_src_seq_id, meta.cp_dst_seq_id, meta.cp_p0, meta.cp_p1); },
+                        is_last_dev)) {
+            LLAMA_LOG_INFO("%s: received signal kv_cache_seq_cp\n", __func__);
+            return -1;
+        }
+
+        if (kv_cache_op(meta.kv_seq_div,
+                        [&]{ llama_kv_cache_seq_div     (&lctx, meta.div_seq_id, meta.div_p0, meta.div_p1, meta.div_factor); },
+                        [&]{ llama_send_kv_cache_seq_div(&lctx, meta.div_seq_id, meta.div_p0, meta.div_p1, meta.div_factor); },
+                        is_last_dev)) {
+            LLAMA_LOG_INFO("%s: received signal kv_cache_seq_div\n", __func__);
+            return -1;
+        }
     }

-    if (my_rank != n_world - 1) {
-        meta.n_tokens = batch_all.n_tokens;
+    if (!is_last_dev) {
+        meta.n_tokens  = batch_all.n_tokens;
+        meta.pos       = batch_all.pos;
+        meta.all_pos_0 = batch_all.all_pos_0;
+        meta.all_pos_1 = batch_all.all_pos_1;
         llama_send_meta(*lctx.send_socket, &meta);
     }
@@ -18817,22 +18992,20 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {

     // apply K-shift if needed
     if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
-        throw std::runtime_error("shift not supported\n");
-
         if (lctx.model.arch == LLM_ARCH_DEEPSEEK2) { // not supported due to MLA
             GGML_ABORT("Deepseek2 does not support K-shift");
         }

-        {
-            ggml_backend_sched_reset(lctx.sched.at(0)); // todo.
+        for (size_t i = 0; i < lctx.sched.size(); ++i) {
+            ggml_backend_sched_reset(lctx.sched[i]);

             ggml_cgraph * gf = llama_build_graph_k_shift(lctx);

-            ggml_backend_sched_alloc_graph(lctx.sched.at(0), gf); // todo.
+            ggml_backend_sched_alloc_graph(lctx.sched[i], gf);

             llama_set_k_shift(lctx);

-            llama_graph_compute(lctx, gf, lctx.sched.at(0), lctx.cparams.n_threads, lctx.threadpool); // todo.
+            llama_graph_compute(lctx, gf, lctx.sched[i], lctx.cparams.n_threads, lctx.threadpool);

             need_reserve = true;
         }

@@ -18859,8 +19032,6 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {

     // reserve a worst case graph again
     if (need_reserve) {
-        throw std::runtime_error("reserve not supported\n");
-
         // TODO: extract to a function
         // build worst-case graph
         uint32_t n_seqs = 1; // TODO: worst-case number of sequences

@@ -18868,13 +19039,11 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
         llama_token token = llama_token_bos(&lctx.model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
         llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr};
         std::vector<ggml_cgraph *> gf = llama_build_graph(lctx, ubatch, true);

         // initialize scheduler with the worst-case graph
-        ggml_backend_sched_reset(lctx.sched[0]); // todo.
-        GGML_ASSERT(lctx.sched.size() == gf.size());
-
         bool ok = true;
+        GGML_ASSERT(lctx.sched.size() == gf.size());
         for (size_t i = 0; i < gf.size(); ++i) {
+            ggml_backend_sched_reset(lctx.sched[i]);
             ok = ok & ggml_backend_sched_reserve(lctx.sched[i], gf[i]);
         }
         if (!ok) {

@@ -20215,6 +20384,8 @@ void llama_init_sockets(struct llama_context * ctx, uint32_t n_world, uint32_t m
         LLAMA_LOG_INFO("Error binding/connecting recv socket to endpoint: %s", e.what());
         exit(1);
     }
+
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
 }

 int llama_gather_device_info(struct llama_context * ctx, struct device_info * dev_info_set) {
@@ -20276,38 +20447,49 @@ int llama_send_device_info(struct llama_context * ctx, struct device_info * dev_
     return 0;
 }

-LLAMA_API int llama_bcast_startup_args(llama_context *ctx, uint32_t rank, startup_args *args) {
-    auto n_world = ctx->cparams.n_world;
-    if (n_world == 1) {
-        return 0;
-    }
+int llama_bcast_startup_args(llama_context * ctx, uint32_t rank, startup_args * args) {
+    int32_t n_world = ctx->cparams.n_world;
+    GGML_ASSERT(n_world > 0);
     GGML_ASSERT(ctx != nullptr && ctx->send_socket != nullptr);
-    if (rank==0){
+
+    if (rank == 0){
         // send
         try {
             std::vector<zmq::message_t> send_msgs;
+
             send_msgs.emplace_back("should_profile", strlen("should_profile"));
             send_msgs.emplace_back(&args->should_profile, sizeof(args->should_profile));
+
            send_msgs.emplace_back("n_ctx", strlen("n_ctx"));
            send_msgs.emplace_back(&args->n_ctx, sizeof(args->n_ctx));
+
             zmq::send_multipart(*ctx->send_socket, send_msgs);
         } catch (const zmq::error_t& e) {
             LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
             return -1;
         }
-    }else {
+    } else {
         // receive
         std::vector<zmq::message_t> recv_msgs;
         if (!zmq::recv_multipart(*ctx->recv_socket, std::back_inserter(recv_msgs))) {
             return -1;
         }
+
         GGML_ASSERT(recv_msgs[0].to_string() == "should_profile");
         GGML_ASSERT(recv_msgs[1].size() == sizeof(bool));
         bool should_profile = *static_cast<bool*>(recv_msgs[1].data());
         args->should_profile = should_profile;
-        if (rank != n_world-1){
+
+        GGML_ASSERT(recv_msgs[2].to_string() == "n_ctx");
+        GGML_ASSERT(recv_msgs[3].size() == sizeof(uint32_t));
+        uint32_t n_ctx = *static_cast<uint32_t*>(recv_msgs[3].data());
+        args->n_ctx = n_ctx;
+
+        if ((int)rank != (int)n_world - 1){
             // send
             try {
                 zmq::send_multipart(*ctx->send_socket, recv_msgs);
-            } catch (const zmq::error_t& e) {
+            } catch (const zmq::error_t & e) {
                 LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
                 return -1;
             }
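The reworked `llama_bcast_startup_args` above is a ring relay rather than a true broadcast: rank 0 packs `should_profile` and the newly added `n_ctx` into labelled frames, every other rank unpacks them for itself, and all ranks except the last re-send the identical frames to their `--next` neighbour. A condensed, hedged sketch of the worker-side relay step; the socket wiring is assumed to come from `llama_init_sockets` and is not shown:

```cpp
// Hedged sketch of the worker-side relay used by llama_bcast_startup_args.
// recv_sock/send_sock stand for the per-rank ZeroMQ sockets; only the relay
// logic is illustrated, error handling is reduced to return codes.
#include <cstdint>
#include <vector>
#include <zmq.hpp>
#include <zmq_addon.hpp>

static int relay_startup_args(zmq::socket_t & recv_sock, zmq::socket_t & send_sock,
                              uint32_t rank, uint32_t n_world,
                              bool & should_profile, uint32_t & n_ctx) {
    std::vector<zmq::message_t> frames;
    if (!zmq::recv_multipart(recv_sock, std::back_inserter(frames))) {
        return -1;                                  // upstream node went away
    }

    // frames arrive as key/value pairs: "should_profile", bool, "n_ctx", uint32_t
    should_profile = *static_cast<bool *>(frames[1].data());
    n_ctx          = *static_cast<uint32_t *>(frames[3].data());

    if (rank != n_world - 1) {
        zmq::send_multipart(send_sock, frames);     // pass the same frames on
    }
    return 0;
}
```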
@@ -22024,10 +22206,42 @@ void llama_kv_cache_clear(struct llama_context * ctx) {
     llama_kv_cache_clear(ctx->kv_self);
 }

+void llama_send_kv_cache_clear(struct llama_context * ctx) {
+    if (ctx->send_socket == nullptr) {
+        return;
+    }
+
+    try {
+        std::vector<zmq::message_t> send_msgs;
+        const char * cmd = "clear_kv_cache";
+        send_msgs.emplace_back(cmd, strlen(cmd));
+        zmq::send_multipart(*ctx->send_socket, send_msgs);
+    } catch (const zmq::error_t & e) {
+        LLAMA_LOG_INFO("Failed to send KV cache clear signal: %s\n", e.what());
+    }
+}
+
 bool llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
     return llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
 }

+void llama_send_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    if (ctx->send_socket == nullptr) {
+        return;
+    }
+
+    try {
+        std::vector<zmq::message_t> msgs;
+        msgs.emplace_back("kv_seq_rm", strlen("kv_seq_rm"));
+        msgs.emplace_back(&seq_id, sizeof(seq_id));
+        msgs.emplace_back(&p0, sizeof(p0));
+        msgs.emplace_back(&p1, sizeof(p1));
+        zmq::send_multipart(*ctx->send_socket, msgs);
+    } catch (const zmq::error_t & e) {
+        LLAMA_LOG_WARN("Failed to send kv_seq_rm: %s\n", e.what());
+    }
+}
+
 void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
     if (seq_id_src == seq_id_dst) {
         return;

@@ -22035,6 +22249,24 @@ void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src,
     llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1);
 }

+void llama_send_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+    if (ctx->send_socket == nullptr) {
+        return;
+    }
+
+    try {
+        std::vector<zmq::message_t> msgs;
+        msgs.emplace_back("kv_seq_cp", strlen("kv_seq_cp"));
+        msgs.emplace_back(&seq_id_src, sizeof(seq_id_src));
+        msgs.emplace_back(&seq_id_dst, sizeof(seq_id_dst));
+        msgs.emplace_back(&p0, sizeof(p0));
+        msgs.emplace_back(&p1, sizeof(p1));
+        zmq::send_multipart(*ctx->send_socket, msgs);
+    } catch (const zmq::error_t & e) {
+        LLAMA_LOG_WARN("Failed to send kv_seq_cp: %s\n", e.what());
+    }
+}
+
 void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
     llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
 }

@@ -22047,6 +22279,24 @@ void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, lla
     llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta);
 }

+void llama_send_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
+    if (ctx->send_socket == nullptr) {
+        return;
+    }
+
+    try {
+        std::vector<zmq::message_t> msgs;
+        msgs.emplace_back("kv_seq_add", strlen("kv_seq_add"));
+        msgs.emplace_back(&seq_id, sizeof(seq_id));
+        msgs.emplace_back(&p0, sizeof(p0));
+        msgs.emplace_back(&p1, sizeof(p1));
+        msgs.emplace_back(&delta, sizeof(delta));
+        zmq::send_multipart(*ctx->send_socket, msgs);
+    } catch (const zmq::error_t & e) {
+        LLAMA_LOG_WARN("Failed to send kv_seq_add: %s\n", e.what());
+    }
+}
+
 void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
     if (d == 1) {
         return;

@@ -22055,6 +22305,24 @@ void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, lla
     llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
 }

+void llama_send_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+    if (ctx->send_socket == nullptr) {
+        return;
+    }
+
+    try {
+        std::vector<zmq::message_t> msgs;
+        msgs.emplace_back("kv_seq_div", strlen("kv_seq_div"));
+        msgs.emplace_back(&seq_id, sizeof(seq_id));
+        msgs.emplace_back(&p0, sizeof(p0));
+        msgs.emplace_back(&p1, sizeof(p1));
+        msgs.emplace_back(&d, sizeof(d));
+        zmq::send_multipart(*ctx->send_socket, msgs);
+    } catch (const zmq::error_t & e) {
+        LLAMA_LOG_WARN("Failed to send kv_seq_div: %s\n", e.what());
+    }
+}
+
 llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) {
     return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id);
 }