Merge branch 'main' into review

Lizonghang 2025-06-04 15:17:54 +04:00
commit 4adc3791dc
16 changed files with 2036 additions and 681 deletions

.gitignore

@@ -67,6 +67,7 @@ autogen-*.md
/main
/server
+/profile-tool

# CI
@@ -135,4 +136,8 @@ poetry.toml
/lora-tests

# Video
*.mp4
+
+# fio
+fio_test*
+*.fio

Makefile

@@ -1,5 +1,9 @@
# Define the default target now so that it is always the first target
-BUILD_TARGETS = llama-cli
+BUILD_TARGETS = \
+	llama-server \
+	llama-cli \
+	profile-tool
# BUILD_TARGETS = \
#	libllava.a \
#	llama-baby-llama \
@@ -268,7 +272,7 @@ MK_LDFLAGS += -L/usr/local/lib -lzmq
ifeq ($(UNAME_S),Darwin)
MK_CPPFLAGS += -isystem /opt/homebrew/include
-MK_LDFLAGS += -L/opt/homebrew/lib -lzmq
+MK_LDFLAGS += -L/opt/homebrew/lib
endif
ifeq ($(USE_HIGHS),1)
@@ -276,7 +280,7 @@ ifeq ($(USE_HIGHS),1)
HIGHS_LDFLAGS = -L/usr/local/lib -lhighs
ifeq ($(UNAME_S),Darwin)
HIGHS_CPPFLAGS += -isystem /opt/homebrew/include/highs
-HIGHS_LDFLAGS += -L/opt/homebrew/lib -lhighs
+HIGHS_LDFLAGS += -L/opt/homebrew/lib
endif
MK_CPPFLAGS += $(HIGHS_CPPFLAGS) -DUSE_HIGHS
MK_LDFLAGS += $(HIGHS_LDFLAGS)
@@ -1528,6 +1532,11 @@ llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

+profile-tool: tools/profile_tool.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
ifeq ($(UNAME_S),Darwin)
swift: examples/batched.swift
	(cd examples/batched.swift; make build)

README.md

@@ -34,7 +34,7 @@ And, if your devices are more powerful, you could unlock even more possibilities
> Device D4 runs inside a Termux-simulated Linux. Device D1 reads disk data in random mode and D2~D4 read in sequential mode.

-**Table 2:** Token latency for Llama models.
+**Table 2:** Token latency for Llama models (w/o device selection).

| **Model**   | **llama.cpp** | **exo** | **dllama** | **prima.cpp** |
|-------------|---------------|---------|------------|---------------|
| Llama 3-8B  | **15 ms**     | 263 ms  | 459 ms     | 54 ms         |
@@ -45,7 +45,7 @@ And, if your devices are more powerful, you could unlock even more possibilities
| Llama 1-65B | 8807 ms       | -       | -          | **569 ms**    |
| Llama 3-70B | 10120 ms      | OOM     | OOM        | **674 ms**    |

-**Table 3:** Token latency for Qwen 2.5, QwQ, and DeepSeek R1 models.
+**Table 3:** Token latency for Qwen 2.5, QwQ, and DeepSeek R1 models (w/o device selection).

| **Model** | **llama.cpp** | **exo** | **dllama** | **prima.cpp** |
|-----------|---------------|---------|------------|---------------|
@@ -61,7 +61,9 @@ And, if your devices are more powerful, you could unlock even more possibilities
> As video recording consumes some RAM, prima.cpp proactively reduces memory usage, resulting in slightly higher latency in the video compared to the table.

-> In current implementation, each device is assigned at least one model layer. For example, this leads to a 1:1:29:1 split for Llama 3-8B, which makes prima.cpp less efficient. In future updates, we will have a 0:0:32:0 split and idle devices removed, then llama.cpp would become a special case of prima.cpp when serving small models.
+> In the old version (w/o device selection), each device is assigned at least one model layer. This would lead to a 1:1:29:1 split for Llama 3-8B, which makes prima.cpp slower than llama.cpp.
+>
+> **New:** In the latest version (with device selection), we will have a 0:0:32:0 split and weak devices removed, then prima.cpp would become llama.cpp when serving small models.

## 🔑 Key Features

@@ -70,6 +72,7 @@ And, if your devices are more powerful, you could unlock even more possibilities
- **GPU & CPU Offloading:** If a device has a GPU, you can use both GPU and CPU for inference. For example, when VRAM is full, we can offload some model layers to RAM.
- **Piped-ring parallelism with prefetching:** Prefetch upcoming layer weights to overlap disk loading latency and use advanced piped-ring parallelism to prevent the "prefetch-release" effect. This new parallelism improves pipeline parallelism by using a ring structure and allows devices to run multiple cycles to predict a new token.
- **Heterogeneity-aware workload distribution:** A scheduler is designed to optimize workload distribution based on each device's computing power, disk speed, memory, and OS (the OS will affect the disk speed and the memory management strategy). It decides how many model layers a device should handle and how many should run on GPU (if available).
+- **Automatic device selection:** If there are weak devices and removing them would speed up inference, prima.cpp will automatically discover and remove them.
- **Quantization:** We now support Q4K, Q6K, Q80 and IQ1 quantization (GGUF format) and are exploring a Q4K-IQ1 hybrid for a better balance between performance and speed.
- **Support Models:** We now support hot models like the **Llama, Qwen (and QwQ), and DeepSeek series**. More will be added in future updates.
- **Cross-Platform:** The cluster can consist of devices with different OSs, including macOS, Linux, Android, HarmonyOS, etc. Now, Android and HarmonyOS devices require Termux, and Windows support will be added in future update.
@@ -78,27 +81,27 @@ And, if your devices are more powerful, you could unlock even more possibilities
Here are the models we have tested so far. You can also try more on Hugging Face!

### Llama
- **Llama 3-8B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/LLama-3-8b-Uncensored-i1-GGUF)):** [Meta-Llama-3-8B-Instruct](https://huggingface.co/bartowski/Meta-Llama-3-8B-Instruct-GGUF)
- **Llama 3-14B (Q4K, Q6K, Q80):** [Llama-3-14B-Instruct-v1](https://huggingface.co/RDson/Llama-3-14B-Instruct-v1-GGUF)
- **Llama 1-30B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/LLaMA-30B-HF-i1-GGUF)):** [upstage-llama-30b-instruct-2048](https://huggingface.co/TheBloke/upstage-llama-30b-instruct-2048-GGUF)
- **Llama 3-45B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/Llama-3-pruned-45B-Drobeta-Turnu-Severin-i1-GGUF)):** [Llama-3-pruned-45B-Drobeta-Turnu-Severin](https://huggingface.co/mradermacher/Llama-3-pruned-45B-Drobeta-Turnu-Severin-GGUF)
- **Llama 3-60B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/nyun-llama3-60B-i1-GGUF)):** [nyun-llama3-60B](https://huggingface.co/mradermacher/nyun-llama3-60B-GGUF)
- **Llama 1-65B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/llama-65b-instruct-i1-GGUF)):** [llama-65b](https://huggingface.co/TheBloke/LLaMA-65B-GGUF)
- **Llama 3-70B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/Meta-Llama-3-70B-Instruct-DPO-i1-GGUF)):** [Meta-Llama-3-70B-Instruct](https://huggingface.co/bartowski/Meta-Llama-3-70B-Instruct-GGUF)

### Qwen 2.5 / QwQ
- **Qwen 2.5-7B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/Qwen2.5-7B-i1-GGUF)):** [Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF)
- **Qwen 2.5-14B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/Qwen2.5-14B-i1-GGUF)):** [Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-GGUF)
- **Qwen 2.5-32B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/Qwen2.5-32B-i1-GGUF)):** [Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-GGUF)
- **Qwen 2.5-72B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/Qwen2.5-72B-Instruct-i1-GGUF)):** [Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-GGUF)
- **QwQ-32B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/QwQ-32B-i1-GGUF)):** [qwq-32b](https://huggingface.co/Qwen/QwQ-32B-GGUF)

### DeepSeek
- **DeepSeek R1-7B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/DeepSeek-R1-Distill-Qwen-7B-Uncensored-i1-GGUF)):** [deepseek-ai.DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/DevQuasar/deepseek-ai.DeepSeek-R1-Distill-Qwen-7B-GGUF)
- **DeepSeek R1-8B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/DeepSeek-R1-Distill-Llama-8B-i1-GGUF)):** [deepseek-ai.DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/DevQuasar/deepseek-ai.DeepSeek-R1-Distill-Llama-8B-GGUF)
- **DeepSeek R1-14B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/Qwen2.5-14B-DeepSeek-R1-1M-Uncensored-GGUF)):** [deepseek-ai.DeepSeek-R1-Distill-Qwen-14B](https://huggingface.co/DevQuasar/deepseek-ai.DeepSeek-R1-Distill-Qwen-14B-GGUF)
- **DeepSeek R1-32B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/mradermacher/deepseek-r1-qwen-2.5-32B-ablated-i1-GGUF)):** [deepseek-ai.DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/DevQuasar/deepseek-ai.DeepSeek-R1-Distill-Qwen-32B-GGUF)
- **DeepSeek R1-70B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-70B-GGUF)):** [DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-70B-GGUF)
## ⚙️ How to Use?

@@ -154,6 +157,7 @@ make GGML_CUDA=1 -j$(nproc)
make LLAMA_NO_METAL=1 -j$(nproc)

# To enable debug mode, add LLAMA_DEBUG=1:
# WARNING: Running in DEBUG mode will slow down inference!
make LLAMA_DEBUG=1 -j$(nproc)

# Otherwise, just use:
@@ -203,17 +207,17 @@ graph LR;
Take QwQ-32B as an example, run the following commands on the devices to launch distributed inference:

```shell
# On head device without a GPU, rank 0:
./llama-cli -m download/qwq-32b-q4_k_m.gguf -c 1024 -n 256 -p "what is edge AI?" --world 4 --rank 0 --master 192.168.1.2 --next 192.168.1.3 --prefetch

# On worker device with 8 GiB VRAM, rank 1:
./llama-cli -m download/qwq-32b-q4_k_m.gguf --world 4 --rank 1 --master 192.168.1.2 --next 192.168.1.4 --prefetch --gpu-mem 8

# On worker device with 11 GiB VRAM, rank 2:
./llama-cli -m download/qwq-32b-q4_k_m.gguf --world 4 --rank 2 --master 192.168.1.2 --next 192.168.1.5 --prefetch --gpu-mem 11

# On worker device without a GPU, rank 3:
./llama-cli -m download/qwq-32b-q4_k_m.gguf --world 4 --rank 3 --master 192.168.1.2 --next 192.168.1.2 --prefetch
```

Once started, prima.cpp will profile each device and decide how much workload to assign, e.g., how many model layers each device should handle, and how many of them should run on GPU (if available).
@@ -260,4 +264,108 @@ cd /root/prima.cpp
(prima-v4) ./llama-cli -m download/qwq-32b-q4_k_m.gguf -c 1024 --world 4 --rank 3 --prefetch --gpu-mem 8
```

> If your host machine does not have a GPU, ignore the `--gpu-mem` option.
> If you update to the latest code, non-rank 0 nodes can omit `-c 1024`.
### Run in Server Mode
You can run prima.cpp in server mode by launching `llama-server` on the rank 0 device (with `--host` and `--port` specified) and `llama-cli` on the others. Here is an example with 2 devices:
```shell
# On rank 0, run:
./llama-server -m download/qwq-32b-q4_k_m.gguf -c 1024 --world 2 --rank 0 --master 192.168.1.2 --next 192.168.1.3 --prefetch --host 127.0.0.1 --port 8080
# On rank 1, run:
./llama-cli -m download/qwq-32b-q4_k_m.gguf --world 2 --rank 1 --master 192.168.1.2 --next 192.168.1.2 --prefetch
```
After that, you can interact with the rank 0 device by calling the Chat Completion API:
```shell
curl http://127.0.0.1:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "qwq-32b",
"messages": [
{"role": "user", "content": "what is edge AI?"}
],
"max_tokens": 200,
"temperature": 0.7,
"stream": true
}'
```
You can also use third-party GUI clients like [AnythingLLM](https://anythingllm.com/) and point them at the API endpoint served by prima.cpp, which is `http://localhost:8080/v1` by default.
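If you prefer a single JSON response instead of a token stream, a minimal variant of the request above (same endpoint and fields, only `stream` switched off) looks like this:

```shell
# Same Chat Completion endpoint as above, with streaming disabled so the
# full reply is returned in one JSON object.
curl http://127.0.0.1:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "qwq-32b",
    "messages": [
      {"role": "user", "content": "what is edge AI?"}
    ],
    "max_tokens": 200,
    "temperature": 0.7,
    "stream": false
  }'
```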
## ❓ FAQ
**1. How can I manually set the workload for each device?**
By default, prima.cpp automatically profiles devices and assigns workloads. However, if you want to manually control the layer distribution, you can use the `-lw` (or `--layer-window`, `--n-layer-window`) and `-ngl` options:
```shell
# on head device without a GPU, rank 0, use the option "-lw":
./llama-cli -m download/qwq-32b-q4_k_m.gguf -c 1024 -n 256 -p "what is edge AI?" --world 4 --rank 0 --master 192.168.1.2 --next 192.168.1.3 --prefetch -lw "16,16,16,16"
# on worker device with 8 GiB VRAM, rank 1, use the option "-ngl":
./llama-cli -m download/qwq-32b-q4_k_m.gguf --world 4 --rank 1 --master 192.168.1.2 --next 192.168.1.4 --prefetch -ngl 16
# on worker device with 11 GiB VRAM, rank 2, use the option "-ngl":
./llama-cli -m download/qwq-32b-q4_k_m.gguf --world 4 --rank 2 --master 192.168.1.2 --next 192.168.1.5 --prefetch -ngl 16
# on worker device without a GPU, rank 3:
./llama-cli -m download/qwq-32b-q4_k_m.gguf --world 4 --rank 3 --master 192.168.1.2 --next 192.168.1.2 --prefetch
```
- `-lw` sets the total model layers each device should handle. The format is a comma-separated list, one value per device, in rank order. You can also set `"8,8,8,8"`, `"4,4,4,4"`, `"16,16,24,8"`.
- `-ngl` sets how many of those model layers should run on the GPU.
> Example: if `-lw "16,16,16,16"` is passed to the head device, then each of the 4 devices will handle 16 model layers. A worker with `-ngl 8` (if a GPU is available) will run 8/16 layers on the GPU.
**2. How to manually profile my device?**
If `-lw` is set, prima.cpp skips profiling and runs directly with the user-defined `-lw` and `-ngl`. If you wish to profile a device manually, run `profile-tool` on that device.
```shell
./profile-tool -m download/qwq-32b-q4_k_m.gguf
```
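Since the Makefile change in this commit adds `profile-tool` to `BUILD_TARGETS`, a plain `make` already builds it. To build just this tool on a device, a minimal sketch (assuming the same build flags you used for `llama-cli`, e.g. `GGML_CUDA=1`) is:

```shell
# Build only the profiling tool; it is also included in the default `make`.
make profile-tool -j$(nproc)
```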
**3. How to run in chat mode like in llama.cpp?**
To enable chat (conversation) mode, simply add the `-cnv` flag on the head device:
```shell
# on head device, rank 0, use the option "-cnv":
./llama-cli ... --rank 0 -p "You are an AI assistant" -cnv
```
To quit the chat mode, input `quit` or `exit`.
**4. How to force prefetching after computing?**
By default, prima.cpp only advises the OS to prefetch upcoming layer weights. The actual prefetching is then scheduled and handled by the OS, which may introduce some uncertainty. To explicitly trigger prefetching right after computing, you can use the `--force` flag on each device:
```shell
# on each device, use the option "--force":
./llama-cli ... --prefetch --force
```
This enables more aggressive overlap but also introduces extra memory access latency. Use `--force` only after testing, as its effect depends on your hardware and OS behavior.
**5. Does it support Windows?**
Not yet—but it's on the roadmap. Currently, prima.cpp can run on Linux, macOS, Android and HarmonyOS (via Termux). You can mix heterogeneous devices in the cluster.
**6. Does it support Vulkan or AMD GPUs?**
Not yet. For now, prima.cpp supports only CUDA-based GPUs. Vulkan is on our roadmap, and AMD GPUs will be supported once we have such a device to test on.
**7. Why did I get "No layer is assigned to me, exit"?**
No worries, this is expected. Prima.cpp found that this device was too slow, and dropping it could speed up inference, so it was removed.
## ❤️ Acknowledgment
This project builds upon the incredible work from the open-source community, especially [ggml, gguf](https://github.com/ggml-org/ggml), and [llama.cpp](https://github.com/ggml-org/llama.cpp). We gratefully acknowledge their contributions.
## 📚 Cite Us
If you find this work helpful, please do not hesitate to cite us and send a star! 🤩


@@ -986,13 +986,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
            params.enable_chat_template = false;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
-   add_opt(llama_arg(
-       {"--no-warmup"},
-       "skip warming up the model with an empty run",
-       [](gpt_params & params) {
-           params.warmup = false;
-       }
-   ).set_examples({LLAMA_EXAMPLE_MAIN}));
+   // add_opt(llama_arg(
+   //     {"--no-warmup"},
+   //     "skip warming up the model with an empty run",
+   //     [](gpt_params & params) {
+   //         params.warmup = false;
+   //     }
+   // ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(llama_arg(
        {"--spm-infill"},
        format(
@@ -1317,6 +1317,12 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
        {"-ctk", "--cache-type-k"}, "TYPE",
        format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
        [](gpt_params & params, const std::string & value) {
+#ifdef GGML_USE_METAL
+           LOG_WRN("The option -ctk or --cache-type-k is not supported on Metal, use default type\n");
+           return;
+#endif
            // TODO: get the type right here
            params.cache_type_k = value;
        }
@@ -1325,6 +1331,11 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
        {"-ctv", "--cache-type-v"}, "TYPE",
        format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
        [](gpt_params & params, const std::string & value) {
+#ifdef GGML_USE_METAL
+           LOG_WRN("The option -ctv or --cache-type-v is not supported on Metal, use default type\n");
+           return;
+#endif
            // TODO: get the type right here
            params.cache_type_v = value;
        }
@@ -1413,13 +1424,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
            params.defrag_thold = std::stof(value);
        }
    ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
-   add_opt(llama_arg(
-       {"-np", "--parallel"}, "N",
-       format("number of parallel sequences to decode (default: %d)", params.n_parallel),
-       [](gpt_params & params, int value) {
-           params.n_parallel = value;
-       }
-   ).set_env("LLAMA_ARG_N_PARALLEL"));
+   // add_opt(llama_arg(
+   //     {"-np", "--parallel"}, "N",
+   //     format("number of parallel sequences to decode (default: %d)", params.n_parallel),
+   //     [](gpt_params & params, int value) {
+   //         params.n_parallel = value;
+   //     }
+   // ).set_env("LLAMA_ARG_N_PARALLEL"));
    add_opt(llama_arg(
        {"-ns", "--sequences"}, "N",
        format("number of sequences to decode (default: %d)", params.n_sequences),


@@ -847,8 +847,7 @@ static std::string vec_to_str(const std::vector<T> & vec) {
}

static bool assign_layers_to_device(
    uint32_t n_world,
-   uint32_t my_rank,
    const device_info * dev_info_set,
    uint32_t * n_layer_window,
    uint32_t * n_gpu_layers,
@@ -857,15 +856,8 @@ static bool assign_layers_to_device(
    float min_disk_read_speed = 0.1f) { // minimum disk I/O speed: 100 MB/s
    GGML_ASSERT(dev_info_set != nullptr);
    GGML_ASSERT(n_layer_window != nullptr);
-   GGML_ASSERT(my_rank == 0);
-   // if only 1 device, it is assigned all layers
    const uint32_t n_layer = llama_model_n_layers(model);
-   if (n_world == 1) {
-       n_layer_window[0] = n_layer;
-       return true;
-   }

    std::vector<int> w(n_world, 0);
    std::vector<int> n(n_world, 0);
    std::vector<float> mem_budget(n_world, 0.0f);
@@ -901,13 +893,19 @@ static bool assign_layers_to_device(
        float t_read_ram_cpu = 0.0f;

        float t_calc_cpu = (
            master.model_flops.layer_f32_f32    / (dev.cpu_props.flops_f32_f32    * 1e9 + EPS) +
            master.model_flops.layer_f16_f32    / (dev.cpu_props.flops_f16_f32    * 1e9 + EPS) +
            master.model_flops.layer_q2k_f32    / (dev.cpu_props.flops_q2k_f32    * 1e9 + EPS) +
            master.model_flops.layer_q4k_f32    / (dev.cpu_props.flops_q4k_f32    * 1e9 + EPS) +
            master.model_flops.layer_q5k_f32    / (dev.cpu_props.flops_q5k_f32    * 1e9 + EPS) +
            master.model_flops.layer_q6k_f32    / (dev.cpu_props.flops_q6k_f32    * 1e9 + EPS) +
            master.model_flops.layer_iq2xxs_f32 / (dev.cpu_props.flops_iq2xxs_f32 * 1e9 + EPS) +
            master.model_flops.layer_q50_f32    / (dev.cpu_props.flops_q50_f32    * 1e9 + EPS) +
            master.model_flops.layer_q80_f32    / (dev.cpu_props.flops_q80_f32    * 1e9 + EPS) +
            master.model_flops.layer_iq1s_f32   / (dev.cpu_props.flops_iq1s_f32   * 1e9 + EPS) +
            master.model_flops.layer_iq4nl_f32  / (dev.cpu_props.flops_iq4nl_f32  * 1e9 + EPS) +
            master.model_flops.layer_iq1m_f32   / (dev.cpu_props.flops_iq1m_f32   * 1e9 + EPS) ) * 1000; // in ms

        float t_kv_cpy_cpu = dev.memory.mem_cpy_delay; // in ms
        // t_read_ram_cpu = b_prime / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms
@@ -921,24 +919,36 @@ static bool assign_layers_to_device(
        if (dev.gpu_support.metal) {
            t_calc_gpu = (
                master.model_flops.layer_f32_f32    / (dev.gpu_props.metal_flops_f32_f32    * 1e9 + EPS) +
                master.model_flops.layer_f16_f32    / (dev.gpu_props.metal_flops_f16_f32    * 1e9 + EPS) +
                master.model_flops.layer_q2k_f32    / (dev.gpu_props.metal_flops_q2k_f32    * 1e9 + EPS) +
                master.model_flops.layer_q4k_f32    / (dev.gpu_props.metal_flops_q4k_f32    * 1e9 + EPS) +
                master.model_flops.layer_q5k_f32    / (dev.gpu_props.metal_flops_q5k_f32    * 1e9 + EPS) +
                master.model_flops.layer_q6k_f32    / (dev.gpu_props.metal_flops_q6k_f32    * 1e9 + EPS) +
                master.model_flops.layer_iq2xxs_f32 / (dev.gpu_props.metal_flops_iq2xxs_f32 * 1e9 + EPS) +
                master.model_flops.layer_q50_f32    / (dev.gpu_props.metal_flops_q50_f32    * 1e9 + EPS) +
                master.model_flops.layer_q80_f32    / (dev.gpu_props.metal_flops_q80_f32    * 1e9 + EPS) +
                master.model_flops.layer_iq1s_f32   / (dev.gpu_props.metal_flops_iq1s_f32   * 1e9 + EPS) +
                master.model_flops.layer_iq4nl_f32  / (dev.gpu_props.metal_flops_iq4nl_f32  * 1e9 + EPS) +
                master.model_flops.layer_iq1m_f32   / (dev.gpu_props.metal_flops_iq1m_f32   * 1e9 + EPS) ) * 1000; // in ms
            t_kv_cpy_gpu = dev.gpu_props.metal_mem_cpy_delay; // in ms
            // t_read_ram_gpu = b_prime / (dev.gpu_props.metal_read_vram_bw * 1e9) * 1000; // in ms
        } else {
            t_calc_gpu = (
                master.model_flops.layer_f32_f32    / (dev.gpu_props.cuda_flops_f32_f32    * 1e9 + EPS) +
                master.model_flops.layer_f16_f32    / (dev.gpu_props.cuda_flops_f16_f32    * 1e9 + EPS) +
                master.model_flops.layer_q2k_f32    / (dev.gpu_props.cuda_flops_q2k_f32    * 1e9 + EPS) +
                master.model_flops.layer_q4k_f32    / (dev.gpu_props.cuda_flops_q4k_f32    * 1e9 + EPS) +
                master.model_flops.layer_q5k_f32    / (dev.gpu_props.cuda_flops_q5k_f32    * 1e9 + EPS) +
                master.model_flops.layer_q6k_f32    / (dev.gpu_props.cuda_flops_q6k_f32    * 1e9 + EPS) +
                master.model_flops.layer_iq2xxs_f32 / (dev.gpu_props.cuda_flops_iq2xxs_f32 * 1e9 + EPS) +
                master.model_flops.layer_q50_f32    / (dev.gpu_props.cuda_flops_q50_f32    * 1e9 + EPS) +
                master.model_flops.layer_q80_f32    / (dev.gpu_props.cuda_flops_q80_f32    * 1e9 + EPS) +
                master.model_flops.layer_iq1s_f32   / (dev.gpu_props.cuda_flops_iq1s_f32   * 1e9 + EPS) +
                master.model_flops.layer_iq4nl_f32  / (dev.gpu_props.cuda_flops_iq4nl_f32  * 1e9 + EPS) +
                master.model_flops.layer_iq1m_f32   / (dev.gpu_props.cuda_flops_iq1m_f32   * 1e9 + EPS) ) * 1000; // in ms
            t_kv_cpy_gpu = dev.gpu_props.cuda_mem_cpy_delay; // in ms
            // t_read_ram_gpu = b_prime / (dev.gpu_props.cuda_read_vram_bw * 1e9) * 1000; // in ms
        }
@@ -1084,9 +1094,8 @@ static bool assign_layers_to_device(
    };
    (void)print_matrix;

-   double final_objective = 1.0e30;
-   std::vector<double> final_solution;
-   int final_k = -1;
+   std::vector<double> final_solution, rollback_solution;
+   int final_k = -1, rollback_k = -1;

    // iterative optimization to find a valid set assignment (M1, M2, M3, M4)
    while (true) {
@@ -1113,14 +1122,18 @@ static bool assign_layers_to_device(
            if (m == 0) {
                kappa = (
                    dev.model_flops.layer_f32_f32    / (dev.cpu_props.flops_f32_f32    * 1e9 + EPS) +
                    dev.model_flops.layer_f16_f32    / (dev.cpu_props.flops_f16_f32    * 1e9 + EPS) +
                    dev.model_flops.layer_q2k_f32    / (dev.cpu_props.flops_q2k_f32    * 1e9 + EPS) +
                    dev.model_flops.layer_q4k_f32    / (dev.cpu_props.flops_q4k_f32    * 1e9 + EPS) +
                    dev.model_flops.layer_q5k_f32    / (dev.cpu_props.flops_q5k_f32    * 1e9 + EPS) +
                    dev.model_flops.layer_q6k_f32    / (dev.cpu_props.flops_q6k_f32    * 1e9 + EPS) +
                    dev.model_flops.layer_iq2xxs_f32 / (dev.cpu_props.flops_iq2xxs_f32 * 1e9 + EPS) +
                    dev.model_flops.layer_q50_f32    / (dev.cpu_props.flops_q50_f32    * 1e9 + EPS) +
                    dev.model_flops.layer_q80_f32    / (dev.cpu_props.flops_q80_f32    * 1e9 + EPS) +
                    dev.model_flops.layer_iq1s_f32   / (dev.cpu_props.flops_iq1s_f32   * 1e9 + EPS) +
                    dev.model_flops.layer_iq4nl_f32  / (dev.cpu_props.flops_iq4nl_f32  * 1e9 + EPS) +
                    dev.model_flops.layer_iq1m_f32   / (dev.cpu_props.flops_iq1m_f32   * 1e9 + EPS) ) * 1000; // in ms

                // kappa += (bi / n_vocab + bo) / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms
                kappa += (bi / n_vocab) / (disk_speed[m] * 1e9) * 1000; // in ms
@@ -1354,19 +1367,48 @@ static bool assign_layers_to_device(
            // get the solution
            const HighsModelStatus& model_status = highs.getModelStatus();
-           if (model_status != HighsModelStatus::kOptimal) continue;
+           if (model_status != HighsModelStatus::kOptimal) {
+               bool is_all_in_M4 = true;
+               for (uint32_t m = 0; m < n_world; ++m) {
+                   if (!in_set(m, M4)) {
+                       is_all_in_M4 = false;
+                       break;
+                   }
+               }
+               if (!is_all_in_M4) continue;
+           }

            // record the best solution
            const HighsSolution& solution = highs.getSolution();
            double objective_value = highs.getInfo().objective_function_value;
-           if (objective_value < best_objective) {
-               best_objective = objective_value;
-               best_k = k;
-               best_solution = solution.col_value;
-           }
-           LOG_INF("k = %2d, obj = %7.1f, solution: %s | best_k = %2d, best_obj = %7.1f, best_solution: %s\n",
-               k, objective_value, vec_to_str(solution.col_value).c_str(), best_k, best_objective, vec_to_str(best_solution).c_str());
+           if (solution.value_valid) {
+               if (objective_value < best_objective) {
+                   best_objective = objective_value;
+                   best_k = k;
+                   best_solution = solution.col_value;
+               }
+               LOG_INF("k = %2d, obj = %7.1f, solution: %s | best_k = %2d, best_obj = %7.1f, best_solution: %s\n",
+                   k, objective_value, vec_to_str(solution.col_value).c_str(), best_k, best_objective, vec_to_str(best_solution).c_str());
+           }
+       }
+
+       if (best_solution.empty()) {
+           LOG_INF("No feasible solution found for this set assignment, rolling back to previous sets.\n");
+           final_solution = rollback_solution;
+           final_k = rollback_k;
+
+           // update w[m] and n[m]
+           GGML_ASSERT(final_solution.size() == n_world * 2 && "Invalid solution\n");
+           std::copy(final_solution.begin(), final_solution.begin() + n_world, w.begin());
+           std::copy(final_solution.begin() + n_world, final_solution.end(), n.begin());
+           break;
+       } else {
+           rollback_solution = best_solution;
+           rollback_k = best_k;
        }

        // check the solution
@@ -1420,7 +1462,6 @@ static bool assign_layers_to_device(
        // update the global best solution
        final_k = best_k;
-       final_objective = best_objective;
        final_solution = best_solution;

        if (solution_unchanged) break;
@@ -1439,8 +1480,7 @@ static bool assign_layers_to_device(
        LOG_INF(" - N Layer Window : %d\n", w[m]);
        LOG_INF(" - N GPU Layers : %d\n", n[m]);
    }
-   // LOG_INF("\nEstimated Latency: %.3f ms\n", final_objective);
-   // LOG_INF("------------------------------------------");
+   LOG_INF("\n");

    // copy value from w and n to n_layer_window and n_gpu_layers, respectively
    std::copy(w.begin(), w.end(), n_layer_window);
@@ -1500,11 +1540,81 @@ static bool assign_layers_to_device(
    return true;
}
static bool assign_layers_and_select_devices(
uint32_t n_world,
std::vector<device_info> dev_infos,
uint32_t * n_layer_window,
uint32_t * n_gpu_layers,
struct llama_model * model,
const struct llama_context_params cparams) {
memset(n_layer_window, 0, n_world * sizeof(uint32_t));
memset(n_gpu_layers, 0, n_world * sizeof(uint32_t));
std::vector<device_info> dev_infos_temp = dev_infos;
std::vector<uint32_t> n_layer_windows_temp, n_gpu_layers_temp;
while (n_world > 0) {
std::vector<device_info> dev_infos_ = dev_infos_temp;
std::vector<uint32_t> n_layer_windows_(n_world, 0), n_gpu_layers_(n_world, 0);
if (!assign_layers_to_device(n_world, dev_infos_.data(),
n_layer_windows_.data(), n_gpu_layers_.data(), model, cparams)) {
return false;
}
dev_infos_temp.clear();
n_layer_windows_temp.clear();
n_gpu_layers_temp.clear();
for (uint32_t i = 0; i < n_world; i++) {
if (n_layer_windows_[i] > 1 || i == 0 ) {
dev_infos_temp.push_back(dev_infos_[i]);
n_layer_windows_temp.push_back(n_layer_windows_[i]);
n_gpu_layers_temp.push_back(n_gpu_layers_[i]);
} else {
// remove this device
LOG_INF("Remove device %s (rank %d) with only %d layer assigned.\n",
dev_infos_[i].device_name, dev_infos_[i].rank, n_layer_windows_[i]);
}
}
if(dev_infos_temp.size() == n_world) {
// no device was removed
break;
}
n_world = dev_infos_temp.size();
LOG_INF("Reassign layers to the remaining %d device(s).\n\n", n_world);
}
uint32_t i = 0 , j = 0;
while (j < n_world) {
if (dev_infos[i].rank == dev_infos_temp[j].rank) {
n_layer_window[i] = n_layer_windows_temp[j];
n_gpu_layers[i] = n_gpu_layers_temp[j];
j++;
} else {
n_layer_window[i] = 0;
n_gpu_layers[i] = 0;
}
i++;
}
return true;
}
//
// Model utils
//

struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
#if !(defined(GGML_USE_METAL) || defined(GGML_USE_CUDA))
// reset n_gpu_layers to 0 if GPU is not used
params.n_gpu_layers = 0;
#endif
    llama_init_result iparams;
    auto mparams = llama_model_params_from_gpt_params(params);
@@ -1554,57 +1664,137 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
    uint32_t my_rank = params.rank;
    bool auto_schedule = params.n_layer_window[0] == 0;

-   // get device profile
-   LOG_INF("\nstart profiling this device, this may take some seconds ...\n");
-   dev_info.rank = params.rank;
-   if (n_world > 1) {
-       llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
-   }

    // create llama context
    struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
    llama_context * lctx = llama_new_context_with_model(model, cparams);

    if (n_world == 1) {
        uint32_t n_layers = llama_model_n_layers(model);
        // assign all layers to this device
        params.n_layer_window[0]              = n_layers;
        cparams.n_layer_window[0]             = n_layers;
        mparams.n_layer_window[0]             = n_layers;
        llama_context_n_layer_window(lctx)[0] = n_layers;
#if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
params.n_gpu_layers = std::min((int32_t)n_layers, params.n_gpu_layers);
cparams.n_gpu_layers = params.n_gpu_layers;
mparams.n_gpu_layers = params.n_gpu_layers;
#endif
    } else {
        uint32_t n_layer_window[32] = {0}, n_gpu_layers[32] = {0};

        // initialize sockets
        llama_init_sockets(lctx, n_world, my_rank);
// broadcast startup args
struct startup_args args;
if (my_rank == 0){
args.should_profile = auto_schedule;
args.n_ctx = params.n_ctx;
}
llama_bcast_startup_args(lctx, my_rank, &args);
if (my_rank > 0) {
// receive startup args
auto_schedule = args.should_profile;
params.n_ctx = args.n_ctx;
cparams.n_ctx = args.n_ctx;
}
        // if n_world > 1 and auto schedule is needed, then profile
if (auto_schedule){
// get device profile
LOG_INF("\nstart profiling this device, this may take some seconds ...\n");
dev_info.rank = params.rank;
dev_info.next_ip = params.next_node_ip.c_str();
if (n_world > 1) {
llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
}
}
        // synchronize device profile to the master node
-       struct device_info * dev_info_set = nullptr;
        if (my_rank == 0) {
-           dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
-           dev_info_set[0] = dev_info;
-           llama_gather_device_info(lctx, dev_info_set);
-           device_print_props(dev_info_set, n_world, model, cparams);
            if (auto_schedule) {
-               // automatically determine n_layer_window and n_gpu_layers
-               if (!assign_layers_to_device(n_world, my_rank, dev_info_set, n_layer_window, n_gpu_layers, model, cparams)) {
+               std::vector<device_info> dev_info_set(n_world);
+               dev_info_set[0] = dev_info;
+               llama_gather_device_info(lctx, dev_info_set.data());
+               device_print_props(dev_info_set.data(), n_world, model, cparams);
+               // assign layers to devices and remove weak devices
+               if (!assign_layers_and_select_devices(n_world, dev_info_set, n_layer_window, n_gpu_layers, model, cparams)) {
                    LOG_ERR("%s: Invalid allocation by HiGHS solver\n", __func__);
                    llama_free(lctx);
                    llama_free_model(model);
                    return iparams;
                }
                llama_bcast_layer_setup(lctx, n_layer_window, n_gpu_layers);
+               llama_rebuild_topo(lctx, n_layer_window, dev_info_set.data());
            } else {
                // use the user-defined n_layer_window
                std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), n_layer_window);
                llama_bcast_layer_setup(lctx, n_layer_window, nullptr);
            }
        } else {
-           llama_send_device_info(lctx, &dev_info);
-           llama_recv_layer_setup(lctx, n_layer_window, n_gpu_layers);
+           if (auto_schedule) {
+               llama_send_device_info(lctx, &dev_info);
+               llama_recv_layer_setup(lctx, n_layer_window, n_gpu_layers);
+               llama_rebuild_topo (lctx, n_layer_window, nullptr);
+           } else {
+               llama_recv_layer_setup(lctx, n_layer_window, n_gpu_layers);
+           }
        }
// if this is a weak device, then exit
if (n_layer_window[my_rank] <= 0) {
LOG_INF("No layer is assigned to me, exit.\n");
llama_free(lctx);
llama_free_model(model);
exit(0);
}
// update my rank and n_world
uint32_t update_rank = 0, update_n_world = 1;
std::vector<uint32_t> n_layer_window_temp = {n_layer_window[0]}, n_gpu_layers_temp = {n_gpu_layers[0]};
for (uint32_t i = 1; i < n_world; i++) {
if (n_layer_window[i] <= 0) {
continue;
}
if (i <= my_rank) {
update_rank++;
}
update_n_world++;
n_layer_window_temp.push_back(n_layer_window[i]);
n_gpu_layers_temp.push_back(n_gpu_layers[i]);
}
memset(n_layer_window, 0, n_world * sizeof(uint32_t));
memset(n_gpu_layers, 0, n_world * sizeof(uint32_t));
for (uint32_t i = 0; i < update_n_world; i++) {
n_layer_window[i] = n_layer_window_temp[i];
n_gpu_layers[i] = n_gpu_layers_temp[i];
}
// update my rank
cparams.rank = update_rank;
mparams.rank = update_rank;
params.rank = update_rank;
my_rank = update_rank;
// update n_world
cparams.n_world = update_n_world;
mparams.n_world = update_n_world;
params.n_world = update_n_world;
n_world = update_n_world;
llama_update_context_with_rankworld(lctx, update_rank, update_n_world);
        // update n_layer_window and n_gpu_layers
        std::copy(std::begin(n_layer_window), std::end(n_layer_window), params.n_layer_window);
        std::copy(std::begin(n_layer_window), std::end(n_layer_window), cparams.n_layer_window);
@@ -1616,6 +1806,11 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
            cparams.n_gpu_layers = n_gpu_layers[my_rank];
            mparams.n_gpu_layers = n_gpu_layers[my_rank];
            llama_model_set_n_gpu_layers(model, n_gpu_layers[my_rank]);
} else { // -ngl is set
params.n_gpu_layers = std::min(params.n_gpu_layers, (int32_t)n_layer_window[my_rank]);
cparams.n_gpu_layers = params.n_gpu_layers;
mparams.n_gpu_layers = params.n_gpu_layers;
llama_model_set_n_gpu_layers(model, params.n_gpu_layers);
        }
    }
@@ -1685,7 +1880,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
    }

    if (params.warmup) {
-       LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
+       LOG_WRN("%s: warming up the model with an empty run - please wait ...\n", __func__);

        const uint32_t my_rank = cparams.rank;
        std::vector<llama_token> tmp;
@@ -1766,33 +1961,25 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
    return mparams;
}
+const std::vector<ggml_type> kv_cache_types = {
+    GGML_TYPE_F32,
+    GGML_TYPE_F16,
+    GGML_TYPE_BF16, // Added BF16 data type support
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1,
+    GGML_TYPE_IQ4_NL,
+    GGML_TYPE_Q5_0,
+    GGML_TYPE_Q5_1,
+};

static ggml_type kv_cache_type_from_str(const std::string & s) {
-   if (s == "f32")    { return GGML_TYPE_F32;    }
-   if (s == "f16")    { return GGML_TYPE_F16;    }
-   if (s == "q8_0")   { return GGML_TYPE_Q8_0;   }
-   if (s == "q4_0")   { return GGML_TYPE_Q4_0;   }
-   if (s == "q4_1")   { return GGML_TYPE_Q4_1;   }
-   if (s == "iq4_nl") { return GGML_TYPE_IQ4_NL; }
-   if (s == "q5_0")   { return GGML_TYPE_Q5_0;   }
-   if (s == "q5_1")   { return GGML_TYPE_Q5_1;   }
-   throw std::runtime_error("Invalid cache type: " + s);
+   for (const auto & type : kv_cache_types) {
+       if (ggml_type_name(type) == s) {
+           return type;
+       }
+   }
+   throw std::runtime_error("Unsupported cache type: " + s);
}

struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {

(One file diff suppressed because it is too large.)


@@ -15,25 +15,36 @@ struct cpu_props {
    const char * name;
    const char * description;
    uint32_t cores;
    float flops_f32_f32;    // in GFLOPS
    float flops_f16_f32;    // in GFLOPS
    float flops_q2k_f32;    // in GFLOPS
    float flops_q4k_f32;    // in GFLOPS
    float flops_q5k_f32;    // in GFLOPS
    float flops_q6k_f32;    // in GFLOPS
    float flops_iq2xxs_f32; // in GFLOPS
    float flops_q50_f32;    // in GFLOPS
    float flops_q80_f32;    // in GFLOPS
    float flops_iq1s_f32;   // in GFLOPS
    float flops_iq4nl_f32;  // in GFLOPS
    float flops_iq1m_f32;   // in GFLOPS

    cpu_props()
        : name            (""),
          description     (""),
          cores           (0),
          flops_f32_f32   (0.0f),
          flops_f16_f32   (0.0f),
          flops_q2k_f32   (0.0f),
          flops_q4k_f32   (0.0f),
          flops_q5k_f32   (0.0f),
          flops_q6k_f32   (0.0f),
          flops_iq2xxs_f32(0.0f),
          flops_q50_f32   (0.0f),
          flops_q80_f32   (0.0f),
          flops_iq1s_f32  (0.0f),
          flops_iq4nl_f32 (0.0f),
          flops_iq1m_f32  (0.0f)
    {}
};
struct memory_info {
@@ -77,132 +88,204 @@ struct gpu_support {
struct gpu_props {
    const char * name;
    const char * description;
    float memory_free;            // in GiB
    float memory_total;           // in GiB
    float metal_read_vram_bw;     // in GB/s
    float metal_flops_f32_f32;    // in GFLOPS
    float metal_flops_f16_f32;    // in GFLOPS
    float metal_flops_q2k_f32;    // in GFLOPS
    float metal_flops_q4k_f32;    // in GFLOPS
    float metal_flops_q5k_f32;    // in GFLOPS
    float metal_flops_q6k_f32;    // in GFLOPS
    float metal_flops_iq2xxs_f32; // in GFLOPS
    float metal_flops_q50_f32;    // in GFLOPS
    float metal_flops_q80_f32;    // in GFLOPS
    float metal_flops_iq1s_f32;   // in GFLOPS
    float metal_flops_iq4nl_f32;  // in GFLOPS
    float metal_flops_iq1m_f32;   // in GFLOPS
    float metal_mem_cpy_delay;    // in ms
    float cuda_read_vram_bw;      // in GB/s
    float cuda_flops_f32_f32;     // in GFLOPS
    float cuda_flops_f16_f32;     // in GFLOPS
    float cuda_flops_q2k_f32;     // in GFLOPS
    float cuda_flops_q4k_f32;     // in GFLOPS
    float cuda_flops_q5k_f32;     // in GFLOPS
    float cuda_flops_q6k_f32;     // in GFLOPS
    float cuda_flops_iq2xxs_f32;  // in GFLOPS
    float cuda_flops_q50_f32;     // in GFLOPS
    float cuda_flops_q80_f32;     // in GFLOPS
    float cuda_flops_iq1s_f32;    // in GFLOPS
    float cuda_flops_iq4nl_f32;   // in GFLOPS
    float cuda_flops_iq1m_f32;    // in GFLOPS
    float cuda_mem_cpy_delay;     // in ms

    gpu_props() :
        name                  (""),
        description           (""),
        memory_free           (0.0f),
        memory_total          (0.0f),
        metal_read_vram_bw    (0.0f),
        metal_flops_f32_f32   (0.0f),
        metal_flops_f16_f32   (0.0f),
        metal_flops_q2k_f32   (0.0f),
        metal_flops_q4k_f32   (0.0f),
        metal_flops_q5k_f32   (0.0f),
        metal_flops_q6k_f32   (0.0f),
        metal_flops_iq2xxs_f32(0.0f),
        metal_flops_q50_f32   (0.0f),
        metal_flops_q80_f32   (0.0f),
        metal_flops_iq1s_f32  (0.0f),
        metal_flops_iq4nl_f32 (0.0f),
        metal_flops_iq1m_f32  (0.0f),
        metal_mem_cpy_delay   (0.0f),
        cuda_read_vram_bw     (0.0f),
        cuda_flops_f32_f32    (0.0f),
        cuda_flops_f16_f32    (0.0f),
        cuda_flops_q2k_f32    (0.0f),
        cuda_flops_q4k_f32    (0.0f),
        cuda_flops_q5k_f32    (0.0f),
        cuda_flops_q6k_f32    (0.0f),
        cuda_flops_iq2xxs_f32 (0.0f),
        cuda_flops_q50_f32    (0.0f),
        cuda_flops_q80_f32    (0.0f),
        cuda_flops_iq1s_f32   (0.0f),
        cuda_flops_iq4nl_f32  (0.0f),
        cuda_flops_iq1m_f32   (0.0f),
        cuda_mem_cpy_delay    (0.0f) {}
};
struct model_flops {
    float inp_embd_ms;
    int64_t output_f32_f32;
    int64_t output_f16_f32;
    int64_t output_q2k_f32;
    int64_t output_q4k_f32;
    int64_t output_q5k_f32;
    int64_t output_q6k_f32;
    int64_t output_iq2xxs_f32;
    int64_t output_q50_f32;
    int64_t output_q80_f32;
    int64_t output_iq1s_f32;
    int64_t output_iq4nl_f32;
    int64_t output_iq1m_f32;
    int64_t layer_f32_f32;
    int64_t layer_f16_f32;
    int64_t layer_q2k_f32;
    int64_t layer_q4k_f32;
    int64_t layer_q5k_f32;
    int64_t layer_q6k_f32;
    int64_t layer_iq2xxs_f32;
    int64_t layer_q50_f32;
    int64_t layer_q80_f32;
    int64_t layer_iq1s_f32;
    int64_t layer_iq4nl_f32;
    int64_t layer_iq1m_f32;

    model_flops() :
        inp_embd_ms       (0.0f),
        output_f32_f32    (0),
        output_f16_f32    (0),
        output_q2k_f32    (0),
        output_q4k_f32    (0),
        output_q5k_f32    (0),
        output_q6k_f32    (0),
        output_iq2xxs_f32 (0),
        output_q50_f32    (0),
        output_q80_f32    (0),
        output_iq1s_f32   (0),
        output_iq4nl_f32  (0),
        output_iq1m_f32   (0),
        layer_f32_f32     (0),
        layer_f16_f32     (0),
        layer_q2k_f32     (0),
        layer_q4k_f32     (0),
        layer_q5k_f32     (0),
        layer_q6k_f32     (0),
        layer_iq2xxs_f32  (0),
        layer_q50_f32     (0),
        layer_q80_f32     (0),
        layer_iq1s_f32    (0),
        layer_iq4nl_f32   (0),
        layer_iq1m_f32    (0)
    {}
};
struct model_params {
    int64_t input_f32;
    int64_t input_f16;
    int64_t input_q2k;
    int64_t input_q4k;
    int64_t input_q5k;
    int64_t input_q6k;
    int64_t input_iq2xxs;
    int64_t input_q50;
    int64_t input_q80;
    int64_t input_iq1s;
    int64_t input_iq4nl;
    int64_t input_iq1m;
    int64_t output_f32;
    int64_t output_f16;
    int64_t output_q2k;
    int64_t output_q4k;
    int64_t output_q5k;
    int64_t output_q6k;
    int64_t output_iq2xxs;
    int64_t output_q50;
    int64_t output_q80;
    int64_t output_iq1s;
    int64_t output_iq4nl;
    int64_t output_iq1m;
    int64_t layer_f32;
    int64_t layer_f16;
    int64_t layer_q2k;
    int64_t layer_q4k;
    int64_t layer_q5k;
    int64_t layer_q6k;
    int64_t layer_iq2xxs;
    int64_t layer_q50;
    int64_t layer_q80;
    int64_t layer_iq1s;
    int64_t layer_iq4nl;
    int64_t layer_iq1m;

    model_params() :
        input_f32     (0),
        input_f16     (0),
        input_q2k     (0),
        input_q4k     (0),
        input_q5k     (0),
        input_q6k     (0),
        input_iq2xxs  (0),
        input_q50     (0),
        input_q80     (0),
        input_iq1s    (0),
        input_iq4nl   (0),
        input_iq1m    (0),
        output_f32    (0),
        output_f16    (0),
        output_q2k    (0),
        output_q4k    (0),
        output_q5k    (0),
        output_q6k    (0),
        output_iq2xxs (0),
        output_q50    (0),
        output_q80    (0),
        output_iq1s   (0),
        output_iq4nl  (0),
        output_iq1m   (0),
        layer_f32     (0),
        layer_f16     (0),
        layer_q2k     (0),
        layer_q4k     (0),
        layer_q5k     (0),
        layer_q6k     (0),
        layer_iq2xxs  (0),
        layer_q50     (0),
        layer_q80     (0),
        layer_iq1s    (0),
        layer_iq4nl   (0),
        layer_iq1m    (0)
    {}
};
struct model_bytes { struct model_bytes {
@ -229,10 +312,16 @@ struct disk_props {
write_rnd_bw(0.0f) {} write_rnd_bw(0.0f) {}
}; };
struct startup_args{
bool should_profile;
uint32_t n_ctx;
};
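A hedged sketch of how this new struct might be used with llama_bcast_startup_args(), which this diff declares further down in llama.h. The wrapper below is illustrative, not the actual startup path, and assumes startup_args is visible via the profiler header defined above:

    #include "llama.h"   // declares llama_bcast_startup_args(); startup_args comes from the profiler header above

    // Illustration: the head node fills the shared settings, and each rank calls the
    // broadcast helper with its own rank so the values end up identical everywhere.
    static void share_startup_settings(llama_context * ctx, uint32_t my_rank,
                                       uint32_t n_ctx, bool should_profile) {
        startup_args args;
        args.should_profile = should_profile;   // whether each node should run the profiler
        args.n_ctx          = n_ctx;            // context size agreed across nodes
        llama_bcast_startup_args(ctx, my_rank, &args);
    }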
struct device_info { struct device_info {
uint32_t rank; uint32_t rank;
const char * device_name; const char * device_name;
const char * device_os; const char * device_os;
const char * next_ip;
struct disk_props disk; struct disk_props disk;
struct cpu_props cpu_props; struct cpu_props cpu_props;
struct memory_info memory; struct memory_info memory;
@ -246,6 +335,7 @@ struct device_info {
rank(0), rank(0),
device_name(""), device_name(""),
device_os(""), device_os(""),
next_ip(""),
disk(), disk(),
cpu_props(), cpu_props(),
memory(), memory(),


@ -143,8 +143,8 @@ int main(int argc, char ** argv) {
return 1; return 1;
} }
const uint32_t n_world = params.n_world; uint32_t n_world = params.n_world;
const uint32_t my_rank = params.rank; uint32_t my_rank = params.rank;
GGML_ASSERT(!(n_world == 1 && my_rank > 0)); GGML_ASSERT(!(n_world == 1 && my_rank > 0));
// check if --n-layer-window and --world is matched // check if --n-layer-window and --world is matched
@ -201,6 +201,10 @@ int main(int argc, char ** argv) {
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
llama_init_result llama_init = llama_init_from_gpt_params(params); llama_init_result llama_init = llama_init_from_gpt_params(params);
// update my rank and world size if any devices removed
my_rank = params.rank;
n_world = params.n_world;
model = llama_init.model; model = llama_init.model;
ctx = llama_init.context; ctx = llama_init.context;
@ -348,6 +352,9 @@ int main(int argc, char ** argv) {
// remove any "future" tokens that we might have inherited from the previous session // remove any "future" tokens that we might have inherited from the previous session
llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1); llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
if (my_rank == 0) {
llama_send_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
}
} }
LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n", LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
@ -593,6 +600,11 @@ int main(int argc, char ** argv) {
llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
if (my_rank == 0) {
llama_send_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
llama_send_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
}
n_past -= n_discard; n_past -= n_discard;
LOG_DBG("after swap: n_past = %d\n", n_past); LOG_DBG("after swap: n_past = %d\n", n_past);


@ -116,7 +116,7 @@ struct server_task {
}; };
struct server_task_result { struct server_task_result {
int id = -1; int id = -1;
json data; json data;
@ -1063,6 +1063,9 @@ struct server_context {
// clear the entire KV cache // clear the entire KV cache
llama_kv_cache_clear(ctx); llama_kv_cache_clear(ctx);
llama_send_kv_cache_clear(ctx);
clean_kv_cache = false; clean_kv_cache = false;
} }
@ -1191,7 +1194,7 @@ struct server_context {
SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict); SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict);
} }
// if context shift is disabled, we stop when it reaches the context limit // we stop when it reaches the context limit, otherwise it may run forever
if (slot.n_decoded >= slot.n_ctx) { if (slot.n_decoded >= slot.n_ctx) {
slot.truncated = true; slot.truncated = true;
slot.stopped_limit = true; slot.stopped_limit = true;
@ -1917,8 +1920,11 @@ struct server_context {
SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard); llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard);
llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); llama_kv_cache_seq_add (ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
llama_send_kv_cache_seq_rm (ctx, slot.id , n_keep , n_keep + n_discard);
llama_send_kv_cache_seq_add(ctx, slot.id , n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
if (slot.params.cache_prompt) { if (slot.params.cache_prompt) {
for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
@ -2084,7 +2090,6 @@ struct server_context {
// if input prompt is too big, truncate it (if group attention self-extend is disabled) // if input prompt is too big, truncate it (if group attention self-extend is disabled)
if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx) { if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx) {
const int n_left = slot.n_ctx - slot.params.n_keep; const int n_left = slot.n_ctx - slot.params.n_keep;
const int n_block_size = n_left / 2; const int n_block_size = n_left / 2;
const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size; const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
@ -2161,12 +2166,14 @@ struct server_context {
int p0 = (int) system_tokens.size() + slot.n_past; int p0 = (int) system_tokens.size() + slot.n_past;
if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) { if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) {
// could not partially delete (likely using a non-Transformer model) // could not partially delete (likely using a non-Transformer model)
llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1); llama_kv_cache_seq_rm (ctx, slot.id + 1, -1, -1);
llama_send_kv_cache_seq_rm(ctx, slot.id , -1, -1);
p0 = (int) system_tokens.size(); p0 = (int) system_tokens.size();
if (p0 != 0) { if (p0 != 0) {
// copy over the system prompt when there is one // copy over the system prompt when there is one
llama_kv_cache_seq_cp(ctx, 0, slot.id + 1, -1, -1); llama_kv_cache_seq_cp (ctx, 0, slot.id + 1, -1, -1);
llama_send_kv_cache_seq_cp(ctx, 0, slot.id , -1, -1);
} }
// there is no common part left (except for the system prompt) // there is no common part left (except for the system prompt)
@ -2175,6 +2182,8 @@ struct server_context {
slot.ga_i = 0; slot.ga_i = 0;
// TODO: is the system prompt ever in the sampling context? // TODO: is the system prompt ever in the sampling context?
gpt_sampler_reset(slot.smpl); gpt_sampler_reset(slot.smpl);
} else {
llama_send_kv_cache_seq_rm(ctx, slot.id, p0, -1);
} }
// remove the non-common part from the cache // remove the non-common part from the cache
@ -2260,9 +2269,14 @@ struct server_context {
SLT_DBG(slot, "div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); SLT_DBG(slot, "div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd); llama_kv_cache_seq_add (ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd);
llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n); llama_send_kv_cache_seq_add(ctx, slot.id , slot.ga_i, slot.n_past_se, ib * bd);
llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd);
llama_kv_cache_seq_div (ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
llama_send_kv_cache_seq_div(ctx, slot.id , slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
llama_kv_cache_seq_add (ctx, slot.id + 1, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd);
llama_send_kv_cache_seq_add(ctx, slot.id , slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd);
slot.n_past_se -= bd; slot.n_past_se -= bd;
@ -3329,10 +3343,6 @@ int main(int argc, char ** argv) {
// bind HTTP listen port, run the HTTP server in a thread // bind HTTP listen port, run the HTTP server in a thread
if (!svr->bind_to_port(params.hostname, params.port)) { if (!svr->bind_to_port(params.hostname, params.port)) {
//LOG_ERROR("couldn't bind HTTP server socket", {
// {"hostname", params.hostname},
// {"port", params.port},
//});
LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, params.hostname.c_str(), params.port); LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, params.hostname.c_str(), params.port);
clean_up(); clean_up();
return 1; return 1;
@ -3377,10 +3387,6 @@ int main(int argc, char ** argv) {
ctx_server.queue_tasks.terminate(); ctx_server.queue_tasks.terminate();
}; };
LOG_INF("%s: server is listening on %s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
ctx_server.queue_tasks.start_loop();
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action; struct sigaction sigint_action;
sigint_action.sa_handler = signal_handler; sigint_action.sa_handler = signal_handler;
@ -3395,6 +3401,13 @@ int main(int argc, char ** argv) {
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true); SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif #endif
LOG_INF("%s: server is listening on %s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
ctx_server.queue_tasks.start_loop();
char * stop_signal = nullptr;
llama_free_sockets(ctx_server.ctx, &stop_signal);
clean_up(); clean_up();
t.join(); t.join();
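The server-side reordering above boils down to: install signal handlers before blocking in the task loop, and once the loop exits, release the distributed sockets (notifying the other nodes) before local cleanup. A stripped-down sketch of that ordering with stand-in handler and loop code, not the real server.cpp types:

    #include "llama.h"
    #include <csignal>

    static volatile std::sig_atomic_t g_stop = 0;
    static void on_signal(int) { g_stop = 1; }

    static void serve(llama_context * ctx) {
        std::signal(SIGINT, on_signal);          // 1. handlers first, so a signal can end the loop
        while (!g_stop) {
            // 2. process queued tasks (stand-in for ctx_server.queue_tasks.start_loop())
        }
        char * stop_signal = nullptr;
        llama_free_sockets(ctx, &stop_signal);   // 3. release sockets / signal peer nodes to stop
        // 4. local cleanup runs after the sockets are gone
    }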


@ -385,12 +385,12 @@ extern "C" {
GGML_TYPE_F64 = 28, GGML_TYPE_F64 = 28,
GGML_TYPE_IQ1_M = 29, GGML_TYPE_IQ1_M = 29,
GGML_TYPE_BF16 = 30, GGML_TYPE_BF16 = 30,
GGML_TYPE_Q4_0_4_4 = 31, // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
GGML_TYPE_Q4_0_4_8 = 32, // GGML_TYPE_Q4_0_4_8 = 32,
GGML_TYPE_Q4_0_8_8 = 33, // GGML_TYPE_Q4_0_8_8 = 33,
GGML_TYPE_TQ1_0 = 34, GGML_TYPE_TQ1_0 = 34,
GGML_TYPE_TQ2_0 = 35, GGML_TYPE_TQ2_0 = 35,
GGML_TYPE_COUNT, GGML_TYPE_COUNT = 39,
}; };
// precision // precision
@ -431,9 +431,6 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
}; };
// available tensor operations: // available tensor operations:
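One note on the enum change above: since the three Q4_0_4_x members are retired rather than renumbered, GGML_TYPE_COUNT is pinned to 39 instead of being left implicit, presumably so every previously assigned type id, including the now-empty slots, stays stable for data that already encodes them. A small stand-alone illustration of that reservation pattern with a hypothetical enum, not ggml's:

    #include <cstdio>

    // Hypothetical enum: retiring members 31-33 without pinning COUNT would shrink it
    // to 36 and change what "id < COUNT" means for ids written out earlier.
    enum sketch_type {
        SKETCH_TYPE_BF16  = 30,
        // 31-33 retired: the numeric slots stay reserved, they just have no enumerator
        SKETCH_TYPE_TQ1_0 = 34,
        SKETCH_TYPE_TQ2_0 = 35,
        SKETCH_TYPE_COUNT = 39,   // pinned to the full range of ids ever handed out
    };

    static bool id_in_range(int id) {
        return id >= 0 && id < SKETCH_TYPE_COUNT;   // retired ids still pass; rejected as unsupported elsewhere
    }

    int main() {
        printf("id 32 in range: %d\n", id_in_range(32));
        printf("id 35 in range: %d\n", id_in_range(35));
        return 0;
    }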


@ -15725,15 +15725,6 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
{ {
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb); VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
} break; } break;
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
{
VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4, data, nbytes / sizeof(block_q4_0x4), 4);
} break;
case GGML_TYPE_Q4_0_8_8:
{
VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8);
} break;
case GGML_TYPE_I8: case GGML_TYPE_I8:
case GGML_TYPE_I16: case GGML_TYPE_I16:


@ -1076,54 +1076,6 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_BF16, .vec_dot_type = GGML_TYPE_BF16,
.nrows = 1, .nrows = 1,
}, },
[GGML_TYPE_Q4_0_4_4] = {
.type_name = "q4_0_4x4",
.blck_size = QK4_0,
.blck_size_interleave = 4,
.type_size = sizeof(block_q4_0),
.is_quantized = true,
.to_float = NULL,
.from_float = NULL,
.from_float_ref = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
.ncols = 4,
.gemv = ggml_gemv_q4_0_4x4_q8_0,
.gemm = ggml_gemm_q4_0_4x4_q8_0,
},
[GGML_TYPE_Q4_0_4_8] = {
.type_name = "q4_0_4x8",
.blck_size = QK4_0,
.blck_size_interleave = 8,
.type_size = sizeof(block_q4_0),
.is_quantized = true,
.to_float = NULL,
.from_float = NULL,
.from_float_ref = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
.ncols = 4,
.gemv = ggml_gemv_q4_0_4x8_q8_0,
.gemm = ggml_gemm_q4_0_4x8_q8_0,
},
[GGML_TYPE_Q4_0_8_8] = {
.type_name = "q4_0_8x8",
.blck_size = QK4_0,
.blck_size_interleave = 8,
.type_size = sizeof(block_q4_0),
.is_quantized = true,
.to_float = NULL,
.from_float = NULL,
.from_float_ref = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
.ncols = 8,
.gemv = ggml_gemv_q4_0_8x8_q8_0,
.gemm = ggml_gemm_q4_0_8x8_q8_0,
},
[GGML_TYPE_TQ1_0] = { [GGML_TYPE_TQ1_0] = {
.type_name = "tq1_0", .type_name = "tq1_0",
.blck_size = QK_K, .blck_size = QK_K,
@ -3472,7 +3424,7 @@ size_t ggml_row_size(enum ggml_type type, int64_t ne) {
double ggml_type_sizef(enum ggml_type type) { double ggml_type_sizef(enum ggml_type type) {
return ((double)(type_traits[type].type_size))/type_traits[type].blck_size; return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
} }
const char * ggml_type_name(enum ggml_type type) { const char * ggml_type_name(enum ggml_type type) {
return type < GGML_TYPE_COUNT ? type_traits[type].type_name : "NONE"; return type < GGML_TYPE_COUNT ? type_traits[type].type_name : "NONE";
} }
@ -3578,9 +3530,6 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break; case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break; case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break; case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break;
case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break;
case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break;
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
} }
@ -4107,7 +4056,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
/*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data, /*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
/*.name =*/ { 0 }, /*.name =*/ { 0 },
/*.extra =*/ NULL, /*.extra =*/ NULL,
///*.padding =*/ { 0 }, // /*.padding =*/ { 0 },
}; };
#ifdef __clang__ #ifdef __clang__
@ -9517,9 +9466,6 @@ static void ggml_compute_forward_add(
case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
{ {
ggml_compute_forward_add_q_f32(params, dst); ggml_compute_forward_add_q_f32(params, dst);
} break; } break;
@ -9897,9 +9843,6 @@ static void ggml_compute_forward_add1(
case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
{ {
ggml_compute_forward_add1_q_f32(params, dst); ggml_compute_forward_add1_q_f32(params, dst);
} break; } break;
@ -10027,9 +9970,6 @@ static void ggml_compute_forward_acc(
case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
default: default:
{ {
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
@ -13093,9 +13033,6 @@ static void ggml_compute_forward_out_prod(
case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
{ {
ggml_compute_forward_out_prod_q_f32(params, dst); ggml_compute_forward_out_prod_q_f32(params, dst);
} break; } break;
@ -13283,9 +13220,6 @@ static void ggml_compute_forward_set(
case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
default: default:
{ {
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
@ -13547,9 +13481,6 @@ static void ggml_compute_forward_get_rows(
case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
{ {
ggml_compute_forward_get_rows_q(params, dst); ggml_compute_forward_get_rows_q(params, dst);
} break; } break;
@ -14139,9 +14070,6 @@ static void ggml_compute_forward_clamp(
case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S: case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q8_K: case GGML_TYPE_Q8_K:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
case GGML_TYPE_I8: case GGML_TYPE_I8:
case GGML_TYPE_I16: case GGML_TYPE_I16:
case GGML_TYPE_I32: case GGML_TYPE_I32:
@ -21941,9 +21869,6 @@ size_t ggml_quantize_chunk(
case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_F16: case GGML_TYPE_F16:
{ {
size_t elemsize = sizeof(ggml_fp16_t); size_t elemsize = sizeof(ggml_fp16_t);


@ -165,18 +165,18 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ3_XS = 22, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ3_XS = 22, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors, 1 bit quantization
LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ3_S = 26, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ3_S = 26, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ3_M = 27, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ3_M = 27, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors, 1 bit quantization
LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors // LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // removed from gguf files, use Q4_0 and runtime repack
LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors // LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // removed from gguf files, use Q4_0 and runtime repack
LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors // LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
@ -283,7 +283,7 @@ extern "C" {
uint32_t n_world; // number of nodes uint32_t n_world; // number of nodes
uint32_t rank; // my node rank uint32_t rank; // my node rank
uint32_t n_layer_window[32]; // number of layers to kept each time uint32_t n_layer_window[32]; // number of layers to kept each time
int32_t n_gpu_layers; // number of layers to store in VRAM int32_t n_gpu_layers; // number of layers to store in VRAM
enum llama_split_mode split_mode; // how to split the model across multiple GPUs enum llama_split_mode split_mode; // how to split the model across multiple GPUs
// main_gpu interpretation depends on split_mode: // main_gpu interpretation depends on split_mode:
@ -453,14 +453,21 @@ extern "C" {
LLAMA_API void llama_free_sockets (struct llama_context * ctx, char ** msg); LLAMA_API void llama_free_sockets (struct llama_context * ctx, char ** msg);
LLAMA_API int llama_gather_device_info(struct llama_context * ctx, struct device_info * dev_info_set); LLAMA_API int llama_gather_device_info(struct llama_context * ctx, struct device_info * dev_info_set);
LLAMA_API int llama_send_device_info (struct llama_context * ctx, struct device_info * dev_info); LLAMA_API int llama_send_device_info (struct llama_context * ctx, struct device_info * dev_info);
LLAMA_API int llama_bcast_startup_args(struct llama_context * ctx, uint32_t rank, struct startup_args * args);
LLAMA_API int llama_bcast_layer_setup (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers); LLAMA_API int llama_bcast_layer_setup (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);
LLAMA_API int llama_rebuild_topo (struct llama_context * ctx, uint32_t * n_layer_window, struct device_info * dev_info_set);
LLAMA_API int llama_recv_layer_setup (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers); LLAMA_API int llama_recv_layer_setup (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);
LLAMA_API int llm_load_tensors( LLAMA_API int llm_load_tensors(
struct llama_model_loader * ml, struct llama_model_loader * ml,
struct llama_model * model, struct llama_model * model,
struct llama_model_params params); struct llama_model_params params);
LLAMA_API void llama_update_context_with_rankworld(
struct llama_context * ctx,
uint32_t rank,
uint32_t n_world);
LLAMA_API struct llama_context * llama_new_context_with_model( LLAMA_API struct llama_context * llama_new_context_with_model(
struct llama_model * model, struct llama_model * model,
struct llama_context_params params); struct llama_context_params params);
@ -706,8 +713,10 @@ extern "C" {
LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx); LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
// Clear the KV cache - both cell info is erased and KV data is zeroed // Clear the KV cache - both cell info is erased and KV data is zeroed
LLAMA_API void llama_kv_cache_clear( LLAMA_API void llama_kv_cache_clear(struct llama_context * ctx);
struct llama_context * ctx);
// Notify other devices to clear their KV cache
LLAMA_API void llama_send_kv_cache_clear(struct llama_context * ctx);
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1) // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
// Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
@ -719,6 +728,13 @@ extern "C" {
llama_seq_id seq_id, llama_seq_id seq_id,
llama_pos p0, llama_pos p0,
llama_pos p1); llama_pos p1);
// Notify other nodes to remove a range from their KV cache
LLAMA_API void llama_send_kv_cache_seq_rm(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1);
// Copy all tokens that belong to the specified sequence to another sequence // Copy all tokens that belong to the specified sequence to another sequence
// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
@ -730,6 +746,14 @@ extern "C" {
llama_seq_id seq_id_dst, llama_seq_id seq_id_dst,
llama_pos p0, llama_pos p0,
llama_pos p1); llama_pos p1);
// Notify other nodes to copy a range of KV entries
LLAMA_API void llama_send_kv_cache_seq_cp(
struct llama_context * ctx,
llama_seq_id seq_id_src,
llama_seq_id seq_id_dst,
llama_pos p0,
llama_pos p1);
// Removes all tokens that do not belong to the specified sequence // Removes all tokens that do not belong to the specified sequence
LLAMA_API void llama_kv_cache_seq_keep( LLAMA_API void llama_kv_cache_seq_keep(
@ -749,6 +773,14 @@ extern "C" {
llama_pos p1, llama_pos p1,
llama_pos delta); llama_pos delta);
// Notify other nodes to shift (add) their KV cache entries
LLAMA_API void llama_send_kv_cache_seq_add(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
llama_pos delta);
// Integer division of the positions by factor of `d > 1` // Integer division of the positions by factor of `d > 1`
// If the KV cache is RoPEd, the KV data is updated accordingly: // If the KV cache is RoPEd, the KV data is updated accordingly:
// - lazily on next llama_decode() // - lazily on next llama_decode()
@ -761,6 +793,14 @@ extern "C" {
llama_pos p0, llama_pos p0,
llama_pos p1, llama_pos p1,
int d); int d);
// Notify other nodes to perform a division operation on a KV cache range
LLAMA_API void llama_send_kv_cache_seq_div(
struct llama_context * ctx,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
int d);
// Returns the largest position present in the KV cache for the specified sequence // Returns the largest position present in the KV cache for the specified sequence
LLAMA_API llama_pos llama_kv_cache_seq_pos_max( LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
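Taken together, the new declarations outline a startup handshake: worker ranks send a device_info, the head rank gathers them, decides the per-node layer windows (rebuilding the topology if a device is dropped), and broadcasts the assignment. The sketch below is a rough, assumed composition of those calls, not the exact sequence in the source; device_info comes from the profiler header shown earlier in this diff:

    #include "llama.h"
    #include <vector>

    // Assumed composition of the APIs above; the real negotiation may differ in order and detail.
    static void negotiate_layer_split(llama_context * ctx, uint32_t my_rank, uint32_t n_world) {
        uint32_t n_layer_window[32] = {0};
        uint32_t n_gpu_layers  [32] = {0};

        if (my_rank == 0) {
            std::vector<device_info> infos(n_world);
            llama_gather_device_info(ctx, infos.data());        // collect every node's profile
            // ... choose n_layer_window / n_gpu_layers from the profiles ...
            llama_bcast_layer_setup(ctx, n_layer_window, n_gpu_layers);
        } else {
            device_info info;                                    // filled via llama_profile_device elsewhere
            llama_send_device_info(ctx, &info);
            llama_recv_layer_setup(ctx, n_layer_window, n_gpu_layers);
        }
    }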


@ -127,7 +127,7 @@ printf "\n\nGathering tests that fit REGEX: ${test_suite} ...\n"
pushd "$build_dir" pushd "$build_dir"
tests=($(ctest -R ${test_suite} -V -N | grep -E " +Test +#[0-9]+*" | cut -d':' -f2 | awk '{$1=$1};1')) tests=($(ctest -R ${test_suite} -V -N | grep -E " +Test +#[0-9]+*" | cut -d':' -f2 | awk '{$1=$1};1'))
if [ ${#tests[@]} -eq 0 ]; then if [ ${#tests[@]} -eq 0 ]; then
abort "No tests avaliable... check your compliation process..." abort "No tests avaliable... check your compilation process..."
fi fi
popd > /dev/null || exit 1 popd > /dev/null || exit 1

File diff suppressed because it is too large.

tools/profile_tool.cpp (new file, 62 lines added)

@ -0,0 +1,62 @@
#include "arg.h"
#include "common.h"
#include "console.h"
#include "log.h"
#include "llama.h"
static void print_usage(int argc, char ** argv) {
(void) argc;
LOG("\nexample usage:\n");
LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
LOG("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
LOG("\n");
}
int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
return 1;
}
if (params.n_ctx != 0 && params.n_ctx < 8) {
LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}
if (params.rope_freq_base != 0.0) {
LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
}
if (params.rope_freq_scale != 0.0) {
LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
}
// load the model and apply lora adapter, if any
auto mparams = llama_model_params_from_gpt_params(params);
struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
struct llama_model * model = nullptr;
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
} else if (!params.model_url.empty()) {
model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
} else {
model = llama_load_model_from_file(params.model.c_str(), mparams);
}
if (model == NULL) {
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
return -1;
}
llama_model_loader * ml = llama_model_load(params.model.c_str(), model, &mparams);
device_info dev_info;
llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
device_print_props(&dev_info, 1, model, cparams);
llama_free_model(model);
return 0;
}