Merge branch 'master' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	.github/workflows/editorconfig.yml
#	.gitignore
#	CMakeLists.txt
#	README.md
This commit is contained in:
Concedo 2024-01-31 18:53:38 +08:00
commit 15deabd200
15 changed files with 1060 additions and 1371 deletions

View file

@ -8,10 +8,14 @@
[Linux](#linux) [Linux](#linux)
[Windows](#windows)
[Environment Variable](#environment-variable) [Environment Variable](#environment-variable)
[Known Issue](#known-issue) [Known Issue](#known-issue)
[Q&A](#q&a)
[Todo](#todo) [Todo](#todo)
## Background ## Background
@ -33,7 +37,7 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
|OS|Status|Verified| |OS|Status|Verified|
|-|-|-| |-|-|-|
|Linux|Support|Ubuntu 22.04| |Linux|Support|Ubuntu 22.04|
|Windows|Ongoing| | |Windows|Support|Windows 11|
## Intel GPU ## Intel GPU
@ -42,7 +46,7 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
|-|-|-| |-|-|-|
|Intel Data Center Max Series| Support| Max 1550| |Intel Data Center Max Series| Support| Max 1550|
|Intel Data Center Flex Series| Support| Flex 170| |Intel Data Center Flex Series| Support| Flex 170|
|Intel Arc Series| Support| Arc 770| |Intel Arc Series| Support| Arc 770, 730M|
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake| |Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7| |Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
@ -131,6 +135,7 @@ cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#build all binary #build all binary
cmake --build . --config Release -v cmake --build . --config Release -v
cd ..
``` ```
or or
@ -195,7 +200,7 @@ GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building
or run by script: or run by script:
``` ```
./examples/sycl/run_llama2.sh ./examples/sycl/run-llama2.sh
``` ```
Note: Note:
@ -205,11 +210,175 @@ Note:
5. Check the device ID in output 5. Check the device ID in output
Like Like:
``` ```
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
``` ```
## Windows
### Setup Environment
1. Install Intel GPU driver.
Please install Intel GPU driver by official guide: [Install GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
2. Install Intel® oneAPI Base toolkit.
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
Recommend to install to default folder: **/opt/intel/oneapi**.
Following guide uses the default folder as example. If you use other folder, please modify the following guide info with your folder.
b. Enable oneAPI running environment:
- In Search, input 'oneAPI'.
Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022"
- In Run:
In CMD:
```
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
```
c. Check GPU
In oneAPI command line:
```
sycl-ls
```
There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**.
Output (example):
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
[opencl:cpu:1] Intel(R) OpenCL, 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Iris(R) Xe Graphics OpenCL 3.0 NEO [31.0.101.5186]
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
```
3. Install cmake & make
a. Download & install cmake for windows: https://cmake.org/download/
b. Download & install make for windows provided by mingw-w64: https://www.mingw-w64.org/downloads/
### Build locally:
In oneAPI command line window:
```
mkdir -p build
cd build
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
:: for FP16
:: faster for long-prompt inference
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
:: for FP32
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
:: build example/main only
:: make main
:: build all binary
make -j
cd ..
```
or
```
.\examples\sycl\win-build-sycl.bat
```
Note:
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
### Run
1. Put model file to folder **models**
2. Enable oneAPI running environment
- In Search, input 'oneAPI'.
Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022"
- In Run:
In CMD:
```
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
```
3. List device ID
Run without parameter:
```
build\bin\ls-sycl-device.exe
or
build\bin\main.exe
```
Check the ID in startup log, like:
```
found 4 SYCL devices:
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
```
|Attribute|Note|
|-|-|
|compute capability 1.3|Level-zero running time, recommended |
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
4. Set device ID and execute llama.cpp
Set device ID = 0 by **set GGML_SYCL_DEVICE=0**
```
set GGML_SYCL_DEVICE=0
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0
```
or run by script:
```
.\examples\sycl\win-run-llama2.bat
```
Note:
- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
5. Check the device ID in output
Like:
```
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
```
## Environment Variable ## Environment Variable
@ -220,7 +389,7 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, LLAMA_SYCL=ON is mandatory.| |LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, LLAMA_SYCL=ON is mandatory.|
|LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference. <br>For FP32, not set it.| |LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference. <br>For FP32, not set it.|
|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path| |CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
|CMAKE_CXX_COMPILER|icpx|use icpx for SYCL code path| |CMAKE_CXX_COMPILER|icpx (Linux), icx (Windows)|use icpx/icx for SYCL code path|
#### Running #### Running
@ -232,19 +401,24 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
## Known Issue ## Known Issue
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
Miss to enable oneAPI running environment.
Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.
- Hang during startup - Hang during startup
llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block. llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.
Solution: add **--no-mmap**. Solution: add **--no-mmap**.
## Q&A
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
Miss to enable oneAPI running environment.
Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.
- In Windows, no result, not error.
Miss to enable oneAPI running environment.
## Todo ## Todo
- Support to build in Windows. - Support to build in Windows.

View file

@ -1521,7 +1521,9 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false"); fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false"); fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false"); fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false"); fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false"); fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");

View file

@ -564,6 +564,7 @@ struct test {
static const bool cuda; static const bool cuda;
static const bool opencl; static const bool opencl;
static const bool vulkan; static const bool vulkan;
static const bool kompute;
static const bool metal; static const bool metal;
static const bool gpu_blas; static const bool gpu_blas;
static const bool blas; static const bool blas;
@ -648,6 +649,9 @@ struct test {
if (vulkan) { if (vulkan) {
return "Vulkan"; return "Vulkan";
} }
if (kompute) {
return "Kompute";
}
if (metal) { if (metal) {
return "Metal"; return "Metal";
} }
@ -663,7 +667,7 @@ struct test {
static const std::vector<std::string> & get_fields() { static const std::vector<std::string> & get_fields() {
static const std::vector<std::string> fields = { static const std::vector<std::string> fields = {
"build_commit", "build_number", "build_commit", "build_number",
"cuda", "opencl", "vulkan", "metal", "gpu_blas", "blas", "cuda", "opencl", "vulkan", "kompute", "metal", "gpu_blas", "blas",
"cpu_info", "gpu_info", "cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params", "model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_threads", "type_k", "type_v", "n_batch", "n_threads", "type_k", "type_v",
@ -687,8 +691,9 @@ struct test {
field == "avg_ns" || field == "stddev_ns") { field == "avg_ns" || field == "stddev_ns") {
return INT; return INT;
} }
if (field == "cuda" || field == "opencl" || field == "vulkan"|| field == "metal" || field == "gpu_blas" || field == "blas" || if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
field == "f16_kv" || field == "no_kv_offload" || field == "mul_mat_q") { field == "gpu_blas" || field == "blas" || field == "f16_kv" || field == "no_kv_offload" ||
field == "mul_mat_q") {
return BOOL; return BOOL;
} }
if (field == "avg_ts" || field == "stddev_ts") { if (field == "avg_ts" || field == "stddev_ts") {
@ -715,7 +720,8 @@ struct test {
} }
std::vector<std::string> values = { std::vector<std::string> values = {
build_commit, std::to_string(build_number), build_commit, std::to_string(build_number),
std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas), std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
cpu_info, gpu_info, cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
@ -744,6 +750,7 @@ const int test::build_number = LLAMA_BUILD_NUMBER;
const bool test::cuda = !!ggml_cpu_has_cublas(); const bool test::cuda = !!ggml_cpu_has_cublas();
const bool test::opencl = !!ggml_cpu_has_clblast(); const bool test::opencl = !!ggml_cpu_has_clblast();
const bool test::vulkan = !!ggml_cpu_has_vulkan(); const bool test::vulkan = !!ggml_cpu_has_vulkan();
const bool test::kompute = !!ggml_cpu_has_kompute();
const bool test::metal = !!ggml_cpu_has_metal(); const bool test::metal = !!ggml_cpu_has_metal();
const bool test::gpu_blas = !!ggml_cpu_has_gpublas(); const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
const bool test::blas = !!ggml_cpu_has_blas(); const bool test::blas = !!ggml_cpu_has_blas();

View file

@ -48,6 +48,7 @@ chat_completion() {
top_p: 0.9, top_p: 0.9,
n_keep: $n_keep, n_keep: $n_keep,
n_predict: 256, n_predict: 256,
cache_prompt: true,
stop: ["\n### Human:"], stop: ["\n### Human:"],
stream: true stream: true
}')" }')"

View file

@ -186,7 +186,7 @@ struct llama_client_slot
llama_sampling_context *ctx_sampling = nullptr; llama_sampling_context *ctx_sampling = nullptr;
int32_t ga_i = 0; // group-attention state int32_t ga_i = 0; // group-attention state
int32_t ga_n = 1;// group-attention factor int32_t ga_n = 1; // group-attention factor
int32_t ga_w = 512; // group-attention width int32_t ga_w = 512; // group-attention width
int32_t n_past_se = 0; // self-extend int32_t n_past_se = 0; // self-extend
@ -221,6 +221,7 @@ struct llama_client_slot
infill = false; infill = false;
ga_i = 0; ga_i = 0;
n_past_se = 0; n_past_se = 0;
generated_token_probs.clear(); generated_token_probs.clear();
for (slot_image & img : images) for (slot_image & img : images)
@ -1228,7 +1229,7 @@ struct llama_server_context
std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
for (int i = 0; i < (int) append_tokens.size(); ++i) for (int i = 0; i < (int) append_tokens.size(); ++i)
{ {
llama_batch_add(batch, append_tokens[i], slot.n_past, { slot.id }, true); llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
slot.n_past += 1; slot.n_past += 1;
} }
} }
@ -1296,6 +1297,8 @@ struct llama_server_context
for (llama_client_slot &slot : slots) for (llama_client_slot &slot : slots)
{ {
slot.cache_tokens.clear(); slot.cache_tokens.clear();
slot.n_past = 0;
slot.n_past_se = 0;
} }
} }
@ -1365,26 +1368,26 @@ struct llama_server_context
kv_cache_clear(); kv_cache_clear();
} }
return true; return true;
} else { }
task_server task; task_server task;
task.type = TASK_TYPE_NEXT_RESPONSE; task.type = TASK_TYPE_NEXT_RESPONSE;
task.target_id = -1; task.target_id = -1;
queue_tasks.post(task); queue_tasks.post(task);
}
for (llama_client_slot &slot : slots) for (llama_client_slot &slot : slots)
{ {
if (slot.ga_n == 1) if (slot.ga_n == 1)
{ {
if (slot.is_processing() && slot.cache_tokens.size() >= (size_t) slot.n_ctx) if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
{ {
// Shift context // Shift context
const int n_left = slot.n_past - slot.params.n_keep - 1; const int n_left = system_tokens.size() + slot.n_past - slot.params.n_keep - 1;
const int n_discard = n_left / 2; const int n_discard = n_left / 2;
LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard); LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
llama_kv_cache_seq_rm (ctx, slot.id, slot.params.n_keep + 1 , slot.params.n_keep + n_discard + 1); llama_kv_cache_seq_rm (ctx, slot.id, slot.params.n_keep + 1 , slot.params.n_keep + n_discard + 1);
llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, slot.n_past, -n_discard); llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, system_tokens.size() + slot.n_past, -n_discard);
for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++) for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++)
{ {
@ -1430,8 +1433,10 @@ struct llama_server_context
slot.i_batch = batch.n_tokens; slot.i_batch = batch.n_tokens;
const int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past; const int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
// TODO: we always have to take into account the "system_tokens"
// this is not great and needs to be improved somehow
llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
slot.n_past += 1; slot.n_past += 1;
} }
@ -1592,10 +1597,13 @@ struct llama_server_context
// process the prefix of first image // process the prefix of first image
std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens; std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past; int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
int ga_i = slot.ga_i;
int32_t ga_i = slot.ga_i;
int32_t ga_n = slot.ga_n; int32_t ga_n = slot.ga_n;
int32_t ga_w = slot.ga_w; int32_t ga_w = slot.ga_w;
for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past) for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
{ {
if (slot.ga_n != 1) if (slot.ga_n != 1)
@ -1607,7 +1615,7 @@ struct llama_server_context
} }
} }
llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false); llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
slot_npast += 1; slot_npast++;
} }
if (has_images && !ingest_images(slot, n_batch)) if (has_images && !ingest_images(slot, n_batch))
@ -1667,6 +1675,7 @@ struct llama_server_context
slot.n_past_se += n_tokens; slot.n_past_se += n_tokens;
} }
} }
llama_batch batch_view = llama_batch batch_view =
{ {
n_tokens, n_tokens,
@ -1819,15 +1828,15 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel); printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel);
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
printf(" -spf FNAME, --system-prompt-file FNAME\n"); printf(" -spf FNAME, --system-prompt-file FNAME\n");
printf(" Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n"); printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n"); printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
printf(" --log-disable disables logging to a file.\n"); printf(" --log-disable disables logging to a file.\n");
printf("\n"); printf("\n");
printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
printf(" -gan N, --grp-attn-n N Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`"); printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
printf(" -gaw N, --grp-attn-w N Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`"); printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
printf("\n"); printf("\n");
} }

View file

@ -0,0 +1,23 @@
:: MIT license
:: Copyright (C) 2024 Intel Corporation
:: SPDX-License-Identifier: MIT
mkdir -p build
cd build
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
:: for FP16
:: faster for long-prompt inference
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
:: for FP32
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
:: build example/main only
:: make main
:: build all binary
make -j
cd ..

View file

@ -0,0 +1,13 @@
:: MIT license
:: Copyright (C) 2024 Intel Corporation
:: SPDX-License-Identifier: MIT
INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
set GGML_SYCL_DEVICE=0
rem set GGML_SYCL_DEBUG=1
.\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0

View file

@ -3688,38 +3688,38 @@ constexpr constant static uint64_t iq2xs_grid[512] = {
}; };
constexpr constant static uint32_t iq3xxs_grid[256] = { constexpr constant static uint32_t iq3xxs_grid[256] = {
0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3c, 0x04041404, 0x04041414, 0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
0x04041c0c, 0x04042414, 0x04043c1c, 0x04043c2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14, 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3c04, 0x04140404, 0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3c, 0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
0x04142c0c, 0x04142c3c, 0x04143c2c, 0x041c040c, 0x041c043c, 0x041c0c04, 0x041c0c14, 0x041c142c, 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
0x041c3c04, 0x04240c1c, 0x04241c3c, 0x04242424, 0x04242c3c, 0x04243c1c, 0x04243c2c, 0x042c040c, 0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
0x042c043c, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043c0c04, 0x043c0c24, 0x043c0c34, 0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
0x043c241c, 0x043c340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c, 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243c, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c, 0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04, 0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
0x0c143c14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c, 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3c0c, 0x0c34042c, 0x0c3c1414, 0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
0x0c3c2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434, 0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c, 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
0x140c1c04, 0x140c341c, 0x140c343c, 0x140c3c04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3c, 0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
0x14141404, 0x14141414, 0x14141c3c, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24, 0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
0x141c3c04, 0x141c3c24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143c, 0x142c240c, 0x142c3c24, 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
0x143c040c, 0x143c041c, 0x143c0c34, 0x143c242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c, 0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043c14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c, 0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143c14, 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243c, 0x1c243c14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414, 0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3c1c1c, 0x1c3c3404, 0x24040424, 0x24040c3c, 0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
0x24041c2c, 0x24041c3c, 0x24042c1c, 0x24042c3c, 0x240c3c24, 0x24141404, 0x24141c3c, 0x24142404, 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
0x24143404, 0x24143434, 0x241c043c, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c, 0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
0x242c241c, 0x242c3c04, 0x243c042c, 0x243c0c04, 0x243c0c14, 0x243c1c04, 0x2c040c14, 0x2c04240c, 0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
0x2c043c04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143c14, 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143c, 0x2c243c14, 0x2c2c0414, 0x2c2c1c0c, 0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
0x2c342c04, 0x2c3c1424, 0x2c3c2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c, 0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
0x340c340c, 0x34140c3c, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14, 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
0x34341c1c, 0x343c041c, 0x343c140c, 0x3c04041c, 0x3c04042c, 0x3c04043c, 0x3c040c04, 0x3c041c14, 0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
0x3c042c14, 0x3c0c1434, 0x3c0c2404, 0x3c140c14, 0x3c14242c, 0x3c142c14, 0x3c1c0404, 0x3c1c0c2c, 0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
0x3c1c1c1c, 0x3c1c3404, 0x3c24140c, 0x3c24240c, 0x3c2c0404, 0x3c2c0414, 0x3c2c1424, 0x3c341c04, 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
}; };

File diff suppressed because it is too large Load diff

View file

@ -817,7 +817,7 @@ static void ggml_vk_load_shaders() {
// mulmat // mulmat
std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16, vk_device.subgroup_size * 2, 64, 2, 4, 4, vk_device.subgroup_size }; std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16, vk_device.subgroup_size * 2, 64, 2, 4, 4, vk_device.subgroup_size };
std::initializer_list<uint32_t> warptile_m = { 128, 64, 64, 16, vk_device.subgroup_size, 32, 2, 4, 2, vk_device.subgroup_size }; std::initializer_list<uint32_t> warptile_m = { 128, 64, 64, 16, vk_device.subgroup_size, 32, 2, 4, 2, vk_device.subgroup_size };
std::initializer_list<uint32_t> warptile_s = { vk_device.subgroup_size, 32, 32, 8, 32, 32, 2, 2, 2, vk_device.subgroup_size }; std::initializer_list<uint32_t> warptile_s = { vk_device.subgroup_size, 32, 32, 16, 32, 32, 2, 2, 2, vk_device.subgroup_size };
std::array<uint32_t, 3> l_wg_denoms = {128, 128, 1 }; std::array<uint32_t, 3> l_wg_denoms = {128, 128, 1 };
std::array<uint32_t, 3> m_wg_denoms = { 64, 64, 1 }; std::array<uint32_t, 3> m_wg_denoms = { 64, 64, 1 };
@ -2873,7 +2873,8 @@ static void ggml_vk_op_f32(vk_context * ctx, const ggml_tensor * src0, const ggm
if (op == GGML_OP_CPY) { if (op == GGML_OP_CPY) {
GGML_ASSERT(!transfer_src0); GGML_ASSERT(!transfer_src0);
GGML_ASSERT(!transfer_src1); GGML_ASSERT(!transfer_src1);
d_sz = dst->ne[1] * dst->nb[1]; x_sz = ggml_nbytes(src0);
d_sz = ggml_nbytes(dst);
if (extra->offset + d_sz >= d_D->size) { if (extra->offset + d_sz >= d_D->size) {
d_sz = VK_WHOLE_SIZE; d_sz = VK_WHOLE_SIZE;
@ -4556,8 +4557,15 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml
} }
ggml_vk_preallocate_buffers(); ggml_vk_preallocate_buffers();
int last_node = cgraph->n_nodes - 1;
// If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_GPU) {
last_node -= 1;
}
for (int i = 0; i < cgraph->n_nodes; i++) { for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1); ggml_vk_build_graph(cgraph->nodes[i], i == last_node);
} }
ggml_compute_params params = {}; ggml_compute_params params = {};

11
ggml.c
View file

@ -20510,6 +20510,14 @@ int ggml_cpu_has_vulkan(void) {
#endif #endif
} }
int ggml_cpu_has_kompute(void) {
#if defined(GGML_USE_KOMPUTE)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_sycl(void) { int ggml_cpu_has_sycl(void) {
#if defined(GGML_USE_SYCL) #if defined(GGML_USE_SYCL)
return 1; return 1;
@ -20519,7 +20527,8 @@ int ggml_cpu_has_sycl(void) {
} }
int ggml_cpu_has_gpublas(void) { int ggml_cpu_has_gpublas(void) {
return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_sycl(); return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() ||
ggml_cpu_has_sycl();
} }
int ggml_cpu_has_sse3(void) { int ggml_cpu_has_sse3(void) {

1
ggml.h
View file

@ -2273,6 +2273,7 @@ extern "C" {
GGML_API int ggml_cpu_has_cublas (void); GGML_API int ggml_cpu_has_cublas (void);
GGML_API int ggml_cpu_has_clblast (void); GGML_API int ggml_cpu_has_clblast (void);
GGML_API int ggml_cpu_has_vulkan (void); GGML_API int ggml_cpu_has_vulkan (void);
GGML_API int ggml_cpu_has_kompute (void);
GGML_API int ggml_cpu_has_gpublas (void); GGML_API int ggml_cpu_has_gpublas (void);
GGML_API int ggml_cpu_has_sse3 (void); GGML_API int ggml_cpu_has_sse3 (void);
GGML_API int ggml_cpu_has_ssse3 (void); GGML_API int ggml_cpu_has_ssse3 (void);

View file

@ -19,8 +19,8 @@ shader_int8_ext = """
# Type-specific defines # Type-specific defines
shader_f16_defines = """ shader_f16_defines = """
#define QUANT_K 32 #define QUANT_K 1
#define QUANT_R 2 #define QUANT_R 1
#define A_TYPE float16_t #define A_TYPE float16_t
""" """

View file

@ -2760,10 +2760,10 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small"; case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium"; case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K"; case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw"; case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small"; case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XSS - 3.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
default: return "unknown, may not work"; default: return "unknown, may not work";
} }
@ -6954,11 +6954,6 @@ static int llama_decode_internal(
n_threads = std::min(4, n_threads); n_threads = std::min(4, n_threads);
} }
const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
if ((ggml_cpu_has_cublas() || ggml_cpu_has_vulkan()) && fully_offloaded) {
n_threads = 1;
}
#ifdef GGML_USE_MPI #ifdef GGML_USE_MPI
const int64_t n_layer = hparams.n_layer; const int64_t n_layer = hparams.n_layer;
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);

View file

@ -0,0 +1,19 @@
:: MIT license
:: Copyright (C) 2024 Intel Corporation
:: SPDX-License-Identifier: MIT
set URL=%1
set COMPONENTS=%2
curl.exe --output %TEMP%\webimage.exe --url %URL% --retry 5 --retry-delay 5
start /b /wait %TEMP%\webimage.exe -s -x -f webimage_extracted --log extract.log
del %TEMP%\webimage.exe
if "%COMPONENTS%"=="" (
webimage_extracted\bootstrapper.exe -s --action install --eula=accept -p=NEED_VS2017_INTEGRATION=0 -p=NEED_VS2019_INTEGRATION=0 -p=NEED_VS2022_INTEGRATION=0 --log-dir=.
) else (
webimage_extracted\bootstrapper.exe -s --action install --components=%COMPONENTS% --eula=accept -p=NEED_VS2017_INTEGRATION=0 -p=NEED_VS2019_INTEGRATION=0 -p=NEED_VS2022_INTEGRATION=0 --log-dir=.
)
set installer_exit_code=%ERRORLEVEL%
rd /s/q "webimage_extracted"
exit /b %installer_exit_code%